diff options
| author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2016-10-13 17:25:40 -0700 |
|---|---|---|
| committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2016-10-13 17:25:40 -0700 |
| commit | 4a7126a25b4dfd07d61c699f724118275acc0c25 (patch) | |
| tree | c78b82bfaa96f330d412ad63e355906f963c3faf /kernel | |
| parent | 930e19248e9b61da36c967687ca79c4d5f977919 (diff) | |
| parent | c8d2bc9bc39ebea8437fd974fdbc21847bb897a3 (diff) | |
Merge tag 'v4.8' into next
Sync up with mainline to bring in I2C host notify changes and other
updates.
Diffstat (limited to 'kernel')
175 files changed, 16415 insertions, 5927 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f0c40bf49d9f..e2ec54e2b952 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,9 +91,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_BINFMT_ELF) += elfcore.o -obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o -obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o +obj-$(CONFIG_ELFCORE) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_TRACE_CLOCK) += trace/ diff --git a/kernel/audit.c b/kernel/audit.c index 678c3f000191..a8a91bd2b2a9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -64,7 +64,6 @@ #include <linux/security.h> #endif #include <linux/freezer.h> -#include <linux/tty.h> #include <linux/pid_namespace.h> #include <net/netns/generic.h> @@ -430,7 +429,6 @@ restart: attempts, audit_pid); set_current_state(TASK_INTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); goto restart; } } @@ -934,7 +932,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(msg_type); + err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ err = 0; if (msg_type == AUDIT_USER_TTY) { @@ -1341,15 +1339,14 @@ static inline void audit_get_stamp(struct audit_context *ctx, static long wait_for_auditd(long sleep_time) { DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue_exclusive(&audit_backlog_wait, &wait); if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) + skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { + add_wait_queue_exclusive(&audit_backlog_wait, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); sleep_time = schedule_timeout(sleep_time); - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&audit_backlog_wait, &wait); + remove_wait_queue(&audit_backlog_wait, &wait); + } return sleep_time; } @@ -1382,7 +1379,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (audit_initialized != AUDIT_INITIALIZED) return NULL; - if (unlikely(audit_filter_type(type))) + if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) return NULL; if (gfp_mask & __GFP_DIRECT_RECLAIM) { @@ -1886,25 +1883,35 @@ out_null: audit_log_format(ab, " exe=(null)"); } +struct tty_struct *audit_get_tty(struct task_struct *tsk) +{ + struct tty_struct *tty = NULL; + unsigned long flags; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + if (tsk->signal) + tty = tty_kref_get(tsk->signal->tty); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + return tty; +} + +void audit_put_tty(struct tty_struct *tty) +{ + tty_kref_put(tty); +} + void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; char comm[sizeof(tsk->comm)]; - char *tty; + struct tty_struct *tty; if (!ab) return; /* tsk == current */ cred = current_cred(); - - spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - spin_unlock_irq(&tsk->sighand->siglock); - + tty = audit_get_tty(tsk); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" @@ -1920,11 +1927,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), - tty, audit_get_sessionid(tsk)); - + tty ? tty_name(tty) : "(none)", + audit_get_sessionid(tsk)); + audit_put_tty(tty); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); - audit_log_d_path_exe(ab, tsk->mm); audit_log_task_context(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index cbbe6bb6496e..431444c3708b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -23,6 +23,7 @@ #include <linux/audit.h> #include <linux/skbuff.h> #include <uapi/linux/mqueue.h> +#include <linux/tty.h> /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). If we get more names we will allocate @@ -262,6 +263,9 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); extern void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm); +extern struct tty_struct *audit_get_tty(struct task_struct *tsk); +extern void audit_put_tty(struct tty_struct *tty); + /* audit watch functions */ #ifdef CONFIG_AUDIT_WATCH extern void audit_put_watch(struct audit_watch *watch); @@ -327,6 +331,8 @@ extern pid_t audit_sig_pid; extern kuid_t audit_sig_uid; extern u32 audit_sig_sid; +extern int audit_filter(int msgtype, unsigned int listtype); + #ifdef CONFIG_AUDITSYSCALL extern int __audit_signal_info(int sig, struct task_struct *t); static inline int audit_signal_info(int sig, struct task_struct *t) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 5efe9b299a12..25772476fa4a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -661,10 +661,10 @@ static int tag_mount(struct vfsmount *mnt, void *arg) static int prune_tree_thread(void *unused) { for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (list_empty(&prune_list)) + if (list_empty(&prune_list)) { + set_current_state(TASK_INTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); + } mutex_lock(&audit_cmd_mutex); mutex_lock(&audit_filter_mutex); @@ -693,16 +693,14 @@ static int audit_launch_prune(void) { if (prune_thread) return 0; - prune_thread = kthread_create(prune_tree_thread, NULL, + prune_thread = kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); if (IS_ERR(prune_thread)) { pr_err("cannot start thread audit_prune_tree"); prune_thread = NULL; return -ENOMEM; - } else { - wake_up_process(prune_thread); - return 0; } + return 0; } /* called with audit_filter_mutex */ diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3cf1c5978d39..0d302a87f21b 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -19,6 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/file.h> #include <linux/kernel.h> #include <linux/audit.h> #include <linux/kthread.h> @@ -367,7 +368,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ - watch->dev = d_backing_inode(d)->i_sb->s_dev; + watch->dev = d->d_sb->s_dev; watch->ino = d_backing_inode(d)->i_ino; } dput(d); @@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) unsigned long ino; dev_t dev; - rcu_read_lock(); - exe_file = rcu_dereference(tsk->mm->exe_file); + exe_file = get_task_exe_file(tsk); + if (!exe_file) + return 0; ino = exe_file->f_inode->i_ino; dev = exe_file->f_inode->i_sb->s_dev; - rcu_read_unlock(); + fput(exe_file); return audit_mark_compare(mark, ino, dev); } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 94ca7b1e5e7e..85d9cac497e4 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1290,113 +1290,72 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) return strncmp(p, dname, dlen); } -static int audit_filter_user_rules(struct audit_krule *rule, int type, - enum audit_state *state) +int audit_filter(int msgtype, unsigned int listtype) { - int i; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &rule->fields[i]; - pid_t pid; - int result = 0; - u32 sid; - - switch (f->type) { - case AUDIT_PID: - pid = task_pid_nr(current); - result = audit_comparator(pid, f->op, f->val); - break; - case AUDIT_UID: - result = audit_uid_comparator(current_uid(), f->op, f->uid); - break; - case AUDIT_GID: - result = audit_gid_comparator(current_gid(), f->op, f->gid); - break; - case AUDIT_LOGINUID: - result = audit_uid_comparator(audit_get_loginuid(current), - f->op, f->uid); - break; - case AUDIT_LOGINUID_SET: - result = audit_comparator(audit_loginuid_set(current), - f->op, f->val); - break; - case AUDIT_MSGTYPE: - result = audit_comparator(type, f->op, f->val); - break; - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - if (f->lsm_rule) { - security_task_getsecid(current, &sid); - result = security_audit_rule_match(sid, - f->type, - f->op, - f->lsm_rule, - NULL); - } - break; - } - - if (!result) - return 0; - } - switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; - } - return 1; -} - -int audit_filter_user(int type) -{ - enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; - int rc, ret; - - ret = 1; /* Audit by default */ - - rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - rc = audit_filter_user_rules(&e->rule, type, &state); - if (rc) { - if (rc > 0 && state == AUDIT_DISABLED) - ret = 0; - break; - } - } - rcu_read_unlock(); - - return ret; -} - -int audit_filter_type(int type) -{ - struct audit_entry *e; - int result = 0; + int ret = 1; /* Audit by default */ rcu_read_lock(); - if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) + if (list_empty(&audit_filter_list[listtype])) goto unlock_and_return; + list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { + int i, result = 0; - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], - list) { - int i; for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; - if (f->type == AUDIT_MSGTYPE) { - result = audit_comparator(type, f->op, f->val); - if (!result) - break; + pid_t pid; + u32 sid; + + switch (f->type) { + case AUDIT_PID: + pid = task_pid_nr(current); + result = audit_comparator(pid, f->op, f->val); + break; + case AUDIT_UID: + result = audit_uid_comparator(current_uid(), f->op, f->uid); + break; + case AUDIT_GID: + result = audit_gid_comparator(current_gid(), f->op, f->gid); + break; + case AUDIT_LOGINUID: + result = audit_uid_comparator(audit_get_loginuid(current), + f->op, f->uid); + break; + case AUDIT_LOGINUID_SET: + result = audit_comparator(audit_loginuid_set(current), + f->op, f->val); + break; + case AUDIT_MSGTYPE: + result = audit_comparator(msgtype, f->op, f->val); + break; + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + if (f->lsm_rule) { + security_task_getsecid(current, &sid); + result = security_audit_rule_match(sid, + f->type, f->op, f->lsm_rule, NULL); + } + break; + default: + goto unlock_and_return; } + if (result < 0) /* error */ + goto unlock_and_return; + if (!result) + break; + } + if (result > 0) { + if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_TYPE) + ret = 0; + break; } - if (result) - goto unlock_and_return; } unlock_and_return: rcu_read_unlock(); - return result; + return ret; } static int update_lsm_rule(struct audit_krule *r) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7d0e3cf8abe1..5abf1dc1f91c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -63,7 +63,6 @@ #include <asm/unistd.h> #include <linux/security.h> #include <linux/list.h> -#include <linux/tty.h> #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> @@ -73,6 +72,7 @@ #include <linux/compat.h> #include <linux/ctype.h> #include <linux/string.h> +#include <linux/uaccess.h> #include <uapi/linux/limits.h> #include "audit.h" @@ -82,7 +82,8 @@ #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 -/* no execve audit message should be longer than this (userspace limits) */ +/* no execve audit message should be longer than this (userspace limits), + * see the note near the top of audit_log_execve_info() about this value */ #define MAX_EXECVE_AUDIT_LEN 7500 /* max length to print of cmdline/proctitle value during audit */ @@ -695,8 +696,12 @@ static int audit_filter_rules(struct task_struct *tsk, ctx->prio = rule->prio; } switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + case AUDIT_NEVER: + *state = AUDIT_DISABLED; + break; + case AUDIT_ALWAYS: + *state = AUDIT_RECORD_CONTEXT; + break; } return 1; } @@ -988,184 +993,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } -/* - * to_send and len_sent accounting are very loose estimates. We aren't - * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundary) - * - * why snprintf? an int is up to 12 digits long. if we just assumed when - * logging that a[%d]= was going to be 16 characters long we would be wasting - * space in every audit message. In one 7500 byte message we can log up to - * about 1000 min size arguments. That comes down to about 50% waste of space - * if we didn't do the snprintf to find out how long arg_num_len was. - */ -static int audit_log_single_execve_arg(struct audit_context *context, - struct audit_buffer **ab, - int arg_num, - size_t *len_sent, - const char __user *p, - char *buf) +static void audit_log_execve_info(struct audit_context *context, + struct audit_buffer **ab) { - char arg_num_len_buf[12]; - const char __user *tmp_p = p; - /* how many digits are in arg_num? 5 is the length of ' a=""' */ - size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; - size_t len, len_left, to_send; - size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; - unsigned int i, has_cntl = 0, too_long = 0; - int ret; - - /* strnlen_user includes the null we don't want to send */ - len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; - - /* - * We just created this mm, if we can't find the strings - * we just copied into it something is _very_ wrong. Similar - * for strings that are too long, we should not have created - * any. - */ - if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { - send_sig(SIGKILL, current, 0); - return -1; + long len_max; + long len_rem; + long len_full; + long len_buf; + long len_abuf; + long len_tmp; + bool require_data; + bool encode; + unsigned int iter; + unsigned int arg; + char *buf_head; + char *buf; + const char __user *p = (const char __user *)current->mm->arg_start; + + /* NOTE: this buffer needs to be large enough to hold all the non-arg + * data we put in the audit record for this argument (see the + * code below) ... at this point in time 96 is plenty */ + char abuf[96]; + + /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the + * current value of 7500 is not as important as the fact that it + * is less than 8k, a setting of 7500 gives us plenty of wiggle + * room if we go over a little bit in the logging below */ + WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); + len_max = MAX_EXECVE_AUDIT_LEN; + + /* scratch buffer to hold the userspace args */ + buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); + if (!buf_head) { + audit_panic("out of memory for argv string"); + return; } + buf = buf_head; + + audit_log_format(*ab, "argc=%d", context->execve.argc); - /* walk the whole argument looking for non-ascii chars */ + len_rem = len_max; + len_buf = 0; + len_full = 0; + require_data = true; + encode = false; + iter = 0; + arg = 0; do { - if (len_left > MAX_EXECVE_AUDIT_LEN) - to_send = MAX_EXECVE_AUDIT_LEN; - else - to_send = len_left; - ret = copy_from_user(buf, tmp_p, to_send); - /* - * There is no reason for this copy to be short. We just - * copied them here, and the mm hasn't been exposed to user- - * space yet. - */ - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; - } - buf[to_send] = '\0'; - has_cntl = audit_string_contains_control(buf, to_send); - if (has_cntl) { - /* - * hex messages get logged as 2 bytes, so we can only - * send half as much in each message - */ - max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; - break; - } - len_left -= to_send; - tmp_p += to_send; - } while (len_left > 0); - - len_left = len; - - if (len > max_execve_audit_len) - too_long = 1; - - /* rewalk the argument actually logging the message */ - for (i = 0; len_left > 0; i++) { - int room_left; - - if (len_left > max_execve_audit_len) - to_send = max_execve_audit_len; - else - to_send = len_left; - - /* do we have space left to send this argument in this ab? */ - room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; - if (has_cntl) - room_left -= (to_send * 2); - else - room_left -= to_send; - if (room_left < 0) { - *len_sent = 0; - audit_log_end(*ab); - *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); - if (!*ab) - return 0; - } + /* NOTE: we don't ever want to trust this value for anything + * serious, but the audit record format insists we + * provide an argument length for really long arguments, + * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but + * to use strncpy_from_user() to obtain this value for + * recording in the log, although we don't use it + * anywhere here to avoid a double-fetch problem */ + if (len_full == 0) + len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; + + /* read more data from userspace */ + if (require_data) { + /* can we make more room in the buffer? */ + if (buf != buf_head) { + memmove(buf_head, buf, len_buf); + buf = buf_head; + } + + /* fetch as much as we can of the argument */ + len_tmp = strncpy_from_user(&buf_head[len_buf], p, + len_max - len_buf); + if (len_tmp == -EFAULT) { + /* unable to copy from userspace */ + send_sig(SIGKILL, current, 0); + goto out; + } else if (len_tmp == (len_max - len_buf)) { + /* buffer is not large enough */ + require_data = true; + /* NOTE: if we are going to span multiple + * buffers force the encoding so we stand + * a chance at a sane len_full value and + * consistent record encoding */ + encode = true; + len_full = len_full * 2; + p += len_tmp; + } else { + require_data = false; + if (!encode) + encode = audit_string_contains_control( + buf, len_tmp); + /* try to use a trusted value for len_full */ + if (len_full < len_max) + len_full = (encode ? + len_tmp * 2 : len_tmp); + p += len_tmp + 1; + } + len_buf += len_tmp; + buf_head[len_buf] = '\0'; - /* - * first record needs to say how long the original string was - * so we can be sure nothing was lost. - */ - if ((i == 0) && (too_long)) - audit_log_format(*ab, " a%d_len=%zu", arg_num, - has_cntl ? 2*len : len); - - /* - * normally arguments are small enough to fit and we already - * filled buf above when we checked for control characters - * so don't bother with another copy_from_user - */ - if (len >= max_execve_audit_len) - ret = copy_from_user(buf, p, to_send); - else - ret = 0; - if (ret) { - WARN_ON(1); - send_sig(SIGKILL, current, 0); - return -1; + /* length of the buffer in the audit record? */ + len_abuf = (encode ? len_buf * 2 : len_buf + 2); } - buf[to_send] = '\0'; - - /* actually log it */ - audit_log_format(*ab, " a%d", arg_num); - if (too_long) - audit_log_format(*ab, "[%d]", i); - audit_log_format(*ab, "="); - if (has_cntl) - audit_log_n_hex(*ab, buf, to_send); - else - audit_log_string(*ab, buf); - - p += to_send; - len_left -= to_send; - *len_sent += arg_num_len; - if (has_cntl) - *len_sent += to_send * 2; - else - *len_sent += to_send; - } - /* include the null we didn't log */ - return len + 1; -} -static void audit_log_execve_info(struct audit_context *context, - struct audit_buffer **ab) -{ - int i, len; - size_t len_sent = 0; - const char __user *p; - char *buf; + /* write as much as we can to the audit log */ + if (len_buf > 0) { + /* NOTE: some magic numbers here - basically if we + * can't fit a reasonable amount of data into the + * existing audit buffer, flush it and start with + * a new buffer */ + if ((sizeof(abuf) + 8) > len_rem) { + len_rem = len_max; + audit_log_end(*ab); + *ab = audit_log_start(context, + GFP_KERNEL, AUDIT_EXECVE); + if (!*ab) + goto out; + } - p = (const char __user *)current->mm->arg_start; + /* create the non-arg portion of the arg record */ + len_tmp = 0; + if (require_data || (iter > 0) || + ((len_abuf + sizeof(abuf)) > len_rem)) { + if (iter == 0) { + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d_len=%lu", + arg, len_full); + } + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d[%d]=", arg, iter++); + } else + len_tmp += snprintf(&abuf[len_tmp], + sizeof(abuf) - len_tmp, + " a%d=", arg); + WARN_ON(len_tmp >= sizeof(abuf)); + abuf[sizeof(abuf) - 1] = '\0'; + + /* log the arg in the audit record */ + audit_log_format(*ab, "%s", abuf); + len_rem -= len_tmp; + len_tmp = len_buf; + if (encode) { + if (len_abuf > len_rem) + len_tmp = len_rem / 2; /* encoding */ + audit_log_n_hex(*ab, buf, len_tmp); + len_rem -= len_tmp * 2; + len_abuf -= len_tmp * 2; + } else { + if (len_abuf > len_rem) + len_tmp = len_rem - 2; /* quotes */ + audit_log_n_string(*ab, buf, len_tmp); + len_rem -= len_tmp + 2; + /* don't subtract the "2" because we still need + * to add quotes to the remaining string */ + len_abuf -= len_tmp; + } + len_buf -= len_tmp; + buf += len_tmp; + } - audit_log_format(*ab, "argc=%d", context->execve.argc); + /* ready to move to the next argument? */ + if ((len_buf == 0) && !require_data) { + arg++; + iter = 0; + len_full = 0; + require_data = true; + encode = false; + } + } while (arg < context->execve.argc); - /* - * we need some kernel buffer to hold the userspace args. Just - * allocate one big one rather than allocating one of the right size - * for every single argument inside audit_log_single_execve_arg() - * should be <8k allocation so should be pretty safe. - */ - buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); - if (!buf) { - audit_panic("out of memory for argv string"); - return; - } + /* NOTE: the caller handles the final audit_log_end() call */ - for (i = 0; i < context->execve.argc; i++) { - len = audit_log_single_execve_arg(context, ab, i, - &len_sent, p, buf); - if (len <= 0) - break; - p += len; - } - kfree(buf); +out: + kfree(buf_head); } static void show_special(struct audit_context *context, int *call_panic) @@ -1426,7 +1425,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, " cwd=", &context->pwd); + audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } @@ -1980,21 +1979,26 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, { struct audit_buffer *ab; uid_t uid, oldloginuid, loginuid; + struct tty_struct *tty; if (!audit_enabled) return; + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (!ab) + return; + uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid), + tty = audit_get_tty(current); - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (!ab) - return; audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); audit_log_task_context(ab); - audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", - oldloginuid, loginuid, oldsessionid, sessionid, !rc); + audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", + oldsessionid, sessionid, !rc); + audit_put_tty(tty); audit_log_end(ab); } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 76d5a794e426..633a650d7aeb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) } /* only called from syscall */ -static int fd_array_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *new_ptr, *old_ptr; @@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key, return -E2BIG; ufd = *(u32 *)value; - new_ptr = map->ops->map_fd_get_ptr(map, ufd); + new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); @@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) } } -static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +static void *prog_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file, int fd) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog = bpf_prog_get(fd); + if (IS_ERR(prog)) return prog; @@ -382,14 +384,13 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) bpf_prog_put(prog); return ERR_PTR(-EINVAL); } + return prog; } static void prog_fd_array_put_ptr(void *ptr) { - struct bpf_prog *prog = ptr; - - bpf_prog_put_rcu(prog); + bpf_prog_put(ptr); } /* decrement refcnt of all bpf_progs that are stored in this map */ @@ -407,7 +408,6 @@ static const struct bpf_map_ops prog_array_ops = { .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, - .map_update_elem = fd_array_map_update_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, @@ -425,59 +425,105 @@ static int __init register_prog_array_map(void) } late_initcall(register_prog_array_map); -static void perf_event_array_map_free(struct bpf_map *map) +static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, + struct file *map_file) { - bpf_fd_array_map_clear(map); - fd_array_map_free(map); + struct bpf_event_entry *ee; + + ee = kzalloc(sizeof(*ee), GFP_ATOMIC); + if (ee) { + ee->event = perf_file->private_data; + ee->perf_file = perf_file; + ee->map_file = map_file; + } + + return ee; } -static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +static void __bpf_event_entry_free(struct rcu_head *rcu) { - struct perf_event *event; - const struct perf_event_attr *attr; - struct file *file; + struct bpf_event_entry *ee; - file = perf_event_get(fd); - if (IS_ERR(file)) - return file; + ee = container_of(rcu, struct bpf_event_entry, rcu); + fput(ee->perf_file); + kfree(ee); +} - event = file->private_data; +static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) +{ + call_rcu(&ee->rcu, __bpf_event_entry_free); +} - attr = perf_event_attrs(event); - if (IS_ERR(attr)) - goto err; +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file, int fd) +{ + const struct perf_event_attr *attr; + struct bpf_event_entry *ee; + struct perf_event *event; + struct file *perf_file; - if (attr->inherit) - goto err; + perf_file = perf_event_get(fd); + if (IS_ERR(perf_file)) + return perf_file; - if (attr->type == PERF_TYPE_RAW) - return file; + event = perf_file->private_data; + ee = ERR_PTR(-EINVAL); - if (attr->type == PERF_TYPE_HARDWARE) - return file; + attr = perf_event_attrs(event); + if (IS_ERR(attr) || attr->inherit) + goto err_out; + + switch (attr->type) { + case PERF_TYPE_SOFTWARE: + if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) + goto err_out; + /* fall-through */ + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + ee = bpf_event_entry_gen(perf_file, map_file); + if (ee) + return ee; + ee = ERR_PTR(-ENOMEM); + /* fall-through */ + default: + break; + } - if (attr->type == PERF_TYPE_SOFTWARE && - attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return file; -err: - fput(file); - return ERR_PTR(-EINVAL); +err_out: + fput(perf_file); + return ee; } static void perf_event_fd_array_put_ptr(void *ptr) { - fput((struct file *)ptr); + bpf_event_entry_free_rcu(ptr); +} + +static void perf_event_fd_array_release(struct bpf_map *map, + struct file *map_file) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_event_entry *ee; + int i; + + rcu_read_lock(); + for (i = 0; i < array->map.max_entries; i++) { + ee = READ_ONCE(array->ptrs[i]); + if (ee && ee->map_file == map_file) + fd_array_map_delete_elem(map, &i); + } + rcu_read_unlock(); } static const struct bpf_map_ops perf_event_array_ops = { .map_alloc = fd_array_map_alloc, - .map_free = perf_event_array_map_free, + .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, - .map_update_elem = fd_array_map_update_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, + .map_release = perf_event_fd_array_release, }; static struct bpf_map_type_list perf_event_array_type __read_mostly = { @@ -491,3 +537,46 @@ static int __init register_perf_event_array_map(void) return 0; } late_initcall(register_perf_event_array_map); + +#ifdef CONFIG_SOCK_CGROUP_DATA +static void *cgroup_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int fd) +{ + return cgroup_get_from_fd(fd); +} + +static void cgroup_fd_array_put_ptr(void *ptr) +{ + /* cgroup_put free cgrp after a rcu grace period */ + cgroup_put(ptr); +} + +static void cgroup_fd_array_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static const struct bpf_map_ops cgroup_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = cgroup_fd_array_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = cgroup_fd_array_get_ptr, + .map_fd_put_ptr = cgroup_fd_array_put_ptr, +}; + +static struct bpf_map_type_list cgroup_array_type __read_mostly = { + .ops = &cgroup_array_ops, + .type = BPF_MAP_TYPE_CGROUP_ARRAY, +}; + +static int __init register_cgroup_array_map(void) +{ + bpf_register_map_type(&cgroup_array_type); + return 0; +} +late_initcall(register_cgroup_array_map); +#endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index be0abf669ced..03fd23d4d587 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -129,14 +129,83 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, return fp; } -EXPORT_SYMBOL_GPL(bpf_prog_realloc); void __bpf_prog_free(struct bpf_prog *fp) { kfree(fp->aux); vfree(fp); } -EXPORT_SYMBOL_GPL(__bpf_prog_free); + +static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + /* Call and Exit are both special jumps with no + * target inside the BPF instruction image. + */ + BPF_OP(insn->code) != BPF_CALL && + BPF_OP(insn->code) != BPF_EXIT; +} + +static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) +{ + struct bpf_insn *insn = prog->insnsi; + u32 i, insn_cnt = prog->len; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (!bpf_is_jmp_and_has_target(insn)) + continue; + + /* Adjust offset of jmps if we cross boundaries. */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) + insn->off -= delta; + } +} + +struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, + const struct bpf_insn *patch, u32 len) +{ + u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; + struct bpf_prog *prog_adj; + + /* Since our patchlet doesn't expand the image, we're done. */ + if (insn_delta == 0) { + memcpy(prog->insnsi + off, patch, sizeof(*patch)); + return prog; + } + + insn_adj_cnt = prog->len + insn_delta; + + /* Several new instructions need to be inserted. Make room + * for them. Likely, there's no need for a new allocation as + * last page could have large enough tailroom. + */ + prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), + GFP_USER); + if (!prog_adj) + return NULL; + + prog_adj->len = insn_adj_cnt; + + /* Patching happens in 3 steps: + * + * 1) Move over tail of insnsi from next instruction onwards, + * so we can patch the single target insn with one or more + * new ones (patching is always from 1 to n insns, n > 0). + * 2) Inject new instructions at the target location. + * 3) Adjust branch offsets if necessary. + */ + insn_rest = insn_adj_cnt - off - len; + + memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, + sizeof(*patch) * insn_rest); + memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); + + bpf_adj_branches(prog_adj, off, insn_delta); + + return prog_adj; +} #ifdef CONFIG_BPF_JIT struct bpf_binary_header * @@ -162,7 +231,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, hdr->pages = size / PAGE_SIZE; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); - start = (prandom_u32() % hole) & ~(alignment - 1); + start = (get_random_int() % hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; @@ -174,6 +243,209 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) { module_memfree(hdr); } + +int bpf_jit_harden __read_mostly; + +static int bpf_jit_blind_insn(const struct bpf_insn *from, + const struct bpf_insn *aux, + struct bpf_insn *to_buff) +{ + struct bpf_insn *to = to_buff; + u32 imm_rnd = get_random_int(); + s16 off; + + BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + + if (from->imm == 0 && + (from->code == (BPF_ALU | BPF_MOV | BPF_K) || + from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { + *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg); + goto out; + } + + switch (from->code) { + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MOV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX); + break; + + case BPF_ALU64 | BPF_ADD | BPF_K: + case BPF_ALU64 | BPF_SUB | BPF_K: + case BPF_ALU64 | BPF_AND | BPF_K: + case BPF_ALU64 | BPF_OR | BPF_K: + case BPF_ALU64 | BPF_XOR | BPF_K: + case BPF_ALU64 | BPF_MUL | BPF_K: + case BPF_ALU64 | BPF_MOV | BPF_K: + case BPF_ALU64 | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_K: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX); + break; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + /* Accommodate for extra offset in case of a backjump. */ + off = from->off; + if (off < 0) + off -= 2; + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); + break; + + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); + break; + + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg); + *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); + break; + + case BPF_LD | BPF_IMM | BPF_DW: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); + *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX); + break; + case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); + break; + + case BPF_ST | BPF_MEM | BPF_DW: + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); + break; + } +out: + return to - to_buff; +} + +static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + kmemcheck_annotate_bitfield(fp, meta); + + /* aux->prog still points to the fp_other one, so + * when promoting the clone to the real program, + * this still needs to be adapted. + */ + memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE); + } + + return fp; +} + +static void bpf_prog_clone_free(struct bpf_prog *fp) +{ + /* aux was stolen by the other clone, so we cannot free + * it from this path! It will be freed eventually by the + * other program on release. + * + * At this point, we don't need a deferred release since + * clone is guaranteed to not be locked. + */ + fp->aux = NULL; + __bpf_prog_free(fp); +} + +void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) +{ + /* We have to repoint aux->prog to self, as we don't + * know whether fp here is the clone or the original. + */ + fp->aux->prog = fp; + bpf_prog_clone_free(fp_other); +} + +struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) +{ + struct bpf_insn insn_buff[16], aux[2]; + struct bpf_prog *clone, *tmp; + int insn_delta, insn_cnt; + struct bpf_insn *insn; + int i, rewritten; + + if (!bpf_jit_blinding_enabled()) + return prog; + + clone = bpf_prog_clone_create(prog, GFP_USER); + if (!clone) + return ERR_PTR(-ENOMEM); + + insn_cnt = clone->len; + insn = clone->insnsi; + + for (i = 0; i < insn_cnt; i++, insn++) { + /* We temporarily need to hold the original ld64 insn + * so that we can still access the first part in the + * second blinding run. + */ + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) && + insn[1].code == 0) + memcpy(aux, insn, sizeof(aux)); + + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + if (!rewritten) + continue; + + tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); + if (!tmp) { + /* Patching may have repointed aux->prog during + * realloc from the original one, so we need to + * fix it up here on error. + */ + bpf_jit_prog_release_other(prog, clone); + return ERR_PTR(-ENOMEM); + } + + clone = tmp; + insn_delta = rewritten - 1; + + /* Walk new program and skip insns we just inserted. */ + insn = clone->insnsi + i + insn_delta; + insn_cnt += insn_delta; + i += insn_delta; + } + + return clone; +} #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, @@ -447,14 +719,13 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; prog = READ_ONCE(array->ptrs[index]); - if (unlikely(!prog)) + if (!prog) goto out; /* ARG1 at this point is guaranteed to point to CTX from @@ -692,15 +963,22 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program + * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via BPF_PROG_RUN() macro. */ -int bpf_prog_select_runtime(struct bpf_prog *fp) +struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { fp->bpf_func = (void *) __bpf_prog_run; - bpf_int_jit_compile(fp); + /* eBPF JITs can rewrite the program in case constant + * blinding is active. However, in case of error during + * blinding, bpf_int_jit_compile() must always return a + * valid program, which in this case would simply not + * be JITed, but falls back to the interpreter. + */ + fp = bpf_int_jit_compile(fp); bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -708,7 +986,9 @@ int bpf_prog_select_runtime(struct bpf_prog *fp) * with JITed or non JITed program concatenations and not * all eBPF JITs might immediately support all features. */ - return bpf_check_tail_call(fp); + *err = bpf_check_tail_call(fp); + + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); @@ -764,14 +1044,23 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; + const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; + const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { return NULL; } +u64 __weak +bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) +{ + return -ENOTSUPP; +} + /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { .func = NULL, @@ -783,8 +1072,14 @@ const struct bpf_func_proto bpf_tail_call_proto = { }; /* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ -void __weak bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) +{ + return prog; +} + +bool __weak bpf_helper_changes_skb_data(void *func) { + return false; } /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fff3650d52fc..570eeca7bdfa 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -26,11 +26,18 @@ struct bpf_htab { struct bucket *buckets; void *elems; struct pcpu_freelist freelist; + void __percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; +enum extra_elem_state { + HTAB_NOT_AN_EXTRA_ELEM = 0, + HTAB_EXTRA_ELEM_FREE, + HTAB_EXTRA_ELEM_USED +}; + /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { @@ -38,7 +45,10 @@ struct htab_elem { struct bpf_htab *htab; struct pcpu_freelist_node fnode; }; - struct rcu_head rcu; + union { + struct rcu_head rcu; + enum extra_elem_state state; + }; u32 hash; char key[0] __aligned(8); }; @@ -113,6 +123,23 @@ free_elems: return err; } +static int alloc_extra_elems(struct bpf_htab *htab) +{ + void __percpu *pptr; + int cpu; + + pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state = + HTAB_EXTRA_ELEM_FREE; + } + htab->extra_elems = pptr; + return 0; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { @@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (percpu) cost += (u64) round_up(htab->map.value_size, 8) * num_possible_cpus() * htab->map.max_entries; + else + cost += (u64) htab->elem_size * num_possible_cpus(); if (cost >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ @@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } + if (!percpu) { + err = alloc_extra_elems(htab); + if (err) + goto free_buckets; + } + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { err = prealloc_elems_and_freelist(htab); if (err) - goto free_buckets; + goto free_extra_elems; } return &htab->map; +free_extra_elems: + free_percpu(htab->extra_elems); free_buckets: kvfree(htab->buckets); free_htab: @@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); kfree(l); - } static void htab_elem_free_rcu(struct rcu_head *head) @@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { + if (l->state == HTAB_EXTRA_ELEM_USED) { + l->state = HTAB_EXTRA_ELEM_FREE; + return; + } + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { pcpu_freelist_push(&htab->freelist, &l->fnode); } else { @@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, - bool percpu, bool onallcpus) + bool percpu, bool onallcpus, + bool old_elem_exists) { u32 size = htab->map.value_size; bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); struct htab_elem *l_new; void __percpu *pptr; + int err = 0; if (prealloc) { l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); if (!l_new) - return ERR_PTR(-E2BIG); + err = -E2BIG; } else { if (atomic_inc_return(&htab->count) > htab->map.max_entries) { atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); + err = -E2BIG; + } else { + l_new = kmalloc(htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); } - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return ERR_PTR(-ENOMEM); + } + + if (err) { + if (!old_elem_exists) + return ERR_PTR(err); + + /* if we're updating the existing element and the hash table + * is full, use per-cpu extra elems + */ + l_new = this_cpu_ptr(htab->extra_elems); + if (l_new->state != HTAB_EXTRA_ELEM_FREE) + return ERR_PTR(-E2BIG); + l_new->state = HTAB_EXTRA_ELEM_USED; + } else { + l_new->state = HTAB_NOT_AN_EXTRA_ELEM; } memcpy(l_new->key, key, key_size); @@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; - l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, + !!l_old); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus); + hash, true, onallcpus, false); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; @@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map) htab_free_elems(htab); pcpu_freelist_destroy(&htab->freelist); } + free_percpu(htab->extra_elems); kvfree(htab->buckets); kfree(htab); } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 50da680c479f..1ea3afba1a4f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { - return raw_smp_processor_id(); + return smp_processor_id(); } const struct bpf_func_proto bpf_get_smp_processor_id_proto = { @@ -163,17 +163,26 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) struct task_struct *task = current; char *buf = (char *) (long) r1; - if (!task) - return -EINVAL; + if (unlikely(!task)) + goto err_clear; - strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + strncpy(buf, task->comm, size); + + /* Verifier guarantees that size > 0. For task->comm exceeding + * size, guarantee that buf is %NUL-terminated. Unconditionally + * done here to save the size test. + */ + buf[size - 1] = 0; return 0; +err_clear: + memset(buf, 0, size); + return -EINVAL; } const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, + .arg1_type = ARG_PTR_TO_RAW_STACK, .arg2_type = ARG_CONST_STACK_SIZE, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 8f94ca1860cf..5967b870a895 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -11,7 +11,7 @@ * version 2 as published by the Free Software Foundation. */ -#include <linux/module.h> +#include <linux/init.h> #include <linux/magic.h> #include <linux/major.h> #include <linux/mount.h> @@ -119,18 +119,10 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) return 0; } -static bool bpf_dname_reserved(const struct dentry *dentry) -{ - return strchr(dentry->d_name.name, '.'); -} - static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - if (bpf_dname_reserved(dentry)) - return -EPERM; - inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -152,9 +144,6 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, { struct inode *inode; - if (bpf_dname_reserved(dentry)) - return -EPERM; - inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -187,31 +176,21 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, } } -static int bpf_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry) +static struct dentry * +bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { - if (bpf_dname_reserved(new_dentry)) - return -EPERM; - - return simple_link(old_dentry, dir, new_dentry); -} - -static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - if (bpf_dname_reserved(new_dentry)) - return -EPERM; - - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); + if (strchr(dentry->d_name.name, '.')) + return ERR_PTR(-EPERM); + return simple_lookup(dir, dentry, flags); } static const struct inode_operations bpf_dir_iops = { - .lookup = simple_lookup, + .lookup = bpf_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, - .rename = bpf_rename, - .link = bpf_link, + .rename = simple_rename, + .link = simple_link, .unlink = simple_unlink, }; @@ -378,7 +357,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) static struct dentry *bpf_mount(struct file_system_type *type, int flags, const char *dev_name, void *data) { - return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); + return mount_nodev(type, flags, data, bpf_fill_super); } static struct file_system_type bpf_fs_type = { @@ -386,11 +365,8 @@ static struct file_system_type bpf_fs_type = { .name = "bpf", .mount = bpf_mount, .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, }; -MODULE_ALIAS_FS("bpf"); - static int __init bpf_init(void) { int ret; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 499d9e933f8e..bf4495fcd25d 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8 || - value_size / 8 > PERF_MAX_STACK_DEPTH) + value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2 */ @@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (err) goto free_smap; - err = get_callchain_buffers(); + err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) goto free_smap; @@ -116,7 +116,7 @@ free_smap: return ERR_PTR(err); } -static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) { struct pt_regs *regs = (struct pt_regs *) (long) r1; struct bpf_map *map = (struct bpf_map *) (long) r2; @@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 max_depth = map->value_size / 8; - /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */ - u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth; + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; @@ -136,14 +136,15 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - trace = get_perf_callchain(regs, init_nr, kernel, user, false, false); + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ return -EFAULT; /* get_perf_callchain() guarantees that trace->nr >= init_nr - * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth + * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth */ trace_nr = trace->nr - init_nr; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf5e9f7ad13a..228f962447a5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map) static int bpf_map_release(struct inode *inode, struct file *filp) { - bpf_map_put_with_uref(filp->private_data); + struct bpf_map *map = filp->private_data; + + if (map->ops->map_release) + map->ops->map_release(map, filp); + + bpf_map_put_with_uref(map); return 0; } @@ -387,6 +392,13 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || + map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { + rcu_read_lock(); + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); @@ -612,7 +624,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) free_uid(user); } -static void __prog_put_common(struct rcu_head *rcu) +static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); @@ -621,17 +633,10 @@ static void __prog_put_common(struct rcu_head *rcu) bpf_prog_free(aux->prog); } -/* version of bpf_prog_put() that is called after a grace period */ -void bpf_prog_put_rcu(struct bpf_prog *prog) -{ - if (atomic_dec_and_test(&prog->aux->refcnt)) - call_rcu(&prog->aux->rcu, __prog_put_common); -} - void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) - __prog_put_common(&prog->aux->rcu); + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -639,7 +644,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) { struct bpf_prog *prog = filp->private_data; - bpf_prog_put_rcu(prog); + bpf_prog_put(prog); return 0; } @@ -653,7 +658,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog) O_RDWR | O_CLOEXEC); } -static struct bpf_prog *__bpf_prog_get(struct fd f) +static struct bpf_prog *____bpf_prog_get(struct fd f) { if (!f.file) return ERR_PTR(-EBADF); @@ -665,33 +670,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f) return f.file->private_data; } -struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) { - if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { - atomic_dec(&prog->aux->refcnt); + if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_sub(i, &prog->aux->refcnt); return ERR_PTR(-EBUSY); } return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_add); -/* called by sockets/tracing/seccomp before attaching program to an event - * pairs with bpf_prog_put() - */ -struct bpf_prog *bpf_prog_get(u32 ufd) +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + return bpf_prog_add(prog, 1); +} + +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); struct bpf_prog *prog; - prog = __bpf_prog_get(f); + prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; + if (type && prog->type != *type) { + prog = ERR_PTR(-EINVAL); + goto out; + } prog = bpf_prog_inc(prog); +out: fdput(f); - return prog; } -EXPORT_SYMBOL_GPL(bpf_prog_get); + +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + return __bpf_prog_get(ufd, NULL); +} + +struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) +{ + return __bpf_prog_get(ufd, &type); +} +EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD kern_version @@ -762,7 +784,7 @@ static int bpf_prog_load(union bpf_attr *attr) fixup_bpf_calls(prog); /* eBPF program is ready to be JITed */ - err = bpf_prog_select_runtime(prog); + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c5c17a62f509..daea765d72e6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -125,24 +126,18 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ -/* types of values stored in eBPF registers */ -enum bpf_reg_type { - NOT_INIT = 0, /* nothing was written into register */ - UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ - PTR_TO_CTX, /* reg points to bpf_context */ - CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ - PTR_TO_MAP_VALUE, /* reg points to map element value */ - PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ - FRAME_PTR, /* reg == frame_pointer */ - PTR_TO_STACK, /* reg == frame_pointer + imm */ - CONST_IMM, /* constant integer value */ -}; - struct reg_state { enum bpf_reg_type type; union { - /* valid when type == CONST_IMM | PTR_TO_STACK */ - int imm; + /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ + s64 imm; + + /* valid when type == PTR_TO_PACKET* */ + struct { + u32 id; + u16 off; + u16 range; + }; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL @@ -199,9 +194,20 @@ struct verifier_env { struct verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ + u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; }; +#define BPF_COMPLEXITY_LIMIT_INSNS 65536 +#define BPF_COMPLEXITY_LIMIT_STACK 1024 + +struct bpf_call_arg_meta { + struct bpf_map *map_ptr; + bool raw_mode; + int regno; + int access_size; +}; + /* verbose verifier prints what it's seeing * bpf_check() is called under lock, so no race to access these global vars */ @@ -237,30 +243,39 @@ static const char * const reg_type_str[] = { [FRAME_PTR] = "fp", [PTR_TO_STACK] = "fp", [CONST_IMM] = "imm", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_END] = "pkt_end", }; -static void print_verifier_state(struct verifier_env *env) +static void print_verifier_state(struct verifier_state *state) { + struct reg_state *reg; enum bpf_reg_type t; int i; for (i = 0; i < MAX_BPF_REG; i++) { - t = env->cur_state.regs[i].type; + reg = &state->regs[i]; + t = reg->type; if (t == NOT_INIT) continue; verbose(" R%d=%s", i, reg_type_str[t]); if (t == CONST_IMM || t == PTR_TO_STACK) - verbose("%d", env->cur_state.regs[i].imm); + verbose("%lld", reg->imm); + else if (t == PTR_TO_PACKET) + verbose("(id=%d,off=%d,r=%d)", + reg->id, reg->off, reg->range); + else if (t == UNKNOWN_VALUE && reg->imm) + verbose("%lld", reg->imm); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose("(ks=%d,vs=%d)", - env->cur_state.regs[i].map_ptr->key_size, - env->cur_state.regs[i].map_ptr->value_size); + reg->map_ptr->key_size, + reg->map_ptr->value_size); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (env->cur_state.stack_slot_type[i] == STACK_SPILL) + if (state->stack_slot_type[i] == STACK_SPILL) verbose(" fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); + reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); } verbose("\n"); } @@ -444,7 +459,7 @@ static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, elem->next = env->head; env->head = elem; env->stack_size++; - if (env->stack_size > 1024) { + if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose("BPF program is too complex\n"); goto err; } @@ -467,7 +482,6 @@ static void init_reg_state(struct reg_state *regs) for (i = 0; i < MAX_BPF_REG; i++) { regs[i].type = NOT_INIT; regs[i].imm = 0; - regs[i].map_ptr = NULL; } /* frame pointer */ @@ -482,7 +496,6 @@ static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; regs[regno].imm = 0; - regs[regno].map_ptr = NULL; } enum reg_arg_type { @@ -538,6 +551,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_MAP_VALUE_OR_NULL: case PTR_TO_STACK: case PTR_TO_CTX: + case PTR_TO_PACKET: + case PTR_TO_PACKET_END: case FRAME_PTR: case CONST_PTR_TO_MAP: return true; @@ -637,13 +652,44 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, return 0; } +#define MAX_PACKET_OFF 0xffff + +static bool may_write_pkt_data(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_XDP: + return true; + default: + return false; + } +} + +static int check_packet_access(struct verifier_env *env, u32 regno, int off, + int size) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *reg = ®s[regno]; + + off += reg->off; + if (off < 0 || off + size > reg->range) { + verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, reg->off, reg->range); + return -EACCES; + } + return 0; +} + /* check access to 'struct bpf_context' fields */ static int check_ctx_access(struct verifier_env *env, int off, int size, - enum bpf_access_type t) + enum bpf_access_type t, enum bpf_reg_type *reg_type) { if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t)) + env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { + /* remember the offset of last byte accessed in ctx */ + if (env->prog->aux->max_ctx_offset < off + size) + env->prog->aux->max_ctx_offset = off + size; return 0; + } verbose("invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; @@ -663,6 +709,46 @@ static bool is_pointer_value(struct verifier_env *env, int regno) } } +static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, + int off, int size) +{ + if (reg->type != PTR_TO_PACKET) { + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", off, size); + return -EACCES; + } else { + return 0; + } + } + + switch (env->prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + break; + default: + verbose("verifier is misconfigured\n"); + return -EACCES; + } + + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + /* misaligned access to packet is ok on x86,arm,arm64 */ + return 0; + + if (reg->id && size != 1) { + verbose("Unknown packet alignment. Only byte-sized access allowed\n"); + return -EACCES; + } + + /* skb->data is NET_IP_ALIGN-ed */ + if ((NET_IP_ALIGN + reg->off + off) % size != 0) { + verbose("misaligned packet access off %d+%d+%d size %d\n", + NET_IP_ALIGN, reg->off, off, size); + return -EACCES; + } + return 0; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -674,21 +760,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, int value_regno) { struct verifier_state *state = &env->cur_state; + struct reg_state *reg = &state->regs[regno]; int size, err = 0; - if (state->regs[regno].type == PTR_TO_STACK) - off += state->regs[regno].imm; + if (reg->type == PTR_TO_STACK) + off += reg->imm; size = bpf_size_to_bytes(bpf_size); if (size < 0) return size; - if (off % size != 0) { - verbose("misaligned access off %d size %d\n", off, size); - return -EACCES; - } + err = check_ptr_alignment(env, reg, off, size); + if (err) + return err; - if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into map\n", value_regno); @@ -698,18 +784,23 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); - } else if (state->regs[regno].type == PTR_TO_CTX) { + } else if (reg->type == PTR_TO_CTX) { + enum bpf_reg_type reg_type = UNKNOWN_VALUE; + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } - err = check_ctx_access(env, off, size, t); - if (!err && t == BPF_READ && value_regno >= 0) + err = check_ctx_access(env, off, size, t, ®_type); + if (!err && t == BPF_READ && value_regno >= 0) { mark_reg_unknown_value(state->regs, value_regno); + if (env->allow_ptr_leaks) + /* note that reg.[id|off|range] == 0 */ + state->regs[value_regno].type = reg_type; + } - } else if (state->regs[regno].type == FRAME_PTR || - state->regs[regno].type == PTR_TO_STACK) { + } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { if (off >= 0 || off < -MAX_BPF_STACK) { verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; @@ -725,11 +816,33 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, } else { err = check_stack_read(state, off, size, value_regno); } + } else if (state->regs[regno].type == PTR_TO_PACKET) { + if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { + verbose("cannot write into packet\n"); + return -EACCES; + } + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose("R%d leaks addr into packet\n", value_regno); + return -EACCES; + } + err = check_packet_access(env, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); } else { verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[state->regs[regno].type]); + regno, reg_type_str[reg->type]); return -EACCES; } + + if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks && + state->regs[value_regno].type == UNKNOWN_VALUE) { + /* 1 or 2 byte load zero-extends, determine the number of + * zero upper bits. Not doing it fo 4 byte load, since + * such values cannot be added to ptr_to_packet anyway. + */ + state->regs[value_regno].imm = 64 - size * 8; + } return err; } @@ -770,7 +883,8 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * and all elements of stack are initialized */ static int check_stack_boundary(struct verifier_env *env, int regno, - int access_size, bool zero_size_allowed) + int access_size, bool zero_size_allowed, + struct bpf_call_arg_meta *meta) { struct verifier_state *state = &env->cur_state; struct reg_state *regs = state->regs; @@ -796,6 +910,12 @@ static int check_stack_boundary(struct verifier_env *env, int regno, return -EACCES; } + if (meta && meta->raw_mode) { + meta->access_size = access_size; + meta->regno = regno; + return 0; + } + for (i = 0; i < access_size; i++) { if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { verbose("invalid indirect read from stack off %d+%d size %d\n", @@ -807,7 +927,8 @@ static int check_stack_boundary(struct verifier_env *env, int regno, } static int check_func_arg(struct verifier_env *env, u32 regno, - enum bpf_arg_type arg_type, struct bpf_map **mapp) + enum bpf_arg_type arg_type, + struct bpf_call_arg_meta *meta) { struct reg_state *reg = env->cur_state.regs + regno; enum bpf_reg_type expected_type; @@ -839,7 +960,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, expected_type = CONST_PTR_TO_MAP; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; - } else if (arg_type == ARG_PTR_TO_STACK) { + } else if (arg_type == ARG_PTR_TO_STACK || + arg_type == ARG_PTR_TO_RAW_STACK) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a CONST_IMM type. Final test @@ -847,6 +969,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, */ if (reg->type == CONST_IMM && reg->imm == 0) expected_type = CONST_IMM; + meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -860,14 +983,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ - *mapp = reg->map_ptr; - + meta->map_ptr = reg->map_ptr; } else if (arg_type == ARG_PTR_TO_MAP_KEY) { /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized */ - if (!*mapp) { + if (!meta->map_ptr) { /* in function declaration map_ptr must come before * map_key, so that it's verified and known before * we have to check map_key here. Otherwise it means @@ -876,19 +998,20 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->key_size, - false); + err = check_stack_boundary(env, regno, meta->map_ptr->key_size, + false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ - if (!*mapp) { + if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->value_size, - false); + err = check_stack_boundary(env, regno, + meta->map_ptr->value_size, + false, NULL); } else if (arg_type == ARG_CONST_STACK_SIZE || arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); @@ -903,7 +1026,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } err = check_stack_boundary(env, regno - 1, reg->imm, - zero_size_allowed); + zero_size_allowed, meta); } return err; @@ -929,6 +1052,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (func_id != BPF_FUNC_get_stackid) goto error; break; + case BPF_MAP_TYPE_CGROUP_ARRAY: + if (func_id != BPF_FUNC_skb_under_cgroup) + goto error; + break; default: break; } @@ -948,6 +1075,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; + case BPF_FUNC_skb_under_cgroup: + if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) + goto error; + break; default: break; } @@ -959,13 +1090,55 @@ error: return -EINVAL; } +static int check_raw_mode(const struct bpf_func_proto *fn) +{ + int count = 0; + + if (fn->arg1_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg2_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg3_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg4_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg5_type == ARG_PTR_TO_RAW_STACK) + count++; + + return count > 1 ? -EINVAL : 0; +} + +static void clear_all_pkt_pointers(struct verifier_env *env) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs, *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + if (regs[i].type == PTR_TO_PACKET || + regs[i].type == PTR_TO_PACKET_END) + mark_reg_unknown_value(regs, i); + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + reg = &state->spilled_regs[i / BPF_REG_SIZE]; + if (reg->type != PTR_TO_PACKET && + reg->type != PTR_TO_PACKET_END) + continue; + reg->type = UNKNOWN_VALUE; + reg->imm = 0; + } +} + static int check_call(struct verifier_env *env, int func_id) { struct verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; struct reg_state *regs = state->regs; - struct bpf_map *map = NULL; struct reg_state *reg; + struct bpf_call_arg_meta meta; + bool changes_data; int i, err; /* find function prototype */ @@ -988,23 +1161,45 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } + changes_data = bpf_helper_changes_skb_data(fn->func); + + memset(&meta, 0, sizeof(meta)); + + /* We only support one arg being in raw mode at the moment, which + * is sufficient for the helper functions we have right now. + */ + err = check_raw_mode(fn); + if (err) { + verbose("kernel subsystem misconfigured func %d\n", func_id); + return err; + } + /* check args */ - err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); + err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); + err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); + err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); + err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); + err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta); if (err) return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset + * is inferred from register state. + */ + for (i = 0; i < meta.access_size; i++) { + err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); + if (err) + return err; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { reg = regs + caller_saved[i]; @@ -1023,28 +1218,225 @@ static int check_call(struct verifier_env *env, int func_id) * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ - if (map == NULL) { + if (meta.map_ptr == NULL) { verbose("kernel subsystem misconfigured verifier\n"); return -EINVAL; } - regs[BPF_REG_0].map_ptr = map; + regs[BPF_REG_0].map_ptr = meta.map_ptr; } else { verbose("unknown return type %d of func %d\n", fn->ret_type, func_id); return -EINVAL; } - err = check_map_func_compatibility(map, func_id); + err = check_map_func_compatibility(meta.map_ptr, func_id); if (err) return err; + if (changes_data) + clear_all_pkt_pointers(env); + return 0; +} + +static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *dst_reg = ®s[insn->dst_reg]; + struct reg_state *src_reg = ®s[insn->src_reg]; + struct reg_state tmp_reg; + s32 imm; + + if (BPF_SRC(insn->code) == BPF_K) { + /* pkt_ptr += imm */ + imm = insn->imm; + +add_imm: + if (imm <= 0) { + verbose("addition of negative constant to packet pointer is not allowed\n"); + return -EACCES; + } + if (imm >= MAX_PACKET_OFF || + imm + dst_reg->off >= MAX_PACKET_OFF) { + verbose("constant %d is too large to add to packet pointer\n", + imm); + return -EACCES; + } + /* a constant was added to pkt_ptr. + * Remember it while keeping the same 'id' + */ + dst_reg->off += imm; + } else { + if (src_reg->type == PTR_TO_PACKET) { + /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ + tmp_reg = *dst_reg; /* save r7 state */ + *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */ + src_reg = &tmp_reg; /* pretend it's src_reg state */ + /* if the checks below reject it, the copy won't matter, + * since we're rejecting the whole program. If all ok, + * then imm22 state will be added to r7 + * and r7 will be pkt(id=0,off=22,r=62) while + * r6 will stay as pkt(id=0,off=0,r=62) + */ + } + + if (src_reg->type == CONST_IMM) { + /* pkt_ptr += reg where reg is known constant */ + imm = src_reg->imm; + goto add_imm; + } + /* disallow pkt_ptr += reg + * if reg is not uknown_value with guaranteed zero upper bits + * otherwise pkt_ptr may overflow and addition will become + * subtraction which is not allowed + */ + if (src_reg->type != UNKNOWN_VALUE) { + verbose("cannot add '%s' to ptr_to_packet\n", + reg_type_str[src_reg->type]); + return -EACCES; + } + if (src_reg->imm < 48) { + verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n", + src_reg->imm); + return -EACCES; + } + /* dst_reg stays as pkt_ptr type and since some positive + * integer value was added to the pointer, increment its 'id' + */ + dst_reg->id = ++env->id_gen; + + /* something was added to pkt_ptr, set range and off to zero */ + dst_reg->off = 0; + dst_reg->range = 0; + } + return 0; +} + +static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *dst_reg = ®s[insn->dst_reg]; + u8 opcode = BPF_OP(insn->code); + s64 imm_log2; + + /* for type == UNKNOWN_VALUE: + * imm > 0 -> number of zero upper bits + * imm == 0 -> don't track which is the same as all bits can be non-zero + */ + + if (BPF_SRC(insn->code) == BPF_X) { + struct reg_state *src_reg = ®s[insn->src_reg]; + + if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && + dst_reg->imm && opcode == BPF_ADD) { + /* dreg += sreg + * where both have zero upper bits. Adding them + * can only result making one more bit non-zero + * in the larger value. + * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) + * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) + */ + dst_reg->imm = min(dst_reg->imm, src_reg->imm); + dst_reg->imm--; + return 0; + } + if (src_reg->type == CONST_IMM && src_reg->imm > 0 && + dst_reg->imm && opcode == BPF_ADD) { + /* dreg += sreg + * where dreg has zero upper bits and sreg is const. + * Adding them can only result making one more bit + * non-zero in the larger value. + */ + imm_log2 = __ilog2_u64((long long)src_reg->imm); + dst_reg->imm = min(dst_reg->imm, 63 - imm_log2); + dst_reg->imm--; + return 0; + } + /* all other cases non supported yet, just mark dst_reg */ + dst_reg->imm = 0; + return 0; + } + + /* sign extend 32-bit imm into 64-bit to make sure that + * negative values occupy bit 63. Note ilog2() would have + * been incorrect, since sizeof(insn->imm) == 4 + */ + imm_log2 = __ilog2_u64((long long)insn->imm); + + if (dst_reg->imm && opcode == BPF_LSH) { + /* reg <<= imm + * if reg was a result of 2 byte load, then its imm == 48 + * which means that upper 48 bits are zero and shifting this reg + * left by 4 would mean that upper 44 bits are still zero + */ + dst_reg->imm -= insn->imm; + } else if (dst_reg->imm && opcode == BPF_MUL) { + /* reg *= imm + * if multiplying by 14 subtract 4 + * This is conservative calculation of upper zero bits. + * It's not trying to special case insn->imm == 1 or 0 cases + */ + dst_reg->imm -= imm_log2 + 1; + } else if (opcode == BPF_AND) { + /* reg &= imm */ + dst_reg->imm = 63 - imm_log2; + } else if (dst_reg->imm && opcode == BPF_ADD) { + /* reg += imm */ + dst_reg->imm = min(dst_reg->imm, 63 - imm_log2); + dst_reg->imm--; + } else if (opcode == BPF_RSH) { + /* reg >>= imm + * which means that after right shift, upper bits will be zero + * note that verifier already checked that + * 0 <= imm < 64 for shift insn + */ + dst_reg->imm += insn->imm; + if (unlikely(dst_reg->imm > 64)) + /* some dumb code did: + * r2 = *(u32 *)mem; + * r2 >>= 32; + * and all bits are zero now */ + dst_reg->imm = 64; + } else { + /* all other alu ops, means that we don't know what will + * happen to the value, mark it with unknown number of zero bits + */ + dst_reg->imm = 0; + } + + if (dst_reg->imm < 0) { + /* all 64 bits of the register can contain non-zero bits + * and such value cannot be added to ptr_to_packet, since it + * may overflow, mark it as unknown to avoid further eval + */ + dst_reg->imm = 0; + } + return 0; +} + +static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *dst_reg = ®s[insn->dst_reg]; + struct reg_state *src_reg = ®s[insn->src_reg]; + u8 opcode = BPF_OP(insn->code); + + /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. + * Don't care about overflow or negative values, just add them + */ + if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) + dst_reg->imm += insn->imm; + else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) + dst_reg->imm += src_reg->imm; + else + mark_reg_unknown_value(regs, insn->dst_reg); return 0; } /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct reg_state *regs = env->cur_state.regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1133,8 +1525,6 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) } else { /* all other ALU ops: and, sub, xor, add, ... */ - bool stack_relative = false; - if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { verbose("BPF_ALU uses reserved fields\n"); @@ -1172,11 +1562,36 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) } } + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + dst_reg = ®s[insn->dst_reg]; + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && - regs[insn->dst_reg].type == FRAME_PTR && - BPF_SRC(insn->code) == BPF_K) { - stack_relative = true; + dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { + dst_reg->type = PTR_TO_STACK; + dst_reg->imm = insn->imm; + return 0; + } else if (opcode == BPF_ADD && + BPF_CLASS(insn->code) == BPF_ALU64 && + (dst_reg->type == PTR_TO_PACKET || + (BPF_SRC(insn->code) == BPF_X && + regs[insn->src_reg].type == PTR_TO_PACKET))) { + /* ptr_to_packet += K|X */ + return check_packet_ptr_add(env, insn); + } else if (BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == UNKNOWN_VALUE && + env->allow_ptr_leaks) { + /* unknown += K|X */ + return evaluate_reg_alu(env, insn); + } else if (BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == CONST_IMM && + env->allow_ptr_leaks) { + /* reg_imm += K|X */ + return evaluate_reg_imm_alu(env, insn); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer arithmetic prohibited\n", insn->dst_reg); @@ -1188,24 +1603,45 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EACCES; } - /* check dest operand */ - err = check_reg_arg(regs, insn->dst_reg, DST_OP); - if (err) - return err; - - if (stack_relative) { - regs[insn->dst_reg].type = PTR_TO_STACK; - regs[insn->dst_reg].imm = insn->imm; - } + /* mark dest operand */ + mark_reg_unknown_value(regs, insn->dst_reg); } return 0; } +static void find_good_pkt_pointers(struct verifier_env *env, + struct reg_state *dst_reg) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs, *reg; + int i; + /* r2 = r3; + * r2 += 8 + * if (r2 > pkt_end) goto somewhere + * r2 == dst_reg, pkt_end == src_reg, + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) + * so that range of bytes [r3, r3 + 8) is safe to access + */ + for (i = 0; i < MAX_BPF_REG; i++) + if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + regs[i].range = dst_reg->off; + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + reg = &state->spilled_regs[i / BPF_REG_SIZE]; + if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + reg->range = dst_reg->off; + } +} + static int check_cond_jmp_op(struct verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct reg_state *regs = env->cur_state.regs; + struct reg_state *regs = env->cur_state.regs, *dst_reg; struct verifier_state *other_branch; u8 opcode = BPF_OP(insn->code); int err; @@ -1243,11 +1679,12 @@ static int check_cond_jmp_op(struct verifier_env *env, if (err) return err; + dst_reg = ®s[insn->dst_reg]; + /* detect if R == 0 where R was initialized to zero earlier */ if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && - regs[insn->dst_reg].type == CONST_IMM && - regs[insn->dst_reg].imm == insn->imm) { + dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) { if (opcode == BPF_JEQ) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through @@ -1269,44 +1706,30 @@ static int check_cond_jmp_op(struct verifier_env *env, /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && - insn->imm == 0 && (opcode == BPF_JEQ || - opcode == BPF_JNE) && - regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { + insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && + dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { if (opcode == BPF_JEQ) { /* next fallthrough insn can access memory via * this register */ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; /* branch targer cannot access it, since reg == 0 */ - other_branch->regs[insn->dst_reg].type = CONST_IMM; - other_branch->regs[insn->dst_reg].imm = 0; + mark_reg_unknown_value(other_branch->regs, + insn->dst_reg); } else { other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = 0; + mark_reg_unknown_value(regs, insn->dst_reg); } + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + find_good_pkt_pointers(env, dst_reg); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; - } else if (BPF_SRC(insn->code) == BPF_K && - (opcode == BPF_JEQ || opcode == BPF_JNE)) { - - if (opcode == BPF_JEQ) { - /* detect if (R == imm) goto - * and in the target state recognize that R = imm - */ - other_branch->regs[insn->dst_reg].type = CONST_IMM; - other_branch->regs[insn->dst_reg].imm = insn->imm; - } else { - /* detect if (R != imm) goto - * and in the fall-through state recognize that R = imm - */ - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = insn->imm; - } } if (log_level) - print_verifier_state(env); + print_verifier_state(&env->cur_state); return 0; } @@ -1384,14 +1807,14 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) int i, err; if (!may_access_skb(env->prog->type)) { - verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); + verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_ABS uses reserved fields\n"); + verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -1555,6 +1978,8 @@ peek_stack: goto peek_stack; else if (ret < 0) goto err_free; + if (t + 1 < insn_cnt) + env->explored_states[t + 1] = STATE_LIST_MARK; } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@ -1622,6 +2047,58 @@ err_free: return ret; } +/* the following conditions reduce the number of explored insns + * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet + */ +static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) +{ + if (old->id != cur->id) + return false; + + /* old ptr_to_packet is more conservative, since it allows smaller + * range. Ex: + * old(off=0,r=10) is equal to cur(off=0,r=20), because + * old(off=0,r=10) means that with range=10 the verifier proceeded + * further and found no issues with the program. Now we're in the same + * spot with cur(off=0,r=20), so we're safe too, since anything further + * will only be looking at most 10 bytes after this pointer. + */ + if (old->off == cur->off && old->range < cur->range) + return true; + + /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0) + * since both cannot be used for packet access and safe(old) + * pointer has smaller off that could be used for further + * 'if (ptr > data_end)' check + * Ex: + * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean + * that we cannot access the packet. + * The safe range is: + * [ptr, ptr + range - off) + * so whenever off >=range, it means no safe bytes from this pointer. + * When comparing old->off <= cur->off, it means that older code + * went with smaller offset and that offset was later + * used to figure out the safe range after 'if (ptr > data_end)' check + * Say, 'old' state was explored like: + * ... R3(off=0, r=0) + * R4 = R3 + 20 + * ... now R4(off=20,r=0) <-- here + * if (R4 > data_end) + * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access. + * ... the code further went all the way to bpf_exit. + * Now the 'cur' state at the mark 'here' has R4(off=30,r=0). + * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier + * goes further, such cur_R4 will give larger safe packet range after + * 'if (R4 > data_end)' and all further insn were already good with r=20, + * so they will be good with r=30 and we can prune the search. + */ + if (old->off <= cur->off && + old->off >= old->range && cur->off >= cur->range) + return true; + + return false; +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -1650,17 +2127,25 @@ err_free: */ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) { + struct reg_state *rold, *rcur; int i; for (i = 0; i < MAX_BPF_REG; i++) { - if (memcmp(&old->regs[i], &cur->regs[i], - sizeof(old->regs[0])) != 0) { - if (old->regs[i].type == NOT_INIT || - (old->regs[i].type == UNKNOWN_VALUE && - cur->regs[i].type != NOT_INIT)) - continue; - return false; - } + rold = &old->regs[i]; + rcur = &cur->regs[i]; + + if (memcmp(rold, rcur, sizeof(*rold)) == 0) + continue; + + if (rold->type == NOT_INIT || + (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) + continue; + + if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && + compare_ptrs_to_packet(rold, rcur)) + continue; + + return false; } for (i = 0; i < MAX_BPF_STACK; i++) { @@ -1759,7 +2244,7 @@ static int do_check(struct verifier_env *env) insn = &insns[insn_idx]; class = BPF_CLASS(insn->code); - if (++insn_processed > 32768) { + if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose("BPF program is too large. Proccessed %d insn\n", insn_processed); return -E2BIG; @@ -1782,7 +2267,7 @@ static int do_check(struct verifier_env *env) if (log_level && do_print_state) { verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env); + print_verifier_state(&env->cur_state); do_print_state = false; } @@ -1994,6 +2479,7 @@ process_bpf_exit: insn_idx++; } + verbose("processed %d insns\n", insn_processed); return 0; } @@ -2111,26 +2597,6 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } -static void adjust_branches(struct bpf_prog *prog, int pos, int delta) -{ - struct bpf_insn *insn = prog->insnsi; - int insn_cnt = prog->len; - int i; - - for (i = 0; i < insn_cnt; i++, insn++) { - if (BPF_CLASS(insn->code) != BPF_JMP || - BPF_OP(insn->code) == BPF_CALL || - BPF_OP(insn->code) == BPF_EXIT) - continue; - - /* adjust offset of jmps if necessary */ - if (i < pos && i + insn->off + 1 > pos) - insn->off += delta; - else if (i > pos + delta && i + insn->off + 1 <= pos + delta) - insn->off -= delta; - } -} - /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ @@ -2140,14 +2606,15 @@ static int convert_ctx_accesses(struct verifier_env *env) int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; - u32 cnt; - int i; enum bpf_access_type type; + int i; if (!env->prog->aux->ops->convert_ctx_access) return 0; for (i = 0; i < insn_cnt; i++, insn++) { + u32 insn_delta, cnt; + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) type = BPF_READ; else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) @@ -2169,34 +2636,18 @@ static int convert_ctx_accesses(struct verifier_env *env) return -EINVAL; } - if (cnt == 1) { - memcpy(insn, insn_buf, sizeof(*insn)); - continue; - } - - /* several new insns need to be inserted. Make room for them */ - insn_cnt += cnt - 1; - new_prog = bpf_prog_realloc(env->prog, - bpf_prog_size(insn_cnt), - GFP_USER); + new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); if (!new_prog) return -ENOMEM; - new_prog->len = insn_cnt; - - memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, - sizeof(*insn) * (insn_cnt - i - cnt)); - - /* copy substitute insns in place of load instruction */ - memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); - - /* adjust branches in the whole program */ - adjust_branches(new_prog, i, cnt - 1); + insn_delta = cnt - 1; /* keep walking new program and skip insns we just inserted */ env->prog = new_prog; - insn = new_prog->insnsi + i + cnt - 1; - i += cnt - 1; + insn = new_prog->insnsi + i + insn_delta; + + insn_cnt += insn_delta; + i += insn_delta; } return 0; diff --git a/kernel/capability.c b/kernel/capability.c index 45432b54d5c6..00411c82dac5 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } +static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) +{ + int capable; + + if (unlikely(!cap_valid(cap))) { + pr_crit("capable() called with invalid cap=%u\n", cap); + BUG(); + } + + capable = audit ? security_capable(current_cred(), ns, cap) : + security_capable_noaudit(current_cred(), ns, cap); + if (capable == 0) { + current->flags |= PF_SUPERPRIV; + return true; + } + return false; +} + /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in @@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap) */ bool ns_capable(struct user_namespace *ns, int cap) { - if (unlikely(!cap_valid(cap))) { - pr_crit("capable() called with invalid cap=%u\n", cap); - BUG(); - } - - if (security_capable(current_cred(), ns, cap) == 0) { - current->flags |= PF_SUPERPRIV; - return true; - } - return false; + return ns_capable_common(ns, cap, true); } EXPORT_SYMBOL(ns_capable); +/** + * ns_capable_noaudit - Determine if the current task has a superior capability + * (unaudited) in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable_noaudit(struct user_namespace *ns, int cap) +{ + return ns_capable_common(ns, cap, false); +} +EXPORT_SYMBOL(ns_capable_noaudit); /** * capable - Determine if the current task has a superior capability in effect diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 86cb5c6e8932..d6b729beba49 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -61,7 +61,7 @@ #include <linux/cpuset.h> #include <linux/proc_ns.h> #include <linux/nsproxy.h> -#include <linux/proc_ns.h> +#include <linux/file.h> #include <net/sock.h> /* @@ -837,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset) static void put_css_set(struct css_set *cset) { + unsigned long flags; + /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an @@ -845,9 +847,9 @@ static void put_css_set(struct css_set *cset) if (atomic_add_unless(&cset->refcount, -1, 1)) return; - spin_lock_bh(&css_set_lock); + spin_lock_irqsave(&css_set_lock, flags); put_css_set_locked(cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, flags); } /* @@ -1070,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (cset) return cset; @@ -1102,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -1128,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, css_get(css); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return cset; } @@ -1158,18 +1160,12 @@ static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); - if (root->hierarchy_id) { - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); - root->hierarchy_id = 0; - } + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } static void cgroup_free_root(struct cgroup_root *root) { if (root) { - /* hierarchy ID should already have been released */ - WARN_ON_ONCE(root->hierarchy_id); - idr_destroy(&root->cgroup_idr); kfree(root); } @@ -1192,7 +1188,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) * Release all the links from cset_links to this hierarchy's * root cgroup */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); @@ -1200,7 +1196,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) kfree(link); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); @@ -1600,11 +1596,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) ss->root = dst_root; css->cgroup = dcgrp; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; @@ -1640,10 +1636,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, if (!buf) return -ENOMEM; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (len >= PATH_MAX) len = -ERANGE; @@ -1897,7 +1893,7 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); if (use_task_css_set_links) goto out_unlock; @@ -1922,8 +1918,12 @@ static void cgroup_enable_task_cg_lists(void) * entry won't be deleted though the process has exited. * Do it while holding siglock so that we don't end up * racing against cgroup_exit(). + * + * Interrupts were already disabled while acquiring + * the css_set_lock, so we do not need to disable it + * again when acquiring the sighand->siglock here. */ - spin_lock_irq(&p->sighand->siglock); + spin_lock(&p->sighand->siglock); if (!(p->flags & PF_EXITING)) { struct css_set *cset = task_css_set(p); @@ -1932,11 +1932,11 @@ static void cgroup_enable_task_cg_lists(void) list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); } - spin_unlock_irq(&p->sighand->siglock); + spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -2043,13 +2043,13 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) * Link the root cgroup in this hierarchy into all the css_set * objects. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); @@ -2209,12 +2209,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } - /* - * We know this subsystem has not yet been bound. Users in a non-init - * user namespace may only mount hierarchies with no bound subsystems, - * i.e. 'none,name=user1' - */ - if (!opts.none && !capable(CAP_SYS_ADMIN)) { + /* Hierarchies may only be created in the initial cgroup namespace. */ + if (ns != &init_cgroup_ns) { ret = -EPERM; goto out_unlock; } @@ -2256,11 +2252,11 @@ out_mount: struct cgroup *cgrp; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ns->root_cset, root); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); @@ -2337,11 +2333,11 @@ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, char *ret; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); return ret; @@ -2369,7 +2365,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) char *path = NULL; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); @@ -2382,7 +2378,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) path = buf; } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); return path; } @@ -2557,7 +2553,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, * the new cgroup. There are no failure cases after here, so this * is the commit point. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); @@ -2568,7 +2564,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, put_css_set_locked(from_cset); } } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. @@ -2597,13 +2593,13 @@ out_cancel_attach: } } while_each_subsys_mask(); out_release_tset: - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return ret; } @@ -2634,7 +2630,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) lockdep_assert_held(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; @@ -2642,7 +2638,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /** @@ -2783,7 +2779,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2792,7 +2788,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, break; } while_each_thread(leader, task); rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return cgroup_taskset_migrate(&tset, root); } @@ -2816,7 +2812,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return -EBUSY; /* look up all src csets */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2826,7 +2822,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, break; } while_each_thread(leader, task); rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&preloaded_csets); @@ -2859,9 +2855,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *cgrp; struct inode *inode; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); while (!cgroup_is_descendant(dst_cgrp, cgrp)) cgrp = cgroup_parent(cgrp); @@ -2956,20 +2952,22 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); for_each_root(root) { struct cgroup *from_cgrp; if (root == &cgrp_dfl_root) continue; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) break; } + percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return retval; @@ -3080,7 +3078,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) percpu_down_write(&cgroup_threadgroup_rwsem); /* look up all csses currently attached to @cgrp's subtree */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; @@ -3088,14 +3086,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) cgroup_migrate_add_src(link->cset, dsct, &preloaded_csets); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_finish; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { struct task_struct *task, *ntask; @@ -3107,7 +3105,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_taskset_add(task, &tset); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); ret = cgroup_taskset_migrate(&tset, cgrp->root); out_finish: @@ -3448,9 +3446,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ - if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { - ret = -EBUSY; - goto out_unlock; + if (enable && cgroup_parent(cgrp)) { + struct cgrp_cset_link *link; + + /* + * Because namespaces pin csets too, @cgrp->cset_links + * might not be empty even when @cgrp is empty. Walk and + * verify each cset. + */ + spin_lock_irq(&css_set_lock); + + ret = 0; + list_for_each_entry(link, &cgrp->cset_links, cset_link) { + if (css_set_populated(link->cset)) { + ret = -EBUSY; + break; + } + } + + spin_unlock_irq(&css_set_lock); + + if (ret) + goto out_unlock; } /* save and update control masks and prepare csses */ @@ -3901,17 +3918,19 @@ void cgroup_file_notify(struct cgroup_file *cfile) * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * - * Return the number of tasks in the cgroup. + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. */ static int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return count; } @@ -4249,7 +4268,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, memset(it, 0, sizeof(*it)); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); it->ss = css->ss; @@ -4262,7 +4281,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, css_task_iter_advance_css_set(it); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /** @@ -4280,7 +4299,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) it->cur_task = NULL; } - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, @@ -4289,7 +4308,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) css_task_iter_advance(it); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return it->cur_task; } @@ -4303,10 +4322,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) void css_task_iter_end(struct css_task_iter *it) { if (it->cur_cset) { - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } if (it->cur_task) @@ -4337,11 +4356,13 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + /* all tasks in @from are being moved, all csets are source */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) @@ -4365,6 +4386,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return ret; } @@ -5063,6 +5085,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; + css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; @@ -5139,6 +5162,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(parent_css); + if (!css) + css = ERR_PTR(-ENOMEM); if (IS_ERR(css)) return css; @@ -5150,7 +5175,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) - goto err_free_percpu_ref; + goto err_free_css; css->id = err; /* @css is ready to be brought online now, make it visible */ @@ -5174,9 +5199,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); - cgroup_idr_remove(&ss->css_idr, css->id); -err_free_percpu_ref: - percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); return ERR_PTR(err); @@ -5451,10 +5473,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ cgrp->self.flags &= ~CSS_ONLINE; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) link->cset->dead = true; - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) @@ -5725,7 +5747,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, goto out; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; @@ -5778,7 +5800,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, retval = 0; out_unlock: - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); kfree(buf); out: @@ -5923,13 +5945,13 @@ void cgroup_post_fork(struct task_struct *child) if (use_task_css_set_links) { struct css_set *cset; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); css_set_move_task(child, NULL, cset, false); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /* @@ -5974,9 +5996,9 @@ void cgroup_exit(struct task_struct *tsk) cset = task_css_set(tsk); if (!list_empty(&tsk->cg_list)) { - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); } @@ -6044,9 +6066,9 @@ static void cgroup_release_agent(struct work_struct *work) if (!pathbuf || !agentbuf) goto out; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (!path) goto out; @@ -6168,7 +6190,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); - return id > 0 ? idr_find(&ss->css_idr, id) : NULL; + return idr_find(&ss->css_idr, id); } /** @@ -6205,6 +6227,40 @@ struct cgroup *cgroup_get_from_path(const char *path) } EXPORT_SYMBOL_GPL(cgroup_get_from_path); +/** + * cgroup_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup2_dir) + * + * Find the cgroup from a fd which should be obtained + * by opening a cgroup directory. Returns a pointer to the + * cgroup on success. ERR_PTR is returned if the cgroup + * cannot be found. + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + struct file *f; + + f = fget_raw(fd); + if (!f) + return ERR_PTR(-EBADF); + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + fput(f); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_fd); + /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. @@ -6235,6 +6291,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) if (cgroup_sk_alloc_disabled) return; + /* Socket clone path */ + if (skcd->val) { + cgroup_get(sock_cgroup_ptr(skcd)); + return; + } + rcu_read_lock(); while (true) { @@ -6305,14 +6367,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); - + /* It is not safe to take cgroup_mutex here */ + spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); - - spin_unlock_bh(&css_set_lock); - mutex_unlock(&cgroup_mutex); + spin_unlock_irq(&css_set_lock); new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { @@ -6435,7 +6494,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) if (!name_buf) return -ENOMEM; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { @@ -6446,7 +6505,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) c->root->hierarchy_id, name_buf); } rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); kfree(name_buf); return 0; } @@ -6457,7 +6516,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; @@ -6480,7 +6539,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) overflow: seq_puts(seq, " ...\n"); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return 0; } diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 303097b37429..2bd673783f1a 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -49,6 +49,12 @@ struct pids_cgroup { */ atomic64_t counter; int64_t limit; + + /* Handle for "pids.events" */ + struct cgroup_file events_file; + + /* Number of times fork failed because limit was hit. */ + atomic64_t events_limit; }; static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) @@ -72,6 +78,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent) pids->limit = PIDS_MAX; atomic64_set(&pids->counter, 0); + atomic64_set(&pids->events_limit, 0); return &pids->css; } @@ -213,10 +220,21 @@ static int pids_can_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; + int err; css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - return pids_try_charge(pids, 1); + err = pids_try_charge(pids, 1); + if (err) { + /* Only log the first time events_limit is incremented. */ + if (atomic64_inc_return(&pids->events_limit) == 1) { + pr_info("cgroup: fork rejected by pids controller in "); + pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id)); + pr_cont("\n"); + } + cgroup_file_notify(&pids->events_file); + } + return err; } static void pids_cancel_fork(struct task_struct *task) @@ -288,6 +306,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css, return atomic64_read(&pids->counter); } +static int pids_events_show(struct seq_file *sf, void *v) +{ + struct pids_cgroup *pids = css_pids(seq_css(sf)); + + seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); + return 0; +} + static struct cftype pids_files[] = { { .name = "max", @@ -300,6 +326,12 @@ static struct cftype pids_files[] = { .read_s64 = pids_current_read, .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "events", + .seq_show = pids_events_show, + .file_offset = offsetof(struct pids_cgroup, events_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, { } /* terminate */ }; diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config new file mode 100644 index 000000000000..9f748ed7bea8 --- /dev/null +++ b/kernel/configs/android-base.config @@ -0,0 +1,152 @@ +# KEEP ALPHABETICALLY SORTED +# CONFIG_DEVKMEM is not set +# CONFIG_DEVMEM is not set +# CONFIG_INET_LRO is not set +# CONFIG_MODULES is not set +# CONFIG_OABI_COMPAT is not set +# CONFIG_SYSVIPC is not set +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_ARMV8_DEPRECATED=y +CONFIG_ASHMEM=y +CONFIG_AUDIT=y +CONFIG_BLK_DEV_DM=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_SCHED=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_DM_CRYPT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_EMBEDDED=y +CONFIG_FB=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_INET=y +CONFIG_INET_DIAG_DESTROY=y +CONFIG_INET_ESP=y +CONFIG_INET_XFRM_MODE_TUNNEL=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IPV6=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_IPV6_PRIVACY=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_NAT=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP_NF_TARGET_NETMAP=y +CONFIG_IP_NF_TARGET_REDIRECT=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_NET=y +CONFIG_NETDEVICES=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_TPROXY=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y +CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NET_CLS_ACT=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_KEY=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_NAT=y +CONFIG_NO_HZ=y +CONFIG_PACKET=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PREEMPT=y +CONFIG_QUOTA=y +CONFIG_RTC_CLASS=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_SETEND_EMULATION=y +CONFIG_STAGING=y +CONFIG_SWP_EMULATION=y +CONFIG_SYNC=y +CONFIG_TUN=y +CONFIG_UNIX=y +CONFIG_USB_GADGET=y +CONFIG_USB_CONFIGFS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_XFRM_USER=y diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config new file mode 100644 index 000000000000..e3b953e966d2 --- /dev/null +++ b/kernel/configs/android-recommended.config @@ -0,0 +1,121 @@ +# KEEP ALPHABETICALLY SORTED +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +# CONFIG_INPUT_MOUSE is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_NF_CONNTRACK_SIP is not set +# CONFIG_PM_WAKELOCKS_GC is not set +# CONFIG_VT is not set +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_COMPACTION=y +CONFIG_DEBUG_RODATA=y +CONFIG_DM_UEVENT=y +CONFIG_DRAGONRISE_FF=y +CONFIG_ENABLE_DEFAULT_TRACERS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_FUSE_FS=y +CONFIG_GREENASIA_FF=y +CONFIG_HIDRAW=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_EZKEY=y +CONFIG_HID_GREENASIA=y +CONFIG_HID_GYRATION=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WACOM=y +CONFIG_HID_WALTOP=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_GPIO=y +CONFIG_INPUT_JOYSTICK=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_TABLET=y +CONFIG_INPUT_UINPUT=y +CONFIG_ION=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KSM=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGITECH_FF=y +CONFIG_MD=y +CONFIG_MEDIA_SUPPORT=y +CONFIG_MSDOS_FS=y +CONFIG_PANIC_TIMEOUT=5 +CONFIG_PANTHERLORD_FF=y +CONFIG_PERF_EVENTS=y +CONFIG_PM_DEBUG=y +CONFIG_PM_RUNTIME=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +CONFIG_POWER_SUPPLY=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +CONFIG_SCHEDSTATS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_SND=y +CONFIG_SOUND=y +CONFIG_SUSPEND_TIME=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_TASK_XACCT=y +CONFIG_TIMER_STATS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_UHID=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_USBNET=y +CONFIG_VFAT_FS=y diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -1,4 +1,12 @@ +# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set CONFIG_OPTIMIZE_INLINING=y +# CONFIG_SLAB is not set +# CONFIG_SLUB is not set CONFIG_SLOB=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 3e3f6e49eabb..341bf80f80bd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -517,6 +517,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, if (!cpu_online(cpu)) return 0; + /* + * If we are up and running, use the hotplug thread. For early calls + * we invoke the thread function directly. + */ + if (!st->thread) + return cpuhp_invoke_callback(cpu, state, cb); + st->cb_state = state; st->cb = cb; /* @@ -703,21 +710,6 @@ static int takedown_cpu(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; - /* - * By now we've cleared cpu_active_mask, wait for all preempt-disabled - * and RCU users of this state to go away such that all new such users - * will observe it. - * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. - * - * Do sync before park smpboot threads to take care the rcu boost case. - */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); - /* Park the smpboot threads */ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); smpboot_park_threads(cpu); @@ -923,8 +915,6 @@ void cpuhp_online_idle(enum cpuhp_state state) st->state = CPUHP_AP_ONLINE_IDLE; - /* The cpu is marked online, set it active now */ - set_cpu_active(cpu, true); /* Unpark the stopper thread and the hotplug thread of this cpu */ stop_machine_unpark(cpu); kthread_unpark(st->thread); @@ -1190,6 +1180,31 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = NULL, .cant_stop = true, }, + [CPUHP_PERF_PREPARE] = { + .name = "perf prepare", + .startup = perf_event_init_cpu, + .teardown = perf_event_exit_cpu, + }, + [CPUHP_WORKQUEUE_PREP] = { + .name = "workqueue prepare", + .startup = workqueue_prepare_cpu, + .teardown = NULL, + }, + [CPUHP_HRTIMERS_PREPARE] = { + .name = "hrtimers prepare", + .startup = hrtimers_prepare_cpu, + .teardown = hrtimers_dead_cpu, + }, + [CPUHP_SMPCFD_PREPARE] = { + .name = "SMPCFD prepare", + .startup = smpcfd_prepare_cpu, + .teardown = smpcfd_dead_cpu, + }, + [CPUHP_RCUTREE_PREP] = { + .name = "RCU-tree prepare", + .startup = rcutree_prepare_cpu, + .teardown = rcutree_dead_cpu, + }, /* * Preparatory and dead notifiers. Will be replaced once the notifiers * are converted to states. @@ -1201,6 +1216,16 @@ static struct cpuhp_step cpuhp_bp_states[] = { .skip_onerr = true, .cant_stop = true, }, + /* + * On the tear-down path, timers_dead_cpu() must be invoked + * before blk_mq_queue_reinit_notify() from notify_dead(), + * otherwise a RCU stall occurs. + */ + [CPUHP_TIMERS_DEAD] = { + .name = "timers dead", + .startup = NULL, + .teardown = timers_dead_cpu, + }, /* Kicks the plugged cpu into life */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", @@ -1208,6 +1233,10 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = NULL, .cant_stop = true, }, + [CPUHP_AP_SMPCFD_DYING] = { + .startup = NULL, + .teardown = smpcfd_dying_cpu, + }, /* * Handled on controll processor until the plugged processor manages * this itself. @@ -1218,6 +1247,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = takedown_cpu, .cant_stop = true, }, +#else + [CPUHP_BRINGUP_CPU] = { }, #endif }; @@ -1236,6 +1267,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { .name = "ap:offline", .cant_stop = true, }, + /* First state is scheduler control. Interrupts are disabled */ + [CPUHP_AP_SCHED_STARTING] = { + .name = "sched:starting", + .startup = sched_cpu_starting, + .teardown = sched_cpu_dying, + }, + [CPUHP_AP_RCUTREE_DYING] = { + .startup = NULL, + .teardown = rcutree_dying_cpu, + }, /* * Low level startup/teardown notifiers. Run with interrupts * disabled. Will be removed once the notifiers are converted to @@ -1259,6 +1300,22 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup = smpboot_unpark_threads, .teardown = NULL, }, + [CPUHP_AP_PERF_ONLINE] = { + .name = "perf online", + .startup = perf_event_init_cpu, + .teardown = perf_event_exit_cpu, + }, + [CPUHP_AP_WORKQUEUE_ONLINE] = { + .name = "workqueue online", + .startup = workqueue_online_cpu, + .teardown = workqueue_offline_cpu, + }, + [CPUHP_AP_RCUTREE_ONLINE] = { + .name = "RCU-tree online", + .startup = rcutree_online_cpu, + .teardown = rcutree_offline_cpu, + }, + /* * Online/down_prepare notifiers. Will be removed once the notifiers * are converted to states. @@ -1274,6 +1331,15 @@ static struct cpuhp_step cpuhp_ap_states[] = { * The dynamically registered state space is here */ +#ifdef CONFIG_SMP + /* Last state is scheduler control setting the cpu active */ + [CPUHP_AP_ACTIVE] = { + .name = "sched:active", + .startup = sched_cpu_activate, + .teardown = sched_cpu_deactivate, + }, +#endif + /* CPU is fully up and running. */ [CPUHP_ONLINE] = { .name = "online", diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1902956baba1..2b4c20ab5bbe 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,7 +61,7 @@ #include <linux/cgroup.h> #include <linux/wait.h> -struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ @@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = { /* * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. The top - * cpuset always has some cpus online. + * until we find one that does have some online cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. @@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = { */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { cs = parent_cs(cs); + if (unlikely(!cs)) { + /* + * The top cpuset doesn't have any online cpu as a + * consequence of a race between cpuset_hotplug_work + * and cpu hotplug notifier. But we know the top + * cpuset's effective_cpus is on its way to to be + * identical to cpu_online_mask. + */ + cpumask_copy(pmask, cpu_online_mask); + return; + } + } cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); } @@ -1034,15 +1045,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return; - if (current->flags & PF_EXITING) /* Let dying task have memory */ - return; - task_lock(tsk); /* * Determine if a loop is necessary if another thread is doing @@ -2078,6 +2080,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +/* + * Make sure the new task conform to the current state of its parent, + * which could have been changed by cpuset just after it inherits the + * state from the parent and before it sits on the cgroup's task list. + */ +static void cpuset_fork(struct task_struct *task) +{ + if (task_css_is_root(task, cpuset_cgrp_id)) + return; + + set_cpus_allowed_ptr(task, ¤t->cpus_allowed); + task->mems_allowed = current->mems_allowed; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, @@ -2088,6 +2104,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .fork = cpuset_fork, .legacy_cftypes = files, .early_init = true, }; @@ -2528,27 +2545,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -int __cpuset_node_allowed(int node, gfp_t gfp_mask) +bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) - return 1; + return true; if (node_isset(node, current->mems_allowed)) - return 1; + return true; /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; + return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return 0; + return false; if (current->flags & PF_EXITING) /* Let dying task have memory */ - return 1; + return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); @@ -2591,13 +2608,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) static int cpuset_spread_node(int *rotor) { - int node; - - node = next_node(*rotor, current->mems_allowed); - if (node == MAX_NUMNODES) - node = first_node(current->mems_allowed); - *rotor = node; - return node; + return *rotor = next_node_in(*rotor, current->mems_allowed); } int cpuset_mem_spread_node(void) diff --git a/kernel/cred.c b/kernel/cred.c index 0c0cd8a62285..5f264fb5737d 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx); */ int set_create_files_as(struct cred *new, struct inode *inode) { + if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 343c22f5e867..e9fdb5203de5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -18,18 +18,28 @@ struct callchain_cpus_entries { struct perf_callchain_entry *cpu_entries[0]; }; +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; +int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; + +static inline size_t perf_callchain_entry__sizeof(void) +{ + return (sizeof(struct perf_callchain_entry) + + sizeof(__u64) * (sysctl_perf_event_max_stack + + sysctl_perf_event_max_contexts_per_stack)); +} + static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, +__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } -__weak void perf_callchain_user(struct perf_callchain_entry *entry, +__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } @@ -73,7 +83,7 @@ static int alloc_callchain_buffers(void) if (!entries) return -ENOMEM; - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, @@ -94,7 +104,7 @@ fail: return -ENOMEM; } -int get_callchain_buffers(void) +int get_callchain_buffers(int event_max_stack) { int err = 0; int count; @@ -111,6 +121,15 @@ int get_callchain_buffers(void) /* If the allocation failed, give up */ if (!callchain_cpus_entries) err = -ENOMEM; + /* + * If requesting per event more than the global cap, + * return a different error to help userspace figure + * this out. + * + * And also do it here so that we have &callchain_mutex held. + */ + if (event_max_stack > sysctl_perf_event_max_stack) + err = -EOVERFLOW; goto exit; } @@ -147,7 +166,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) cpu = smp_processor_id(); - return &entries->cpu_entries[cpu][*rctx]; + return (((void *)entries->cpu_entries[cpu]) + + (*rctx * perf_callchain_entry__sizeof())); } static void @@ -163,18 +183,20 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) bool user = !event->attr.exclude_callchain_user; /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; if (!kernel && !user) return NULL; - return get_perf_callchain(regs, 0, kernel, user, crosstask, true); + return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, - bool crosstask, bool add_mark) + u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; + struct perf_callchain_entry_ctx ctx; int rctx; entry = get_callchain_entry(&rctx); @@ -184,12 +206,16 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (!entry) goto exit_put; - entry->nr = init_nr; + ctx.entry = entry; + ctx.max_stack = max_stack; + ctx.nr = entry->nr = init_nr; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); + perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(&ctx, regs); } if (user) { @@ -205,8 +231,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, goto exit_put; if (add_mark) - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + perf_callchain_user(&ctx, regs); } } @@ -215,3 +241,30 @@ exit_put: return entry; } + +/* + * Used for sysctl_perf_event_max_stack and + * sysctl_perf_event_max_contexts_per_stack. + */ +int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *value = table->data; + int new_value = *value, ret; + struct ctl_table new_table = *table; + + new_table.data = &new_value; + ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + mutex_lock(&callchain_mutex); + if (atomic_read(&nr_callchain_events)) + ret = -EBUSY; + else + *value = new_value; + + mutex_unlock(&callchain_mutex); + + return ret; +} diff --git a/kernel/events/core.c b/kernel/events/core.c index c0ded2416615..fc9bb2225291 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -44,6 +44,8 @@ #include <linux/compat.h> #include <linux/bpf.h> #include <linux/filter.h> +#include <linux/namei.h> +#include <linux/parser.h> #include "internal.h" @@ -240,18 +242,6 @@ unlock: return ret; } -static void event_function_local(struct perf_event *event, event_f func, void *data) -{ - struct event_function_struct efs = { - .event = event, - .func = func, - .data = data, - }; - - int ret = event_function(&efs); - WARN_ON_ONCE(ret); -} - static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; @@ -301,6 +291,54 @@ again: raw_spin_unlock_irq(&ctx->lock); } +/* + * Similar to event_function_call() + event_function(), but hard assumes IRQs + * are already disabled and we're on the right CPU. + */ +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct task_struct *task = READ_ONCE(ctx->task); + struct perf_event_context *task_ctx = NULL; + + WARN_ON_ONCE(!irqs_disabled()); + + if (task) { + if (task == TASK_TOMBSTONE) + return; + + task_ctx = ctx; + } + + perf_ctx_lock(cpuctx, task_ctx); + + task = ctx->task; + if (task == TASK_TOMBSTONE) + goto unlock; + + if (task) { + /* + * We must be either inactive or active and the right task, + * otherwise we're screwed, since we cannot IPI to somewhere + * else. + */ + if (ctx->is_active) { + if (WARN_ON_ONCE(task != current)) + goto unlock; + + if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) + goto unlock; + } + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + func(event, cpuctx, ctx, data); +unlock: + perf_ctx_unlock(cpuctx, task_ctx); +} + #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ @@ -333,6 +371,7 @@ static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); +static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -394,6 +433,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* + * If throttling is disabled don't allow the write: + */ + if (sysctl_perf_cpu_time_max_percent == 100 || + sysctl_perf_cpu_time_max_percent == 0) + return -EINVAL; + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); @@ -438,7 +484,7 @@ static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { - printk_ratelimited(KERN_WARNING + printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, @@ -833,6 +879,32 @@ perf_cgroup_mark_enabled(struct perf_event *event, } } } + +/* + * Update cpuctx->cgrp so that it is set when first cgroup event is added and + * cleared when last cgroup event is removed. + */ +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ + struct perf_cpu_context *cpuctx; + + if (!is_cgroup_event(event)) + return; + + if (add && ctx->nr_cgroups++) + return; + else if (!add && --ctx->nr_cgroups) + return; + /* + * Because cgroup events are always per-cpu events, + * this will always be called from the right CPU. + */ + cpuctx = __get_cpu_context(ctx); + cpuctx->cgrp = add ? event->cgrp : NULL; +} + #else /* !CONFIG_CGROUP_PERF */ static inline bool @@ -910,6 +982,13 @@ perf_cgroup_mark_enabled(struct perf_event *event, struct perf_event_context *ctx) { } + +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ +} + #endif /* @@ -1382,6 +1461,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); @@ -1402,8 +1482,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } - if (is_cgroup_event(event)) - ctx->nr_cgroups++; + list_update_cgroup_event(event, ctx, true); list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; @@ -1571,8 +1650,6 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx; - WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); @@ -1584,20 +1661,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) { - ctx->nr_cgroups--; - /* - * Because cgroup events are always per-cpu events, this will - * always be called from the right CPU. - */ - cpuctx = __get_cpu_context(ctx); - /* - * If there are no more cgroup events then clear cgrp to avoid - * stale pointer in update_cgrp_time_from_cpuctx(). - */ - if (!ctx->nr_cgroups) - cpuctx->cgrp = NULL; - } + list_update_cgroup_event(event, ctx, false); ctx->nr_events--; if (event->attr.inherit_stat) @@ -1676,17 +1740,38 @@ static bool is_orphaned_event(struct perf_event *event) return event->state == PERF_EVENT_STATE_DEAD; } -static inline int pmu_filter_match(struct perf_event *event) +static inline int __pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; return pmu->filter_match ? pmu->filter_match(event) : 1; } +/* + * Check whether we should attempt to schedule an event group based on + * PMU-specific filtering. An event group can consist of HW and SW events, + * potentially with a SW leader, so we must check all the filters, to + * determine whether a group is schedulable: + */ +static inline int pmu_filter_match(struct perf_event *event) +{ + struct perf_event *child; + + if (!__pmu_filter_match(event)) + return 0; + + list_for_each_entry(child, &event->sibling_list, group_entry) { + if (!__pmu_filter_match(child)) + return 0; + } + + return 1; +} + static inline int event_filter_match(struct perf_event *event) { - return (event->cpu == -1 || event->cpu == smp_processor_id()) - && perf_cgroup_match(event) && pmu_filter_match(event); + return (event->cpu == -1 || event->cpu == smp_processor_id()) && + perf_cgroup_match(event) && pmu_filter_match(event); } static void @@ -1706,8 +1791,8 @@ event_sched_out(struct perf_event *event, * maintained, otherwise bogus information is return * via read() for time_enabled, time_running: */ - if (event->state == PERF_EVENT_STATE_INACTIVE - && !event_filter_match(event)) { + if (event->state == PERF_EVENT_STATE_INACTIVE && + !event_filter_match(event)) { delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; @@ -1927,8 +2012,13 @@ event_sched_in(struct perf_event *event, if (event->state <= PERF_EVENT_STATE_OFF) return 0; - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); + WRITE_ONCE(event->oncpu, smp_processor_id()); + /* + * Order event::oncpu write to happen before the ACTIVE state + * is visible. + */ + smp_wmb(); + WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2200,10 +2290,15 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); - event->ctx = ctx; if (event->cpu != -1) event->cpu = cpu; + /* + * Ensures that if we can observe event->ctx, both the event and ctx + * will be 'complete'. See perf_iterate_sb_cpu(). + */ + smp_store_release(&event->ctx, ctx); + if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; @@ -2360,6 +2455,112 @@ void perf_event_enable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_enable); +struct stop_event_data { + struct perf_event *event; + unsigned int restart; +}; + +static int __perf_event_stop(void *info) +{ + struct stop_event_data *sd = info; + struct perf_event *event = sd->event; + + /* if it's already INACTIVE, do nothing */ + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * There is a window with interrupts enabled before we get here, + * so we need to check again lest we try to stop another CPU's event. + */ + if (READ_ONCE(event->oncpu) != smp_processor_id()) + return -EAGAIN; + + event->pmu->stop(event, PERF_EF_UPDATE); + + /* + * May race with the actual stop (through perf_pmu_output_stop()), + * but it is only used for events with AUX ring buffer, and such + * events will refuse to restart because of rb::aux_mmap_count==0, + * see comments in perf_aux_output_begin(). + * + * Since this is happening on a event-local CPU, no trace is lost + * while restarting. + */ + if (sd->restart) + event->pmu->start(event, PERF_EF_START); + + return 0; +} + +static int perf_event_stop(struct perf_event *event, int restart) +{ + struct stop_event_data sd = { + .event = event, + .restart = restart, + }; + int ret = 0; + + do { + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * We only want to restart ACTIVE events, so if the event goes + * inactive here (event->oncpu==-1), there's nothing more to do; + * fall through with ret==-ENXIO. + */ + ret = cpu_function_call(READ_ONCE(event->oncpu), + __perf_event_stop, &sd); + } while (ret == -EAGAIN); + + return ret; +} + +/* + * In order to contain the amount of racy and tricky in the address filter + * configuration management, it is a two part process: + * + * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, + * we update the addresses of corresponding vmas in + * event::addr_filters_offs array and bump the event::addr_filters_gen; + * (p2) when an event is scheduled in (pmu::add), it calls + * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() + * if the generation has changed since the previous call. + * + * If (p1) happens while the event is active, we restart it to force (p2). + * + * (1) perf_addr_filters_apply(): adjusting filters' offsets based on + * pre-existing mappings, called once when new filters arrive via SET_FILTER + * ioctl; + * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly + * registered mapping, called for every new mmap(), with mm::mmap_sem down + * for reading; + * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process + * of exec. + */ +void perf_event_addr_filters_sync(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + + if (!has_addr_filter(event)) + return; + + raw_spin_lock(&ifh->lock); + if (event->addr_filters_gen != event->hw.addr_filters_gen) { + event->pmu->addr_filters_sync(event); + event->hw.addr_filters_gen = event->addr_filters_gen; + } + raw_spin_unlock(&ifh->lock); +} +EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); + static int _perf_event_refresh(struct perf_event *event, int refresh) { /* @@ -3209,16 +3410,6 @@ out: put_ctx(clone_ctx); } -void perf_event_exec(void) -{ - int ctxn; - - rcu_read_lock(); - for_each_task_context_nr(ctxn) - perf_event_enable_on_exec(ctxn); - rcu_read_unlock(); -} - struct perf_read_data { struct perf_event *event; bool group; @@ -3358,8 +3549,17 @@ static int perf_event_read(struct perf_event *event, bool group) .group = group, .ret = 0, }; - smp_call_function_single(event->oncpu, - __perf_event_read, &data, 1); + /* + * Purposely ignore the smp_call_function_single() return + * value. + * + * If event->oncpu isn't a valid CPU it means the event got + * scheduled out and that will have updated the event count. + * + * Therefore, either way, we'll have an up-to-date event count + * after this. + */ + (void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; @@ -3562,6 +3762,39 @@ static void free_event_rcu(struct rcu_head *head) static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); +static void detach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_del_rcu(&event->sb_list); + raw_spin_unlock(&pel->lock); +} + +static bool is_sb_event(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + + if (event->parent) + return false; + + if (event->attach_state & PERF_ATTACH_TASK) + return false; + + if (attr->mmap || attr->mmap_data || attr->mmap2 || + attr->comm || attr->comm_exec || + attr->task || + attr->context_switch) + return true; + return false; +} + +static void unaccount_pmu_sb_event(struct perf_event *event) +{ + if (is_sb_event(event)) + detach_sb_event(event); +} + static void unaccount_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -3625,6 +3858,8 @@ static void unaccount_event(struct perf_event *event) } unaccount_event_cpu(event, event->cpu); + + unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) @@ -3694,7 +3929,7 @@ static void exclusive_event_destroy(struct perf_event *event) static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { - if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && + if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) @@ -3720,6 +3955,9 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } +static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head); + static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3747,6 +3985,8 @@ static void _free_event(struct perf_event *event) } perf_event_free_bpf_prog(event); + perf_addr_filters_splice(event, NULL); + kfree(event->addr_filters_offs); if (event->destroy) event->destroy(event); @@ -3754,10 +3994,8 @@ static void _free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } + exclusive_event_destroy(event); + module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } @@ -4343,6 +4581,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg); + case PERF_EVENT_IOC_PAUSE_OUTPUT: { + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb || !rb->nr_pages) { + rcu_read_unlock(); + return -EINVAL; + } + rb_toggle_paused(rb, !!arg); + rcu_read_unlock(); + return 0; + } default: return -ENOTTY; } @@ -4594,6 +4845,19 @@ static void ring_buffer_attach(struct perf_event *event, spin_unlock_irqrestore(&rb->event_lock, flags); } + /* + * Avoid racing with perf_mmap_close(AUX): stop the event + * before swizzling the event::rb pointer; if it's getting + * unmapped, its aux_mmap_count will be 0 and it won't + * restart. See the comment in __perf_pmu_output_stop(). + * + * Data will inevitably be lost when set_output is done in + * mid-air, but then again, whoever does it like this is + * not in for the data anyway. + */ + if (has_aux(event)) + perf_event_stop(event, 0); + rcu_assign_pointer(event->rb, rb); if (old_rb) { @@ -4659,6 +4923,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) event->pmu->event_mapped(event); } +static void perf_pmu_output_stop(struct perf_event *event); + /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). @@ -4686,10 +4952,22 @@ static void perf_mmap_close(struct vm_area_struct *vma) */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + /* + * Stop all AUX events that are writing to this buffer, + * so that we can free its AUX pages and corresponding PMU + * data. Note that after rb::aux_mmap_count dropped to zero, + * they won't start any more (see perf_aux_output_begin()). + */ + perf_pmu_output_stop(event); + + /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + /* this has to be the last one */ rb_free_aux(rb); + WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + mutex_unlock(&event->mmap_mutex); } @@ -5420,16 +5698,26 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - u32 raw_size = data->raw->size; - u32 real_size = round_up(raw_size + sizeof(u32), - sizeof(u64)) - sizeof(u32); - u64 zero = 0; - - perf_output_put(handle, real_size); - __output_copy(handle, data->raw->data, raw_size); - if (real_size - raw_size) - __output_copy(handle, &zero, real_size - raw_size); + struct perf_raw_record *raw = data->raw; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + + perf_output_put(handle, raw->size); + do { + if (frag->copy) { + __output_custom(handle, frag->copy, + frag->data, frag->size); + } else { + __output_copy(handle, frag->data, + frag->size); + } + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + if (frag->pad) + __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; @@ -5554,14 +5842,28 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); + struct perf_raw_record *raw = data->raw; + int size; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + u32 sum = 0; + + do { + sum += frag->size; + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + + size = round_up(sum + sizeof(u32), sizeof(u64)); + raw->size = size - sizeof(u32); + frag->pad = raw->size - sum; + } else { + size = sizeof(u64); + } - header->size += round_up(size, sizeof(u64)); + header->size += size; } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { @@ -5630,9 +5932,13 @@ void perf_prepare_sample(struct perf_event_header *header, } } -void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +static void __always_inline +__perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs, + int (*output_begin)(struct perf_output_handle *, + struct perf_event *, + unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; @@ -5642,7 +5948,7 @@ void perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size)) + if (output_begin(&handle, event, header.size)) goto exit; perf_output_sample(&handle, &header, data, event); @@ -5653,6 +5959,30 @@ exit: rcu_read_unlock(); } +void +perf_event_output_forward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_forward); +} + +void +perf_event_output_backward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_backward); +} + +void +perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin); +} + /* * read event_id */ @@ -5693,16 +6023,41 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } -typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); +typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void -perf_event_aux_ctx(struct perf_event_context *ctx, - perf_event_aux_output_cb output, - void *data) +perf_iterate_ctx(struct perf_event_context *ctx, + perf_iterate_f output, + void *data, bool all) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (!all) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + } + + output(event, data); + } +} + +static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) +{ + struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); + struct perf_event *event; + + list_for_each_entry_rcu(event, &pel->list, sb_list) { + /* + * Skip events that are not fully formed yet; ensure that + * if we observe event->ctx, both event and ctx will be + * complete enough. See perf_install_in_context(). + */ + if (!smp_load_acquire(&event->ctx)) + continue; + if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) @@ -5711,51 +6066,174 @@ perf_event_aux_ctx(struct perf_event_context *ctx, } } +/* + * Iterate all events that need to receive side-band events. + * + * For new callers; ensure that account_pmu_sb_event() includes + * your event, otherwise it might not get delivered. + */ static void -perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, - struct perf_event_context *task_ctx) +perf_iterate_sb(perf_iterate_f output, void *data, + struct perf_event_context *task_ctx) { + struct perf_event_context *ctx; + int ctxn; + rcu_read_lock(); preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data); + + /* + * If we have task_ctx != NULL we only notify the task context itself. + * The task_ctx is set only for EXIT events before releasing task + * context. + */ + if (task_ctx) { + perf_iterate_ctx(task_ctx, output, data, false); + goto done; + } + + perf_iterate_sb_cpu(output, data); + + for_each_task_context_nr(ctxn) { + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_iterate_ctx(ctx, output, data, false); + } +done: preempt_enable(); rcu_read_unlock(); } -static void -perf_event_aux(perf_event_aux_output_cb output, void *data, - struct perf_event_context *task_ctx) +/* + * Clear all file-based filters at exec, they'll have to be + * re-instated when/if these objects are mmapped again. + */ +static void perf_event_addr_filters_exec(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + unsigned long flags; + + if (!has_addr_filter(event)) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (filter->inode) { + event->addr_filters_offs[count] = 0; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_stop(event, 1); +} + +void perf_event_exec(void) { - struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; int ctxn; + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = current->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctxn); + + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, + true); + } + rcu_read_unlock(); +} + +struct remote_output { + struct ring_buffer *rb; + int err; +}; + +static void __perf_event_output_stop(struct perf_event *event, void *data) +{ + struct perf_event *parent = event->parent; + struct remote_output *ro = data; + struct ring_buffer *rb = ro->rb; + struct stop_event_data sd = { + .event = event, + }; + + if (!has_aux(event)) + return; + + if (!parent) + parent = event; + /* - * If we have task_ctx != NULL we only notify - * the task context itself. The task_ctx is set - * only for EXIT events before releasing task - * context. + * In case of inheritance, it will be the parent that links to the + * ring-buffer, but it will be the child that's actually using it. + * + * We are using event::rb to determine if the event should be stopped, + * however this may race with ring_buffer_attach() (through set_output), + * which will make us skip the event that actually needs to be stopped. + * So ring_buffer_attach() has to stop an aux event before re-assigning + * its rb pointer. */ - if (task_ctx) { - perf_event_aux_task_ctx(output, data, task_ctx); - return; - } + if (rcu_dereference(parent->rb) == rb) + ro->err = __perf_event_stop(&sd); +} + +static int __perf_pmu_output_stop(void *info) +{ + struct perf_event *event = info; + struct pmu *pmu = event->pmu; + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct remote_output ro = { + .rb = event->rb, + }; rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data); - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_event_aux_ctx(ctx, output, data); -next: - put_cpu_ptr(pmu->pmu_cpu_context); + perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); + if (cpuctx->task_ctx) + perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, + &ro, false); + rcu_read_unlock(); + + return ro.err; +} + +static void perf_pmu_output_stop(struct perf_event *event) +{ + struct perf_event *iter; + int err, cpu; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { + /* + * For per-CPU events, we need to make sure that neither they + * nor their children are running; for cpu==-1 events it's + * sufficient to stop the event itself if it's active, since + * it can't have children. + */ + cpu = iter->cpu; + if (cpu == -1) + cpu = READ_ONCE(iter->oncpu); + + if (cpu == -1) + continue; + + err = cpu_function_call(cpu, __perf_pmu_output_stop, event); + if (err == -EAGAIN) { + rcu_read_unlock(); + goto restart; + } } rcu_read_unlock(); } @@ -5852,7 +6330,7 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_aux(perf_event_task_output, + perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } @@ -5931,7 +6409,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - perf_event_aux(perf_event_comm_output, + perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } @@ -6162,13 +6640,92 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - perf_event_aux(perf_event_mmap_output, + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); kfree(buf); } +/* + * Check whether inode and address range match filter criteria. + */ +static bool perf_addr_filter_match(struct perf_addr_filter *filter, + struct file *file, unsigned long offset, + unsigned long size) +{ + if (filter->inode != file->f_inode) + return false; + + if (filter->offset > offset + size) + return false; + + if (filter->offset + filter->size < offset) + return false; + + return true; +} + +static void __perf_addr_filters_adjust(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct vm_area_struct *vma = data; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; + struct file *file = vma->vm_file; + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + + if (!has_addr_filter(event)) + return; + + if (!file) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (perf_addr_filter_match(filter, file, off, + vma->vm_end - vma->vm_start)) { + event->addr_filters_offs[count] = vma->vm_start; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_stop(event, 1); +} + +/* + * Adjust all task's events' filters to the new vma + */ +static void perf_addr_filters_adjust(struct vm_area_struct *vma) +{ + struct perf_event_context *ctx; + int ctxn; + + /* + * Data tracing isn't supported yet and as such there is no need + * to keep track of anything that isn't related to executable code: + */ + if (!(vma->vm_flags & VM_EXEC)) + return; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (!ctx) + continue; + + perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); + } + rcu_read_unlock(); +} + void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -6200,6 +6757,7 @@ void perf_event_mmap(struct vm_area_struct *vma) /* .flags (attr_mmap2 only) */ }; + perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); } @@ -6350,7 +6908,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - perf_event_aux(perf_event_switch_output, + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } @@ -6491,10 +7049,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - if (event->overflow_handler) - event->overflow_handler(event, data, regs); - else - perf_event_output(event, data, regs); + event->overflow_handler(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -6727,7 +7282,7 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -inline void perf_swevent_put_recursion_context(int rctx) +void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); @@ -6960,7 +7515,7 @@ static struct pmu perf_swevent = { static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { - void *record = data->raw->data; + void *record = data->raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -6989,7 +7544,26 @@ static int perf_tp_event_match(struct perf_event *event, return 1; } -void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, +void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, + struct trace_event_call *call, u64 count, + struct pt_regs *regs, struct hlist_head *head, + struct task_struct *task) +{ + struct bpf_prog *prog = call->prog; + + if (prog) { + *(struct pt_regs **)raw_data = regs; + if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + } + perf_tp_event(call->event.type, count, raw_data, size, regs, head, + rctx, task); +} +EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); + +void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) { @@ -6997,13 +7571,17 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, struct perf_event *event; struct perf_raw_record raw = { - .size = entry_size, - .data = record, + .frag = { + .size = entry_size, + .data = record, + }, }; - perf_sample_data_init(&data, addr, 0); + perf_sample_data_init(&data, 0, 0); data.raw = &raw; + perf_trace_buf_update(record, event_type); + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); @@ -7081,24 +7659,6 @@ static inline void perf_tp_register(void) perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); } -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - char *filter_str; - int ret; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - - filter_str = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(filter_str)) - return PTR_ERR(filter_str); - - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - - kfree(filter_str); - return ret; -} - static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); @@ -7106,6 +7666,7 @@ static void perf_event_free_filter(struct perf_event *event) static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { + bool is_kprobe, is_tracepoint; struct bpf_prog *prog; if (event->attr.type != PERF_TYPE_TRACEPOINT) @@ -7114,20 +7675,31 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) if (event->tp_event->prog) return -EEXIST; - if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) - /* bpf programs can only be attached to u/kprobes */ + is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; + is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; + if (!is_kprobe && !is_tracepoint) + /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; prog = bpf_prog_get(prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->type != BPF_PROG_TYPE_KPROBE) { + if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || + (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { /* valid fd, but invalid bpf program type */ bpf_prog_put(prog); return -EINVAL; } + if (is_tracepoint) { + int off = trace_event_get_offsets(event->tp_event); + + if (prog->aux->max_ctx_offset > off) { + bpf_prog_put(prog); + return -EACCES; + } + } event->tp_event->prog = prog; return 0; @@ -7153,11 +7725,6 @@ static inline void perf_tp_register(void) { } -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - return -ENOENT; -} - static void perf_event_free_filter(struct perf_event *event) { } @@ -7186,6 +7753,393 @@ void perf_bp_event(struct perf_event *bp, void *data) #endif /* + * Allocate a new address filter + */ +static struct perf_addr_filter * +perf_addr_filter_new(struct perf_event *event, struct list_head *filters) +{ + int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu); + struct perf_addr_filter *filter; + + filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); + if (!filter) + return NULL; + + INIT_LIST_HEAD(&filter->entry); + list_add_tail(&filter->entry, filters); + + return filter; +} + +static void free_filters_list(struct list_head *filters) +{ + struct perf_addr_filter *filter, *iter; + + list_for_each_entry_safe(filter, iter, filters, entry) { + if (filter->inode) + iput(filter->inode); + list_del(&filter->entry); + kfree(filter); + } +} + +/* + * Free existing address filters and optionally install new ones + */ +static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head) +{ + unsigned long flags; + LIST_HEAD(list); + + if (!has_addr_filter(event)) + return; + + /* don't bother with children, they don't have their own filters */ + if (event->parent) + return; + + raw_spin_lock_irqsave(&event->addr_filters.lock, flags); + + list_splice_init(&event->addr_filters.list, &list); + if (head) + list_splice(head, &event->addr_filters.list); + + raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); + + free_filters_list(&list); +} + +/* + * Scan through mm's vmas and see if one of them matches the + * @filter; if so, adjust filter's address range. + * Called with mm::mmap_sem down for reading. + */ +static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + struct file *file = vma->vm_file; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + unsigned long vma_size = vma->vm_end - vma->vm_start; + + if (!file) + continue; + + if (!perf_addr_filter_match(filter, file, off, vma_size)) + continue; + + return vma->vm_start; + } + + return 0; +} + +/* + * Update event's address range filters based on the + * task's existing mappings, if any. + */ +static void perf_event_addr_filters_apply(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct task_struct *task = READ_ONCE(event->ctx->task); + struct perf_addr_filter *filter; + struct mm_struct *mm = NULL; + unsigned int count = 0; + unsigned long flags; + + /* + * We may observe TASK_TOMBSTONE, which means that the event tear-down + * will stop on the parent's child_mutex that our caller is also holding + */ + if (task == TASK_TOMBSTONE) + return; + + mm = get_task_mm(event->ctx->task); + if (!mm) + goto restart; + + down_read(&mm->mmap_sem); + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + event->addr_filters_offs[count] = 0; + + /* + * Adjust base offset if the filter is associated to a binary + * that needs to be mapped: + */ + if (filter->inode) + event->addr_filters_offs[count] = + perf_addr_filter_apply(filter, mm); + + count++; + } + + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + up_read(&mm->mmap_sem); + + mmput(mm); + +restart: + perf_event_stop(event, 1); +} + +/* + * Address range filtering: limiting the data to certain + * instruction address ranges. Filters are ioctl()ed to us from + * userspace as ascii strings. + * + * Filter string format: + * + * ACTION RANGE_SPEC + * where ACTION is one of the + * * "filter": limit the trace to this region + * * "start": start tracing from this address + * * "stop": stop tracing at this address/region; + * RANGE_SPEC is + * * for kernel addresses: <start address>[/<size>] + * * for object files: <start address>[/<size>]@</path/to/object/file> + * + * if <size> is not specified, the range is treated as a single address. + */ +enum { + IF_ACT_FILTER, + IF_ACT_START, + IF_ACT_STOP, + IF_SRC_FILE, + IF_SRC_KERNEL, + IF_SRC_FILEADDR, + IF_SRC_KERNELADDR, +}; + +enum { + IF_STATE_ACTION = 0, + IF_STATE_SOURCE, + IF_STATE_END, +}; + +static const match_table_t if_tokens = { + { IF_ACT_FILTER, "filter" }, + { IF_ACT_START, "start" }, + { IF_ACT_STOP, "stop" }, + { IF_SRC_FILE, "%u/%u@%s" }, + { IF_SRC_KERNEL, "%u/%u" }, + { IF_SRC_FILEADDR, "%u@%s" }, + { IF_SRC_KERNELADDR, "%u" }, +}; + +/* + * Address filter string parser + */ +static int +perf_event_parse_addr_filter(struct perf_event *event, char *fstr, + struct list_head *filters) +{ + struct perf_addr_filter *filter = NULL; + char *start, *orig, *filename = NULL; + struct path path; + substring_t args[MAX_OPT_ARGS]; + int state = IF_STATE_ACTION, token; + unsigned int kernel = 0; + int ret = -EINVAL; + + orig = fstr = kstrdup(fstr, GFP_KERNEL); + if (!fstr) + return -ENOMEM; + + while ((start = strsep(&fstr, " ,\n")) != NULL) { + ret = -EINVAL; + + if (!*start) + continue; + + /* filter definition begins */ + if (state == IF_STATE_ACTION) { + filter = perf_addr_filter_new(event, filters); + if (!filter) + goto fail; + } + + token = match_token(start, if_tokens, args); + switch (token) { + case IF_ACT_FILTER: + case IF_ACT_START: + filter->filter = 1; + + case IF_ACT_STOP: + if (state != IF_STATE_ACTION) + goto fail; + + state = IF_STATE_SOURCE; + break; + + case IF_SRC_KERNELADDR: + case IF_SRC_KERNEL: + kernel = 1; + + case IF_SRC_FILEADDR: + case IF_SRC_FILE: + if (state != IF_STATE_SOURCE) + goto fail; + + if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) + filter->range = 1; + + *args[0].to = 0; + ret = kstrtoul(args[0].from, 0, &filter->offset); + if (ret) + goto fail; + + if (filter->range) { + *args[1].to = 0; + ret = kstrtoul(args[1].from, 0, &filter->size); + if (ret) + goto fail; + } + + if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { + int fpos = filter->range ? 2 : 1; + + filename = match_strdup(&args[fpos]); + if (!filename) { + ret = -ENOMEM; + goto fail; + } + } + + state = IF_STATE_END; + break; + + default: + goto fail; + } + + /* + * Filter definition is fully parsed, validate and install it. + * Make sure that it doesn't contradict itself or the event's + * attribute. + */ + if (state == IF_STATE_END) { + if (kernel && event->attr.exclude_kernel) + goto fail; + + if (!kernel) { + if (!filename) + goto fail; + + /* look up the path and grab its inode */ + ret = kern_path(filename, LOOKUP_FOLLOW, &path); + if (ret) + goto fail_free_name; + + filter->inode = igrab(d_inode(path.dentry)); + path_put(&path); + kfree(filename); + filename = NULL; + + ret = -EINVAL; + if (!filter->inode || + !S_ISREG(filter->inode->i_mode)) + /* free_filters_list() will iput() */ + goto fail; + } + + /* ready to consume more filters */ + state = IF_STATE_ACTION; + filter = NULL; + } + } + + if (state != IF_STATE_ACTION) + goto fail; + + kfree(orig); + + return 0; + +fail_free_name: + kfree(filename); +fail: + free_filters_list(filters); + kfree(orig); + + return ret; +} + +static int +perf_event_set_addr_filter(struct perf_event *event, char *filter_str) +{ + LIST_HEAD(filters); + int ret; + + /* + * Since this is called in perf_ioctl() path, we're already holding + * ctx::mutex. + */ + lockdep_assert_held(&event->ctx->mutex); + + if (WARN_ON_ONCE(event->parent)) + return -EINVAL; + + /* + * For now, we only support filtering in per-task events; doing so + * for CPU-wide events requires additional context switching trickery, + * since same object code will be mapped at different virtual + * addresses in different processes. + */ + if (!event->ctx->task) + return -EOPNOTSUPP; + + ret = perf_event_parse_addr_filter(event, filter_str, &filters); + if (ret) + return ret; + + ret = event->pmu->addr_filters_validate(&filters); + if (ret) { + free_filters_list(&filters); + return ret; + } + + /* remove existing filters, if any */ + perf_addr_filters_splice(event, &filters); + + /* install new filters */ + perf_event_for_each_child(event, perf_event_addr_filters_apply); + + return ret; +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret = -EINVAL; + + if ((event->attr.type != PERF_TYPE_TRACEPOINT || + !IS_ENABLED(CONFIG_EVENT_TRACING)) && + !has_addr_filter(event)) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + if (IS_ENABLED(CONFIG_EVENT_TRACING) && + event->attr.type == PERF_TYPE_TRACEPOINT) + ret = ftrace_profile_set_filter(event, event->attr.config, + filter_str); + else if (has_addr_filter(event)) + ret = perf_event_set_addr_filter(event, filter_str); + + kfree(filter_str); + return ret; +} + +/* * hrtimer based swevent callback */ @@ -7542,6 +8496,20 @@ static void free_pmu_context(struct pmu *pmu) out: mutex_unlock(&pmus_lock); } + +/* + * Let userspace know that this PMU supports address range filtering: + */ +static ssize_t nr_addr_filters_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); +} +DEVICE_ATTR_RO(nr_addr_filters); + static struct idr pmu_idr; static ssize_t @@ -7643,9 +8611,19 @@ static int pmu_dev_alloc(struct pmu *pmu) if (ret) goto free_dev; + /* For PMUs with address filters, throw in an extra attribute: */ + if (pmu->nr_addr_filters) + ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); + + if (ret) + goto del_dev; + out: return ret; +del_dev: + device_del(pmu->dev); + free_dev: put_device(pmu->dev); goto out; @@ -7685,6 +8663,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) } skip_type: + if (pmu->task_ctx_nr == perf_hw_context) { + static int hw_context_taken = 0; + + /* + * Other than systems with heterogeneous CPUs, it never makes + * sense for two PMUs to share perf_hw_context. PMUs which are + * uncore must use perf_invalid_context. + */ + if (WARN_ON_ONCE(hw_context_taken && + !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) + pmu->task_ctx_nr = perf_invalid_context; + + hw_context_taken = 1; + } + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; @@ -7772,6 +8765,8 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); free_pmu_context(pmu); @@ -7843,6 +8838,28 @@ unlock: return pmu; } +static void attach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_add_rcu(&event->sb_list, &pel->list); + raw_spin_unlock(&pel->lock); +} + +/* + * We keep a list of all !task (and therefore per-cpu) events + * that need to receive side-band records. + * + * This avoids having to scan all the various PMU per-cpu contexts + * looking for them. + */ +static void account_pmu_sb_event(struct perf_event *event) +{ + if (is_sb_event(event)) + attach_sb_event(event); +} + static void account_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -7923,6 +8940,8 @@ static void account_event(struct perf_event *event) enabled: account_event_cpu(event, event->cpu); + + account_pmu_sb_event(event); } /* @@ -7965,6 +8984,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); + INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); @@ -7972,6 +8992,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); + raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); event->cpu = cpu; @@ -8006,8 +9027,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; } - event->overflow_handler = overflow_handler; - event->overflow_handler_context = context; + if (overflow_handler) { + event->overflow_handler = overflow_handler; + event->overflow_handler_context = context; + } else if (is_write_backward(event)){ + event->overflow_handler = perf_event_output_backward; + event->overflow_handler_context = NULL; + } else { + event->overflow_handler = perf_event_output_forward; + event->overflow_handler_context = NULL; + } perf_event__state_init(event); @@ -8048,11 +9077,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) goto err_pmu; + if (has_addr_filter(event)) { + event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, + sizeof(unsigned long), + GFP_KERNEL); + if (!event->addr_filters_offs) + goto err_per_task; + + /* force hw sync on the address filters */ + event->addr_filters_gen = 1; + } + if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { - err = get_callchain_buffers(); + err = get_callchain_buffers(attr->sample_max_stack); if (err) - goto err_per_task; + goto err_addr_filters; } } @@ -8061,6 +9101,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, return event; +err_addr_filters: + kfree(event->addr_filters_offs); + err_per_task: exclusive_event_destroy(event); @@ -8240,6 +9283,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) goto out; /* + * Either writing ring buffer from beginning or from end. + * Mixing is not allowed. + */ + if (is_write_backward(output_event) != is_write_backward(event)) + goto out; + + /* * If both events generate aux data, they must be on the same PMU */ if (has_aux(event) && has_aux(output_event) && @@ -8362,6 +9412,9 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + if (!attr.sample_max_stack) + attr.sample_max_stack = sysctl_perf_event_max_stack; + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument @@ -8435,7 +9488,7 @@ SYSCALL_DEFINE5(perf_event_open, if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { - err = -ENOTSUPP; + err = -EOPNOTSUPP; goto err_alloc; } } @@ -9397,10 +10450,13 @@ static void __init perf_event_init_all_cpus(void) swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); + + INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); + raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); } } -static void perf_event_init_cpu(int cpu) +int perf_event_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -9413,6 +10469,7 @@ static void perf_event_init_cpu(int cpu) rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); + return 0; } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE @@ -9444,14 +10501,17 @@ static void perf_event_exit_cpu_context(int cpu) } srcu_read_unlock(&pmus_srcu, idx); } +#else + +static void perf_event_exit_cpu_context(int cpu) { } + +#endif -static void perf_event_exit_cpu(int cpu) +int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); + return 0; } -#else -static inline void perf_event_exit_cpu(int cpu) { } -#endif static int perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) @@ -9473,46 +10533,6 @@ static struct notifier_block perf_reboot_notifier = { .priority = INT_MIN, }; -static int -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - /* - * This must be done before the CPU comes alive, because the - * moment we can run tasks we can encounter (software) events. - * - * Specifically, someone can have inherited events on kthreadd - * or a pre-existing worker thread that gets re-bound. - */ - perf_event_init_cpu(cpu); - break; - - case CPU_DOWN_PREPARE: - /* - * This must be done before the CPU dies because after that an - * active event might want to IPI the CPU and that'll not work - * so great for dead CPUs. - * - * XXX smp_call_function_single() return -ENXIO without a warn - * so we could possibly deal with this. - * - * This is safe against new events arriving because - * sys_perf_event_open() serializes against hotplug using - * get_online_cpus(). - */ - perf_event_exit_cpu(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - void __init perf_event_init(void) { int ret; @@ -9525,7 +10545,7 @@ void __init perf_event_init(void) perf_pmu_register(&perf_cpu_clock, NULL, -1); perf_pmu_register(&perf_task_clock, NULL, -1); perf_tp_register(); - perf_cpu_notifier(perf_cpu_notify); + perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 4199b6d193f5..486fd78eb8d5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -11,13 +11,13 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; - struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ int overwrite; /* can overwrite itself */ + int paused; /* can write into ring buffer */ atomic_t poll; /* POLL_ for wakeups */ @@ -65,6 +65,14 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head) rb_free(rb); } +static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause) +{ + if (!pause && rb->nr_pages) + rb->paused = 0; + else + rb->paused = 1; +} + extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); @@ -115,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) return rb->aux_nr_pages << PAGE_SHIFT; } -#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ -static inline unsigned long \ -func_name(struct perf_output_handle *handle, \ - const void *buf, unsigned long len) \ +#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \ { \ unsigned long size, written; \ \ do { \ size = min(handle->size, len); \ - written = memcpy_func(handle->addr, buf, size); \ + written = memcpy_func(__VA_ARGS__); \ written = size - written; \ \ len -= written; \ handle->addr += written; \ - buf += written; \ + if (advance_buf) \ + buf += written; \ handle->size -= written; \ if (!handle->size) { \ struct ring_buffer *rb = handle->rb; \ @@ -144,6 +150,21 @@ func_name(struct perf_output_handle *handle, \ return len; \ } +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ +static inline unsigned long \ +func_name(struct perf_output_handle *handle, \ + const void *buf, unsigned long len) \ +__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size) + +static inline unsigned long +__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func, + const void *buf, unsigned long len) +{ + unsigned long orig_len = len; + __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf, + orig_len - len, size) +} + static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n) { diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 7611d0f66cf8..257fa460b846 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -102,8 +102,21 @@ out: preempt_enable(); } -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) +static bool __always_inline +ring_buffer_has_space(unsigned long head, unsigned long tail, + unsigned long data_size, unsigned int size, + bool backward) +{ + if (!backward) + return CIRC_SPACE(head, tail, data_size) >= size; + else + return CIRC_SPACE(tail, head, data_size) >= size; +} + +static int __always_inline +__perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + bool backward) { struct ring_buffer *rb; unsigned long tail, offset, head; @@ -125,8 +138,11 @@ int perf_output_begin(struct perf_output_handle *handle, if (unlikely(!rb)) goto out; - if (unlikely(!rb->nr_pages)) + if (unlikely(rb->paused)) { + if (rb->nr_pages) + local_inc(&rb->lost); goto out; + } handle->rb = rb; handle->event = event; @@ -143,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle, do { tail = READ_ONCE(rb->user_page->data_tail); offset = head = local_read(&rb->head); - if (!rb->overwrite && - unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) - goto fail; + if (!rb->overwrite) { + if (unlikely(!ring_buffer_has_space(head, tail, + perf_data_size(rb), + size, backward))) + goto fail; + } /* * The above forms a control dependency barrier separating the @@ -159,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle, * See perf_output_put_handle(). */ - head += size; + if (!backward) + head += size; + else + head -= size; } while (local_cmpxchg(&rb->head, offset, head) != offset); + if (backward) { + offset = head; + head = (u64)(-head); + } + /* * We rely on the implied barrier() by local_cmpxchg() to ensure * none of the data stores below can be lifted up by the compiler. @@ -203,6 +230,26 @@ out: return -ENOSPC; } +int perf_output_begin_forward(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, false); +} + +int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, true); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + + return __perf_output_begin(handle, event, size, + unlikely(is_write_backward(event))); +} + unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { @@ -221,8 +268,6 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void rb_irq_work(struct irq_work *work); - static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -243,16 +288,13 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); - init_irq_work(&rb->irq_work, rb_irq_work); -} - -static void ring_buffer_put_async(struct ring_buffer *rb) -{ - if (!atomic_dec_and_test(&rb->refcount)) - return; - rb->rcu_head.next = (void *)rb; - irq_work_queue(&rb->irq_work); + /* + * perf_output_begin() only checks rb->paused, therefore + * rb->paused must be true if we have no pages for output. + */ + if (!rb->nr_pages) + rb->paused = 1; } /* @@ -264,6 +306,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb) * The ordering is similar to that of perf_output_{begin,end}, with * the exception of (B), which should be taken care of by the pmu * driver, since ordering rules will differ depending on hardware. + * + * Call this from pmu::start(); see the comment in perf_aux_output_end() + * about its use in pmu callbacks. Both can also be called from the PMI + * handler if needed. */ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) @@ -284,7 +330,21 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!rb) return NULL; - if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + if (!rb_has_aux(rb)) + goto err; + + /* + * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), + * about to get freed, so we leave immediately. + * + * Checking rb::aux_mmap_count and rb::refcount has to be done in + * the same order, see perf_mmap_close. Otherwise we end up freeing + * aux pages in this path, which is a bug, because in_atomic(). + */ + if (!atomic_read(&rb->aux_mmap_count)) + goto err; + + if (!atomic_inc_not_zero(&rb->aux_refcount)) goto err; /* @@ -328,10 +388,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, return handle->rb->aux_priv; err_put: + /* can't be last */ rb_free_aux(rb); err: - ring_buffer_put_async(rb); + ring_buffer_put(rb); handle->event = NULL; return NULL; @@ -342,6 +403,10 @@ err: * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the * pmu driver's responsibility to observe ordering rules of the hardware, * so that all the data is externally visible before this is called. + * + * Note: this has to be called from pmu::stop() callback, as the assumption + * of the AUX buffer management code is that after pmu::stop(), the AUX + * transaction must be stopped and therefore drop the AUX reference count. */ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, bool truncated) @@ -389,8 +454,9 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, handle->event = NULL; local_set(&rb->aux_nest, 0); + /* can't be last */ rb_free_aux(rb); - ring_buffer_put_async(rb); + ring_buffer_put(rb); } /* @@ -471,6 +537,14 @@ static void __rb_free_aux(struct ring_buffer *rb) { int pg; + /* + * Should never happen, the last reference should be dropped from + * perf_mmap_close() path, which first stops aux transactions (which + * in turn are the atomic holders of aux_refcount) and then does the + * last rb_free_aux(). + */ + WARN_ON_ONCE(in_atomic()); + if (rb->aux_priv) { rb->free_aux(rb->aux_priv); rb->free_aux = NULL; @@ -582,18 +656,7 @@ out: void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) - irq_work_queue(&rb->irq_work); -} - -static void rb_irq_work(struct irq_work *work) -{ - struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); - - if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); - - if (rb->rcu_head.next == (void *)rb) - call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7edc95edfaee..8c50276b60d1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); - if (!ptep) + if (!ptep) { + mem_cgroup_cancel_charge(kpage, memcg, false); goto unlock; + } get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr, false); @@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg, false); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; @@ -1130,7 +1131,9 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) struct vm_area_struct *vma; int ret; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + if (mm->uprobes_state.xol_area) { ret = -EALREADY; goto fail; @@ -1469,7 +1472,8 @@ static void dup_xol_work(struct callback_head *work) if (current->flags & PF_EXITING) return; - if (!__create_xol_area(current->utask->dup_xol_addr)) + if (!__create_xol_area(current->utask->dup_xol_addr) && + !fatal_signal_pending(current)) uprobe_warn(current, "dup xol area"); } @@ -1694,8 +1698,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) int result; pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); + result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) diff --git a/kernel/exit.c b/kernel/exit.c index fd90195667e1..091a78be3b09 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -211,6 +211,82 @@ repeat: } /* + * Note that if this function returns a valid task_struct pointer (!NULL) + * task->usage must remain >0 for the duration of the RCU critical section. + */ +struct task_struct *task_rcu_dereference(struct task_struct **ptask) +{ + struct sighand_struct *sighand; + struct task_struct *task; + + /* + * We need to verify that release_task() was not called and thus + * delayed_put_task_struct() can't run and drop the last reference + * before rcu_read_unlock(). We check task->sighand != NULL, + * but we can read the already freed and reused memory. + */ +retry: + task = rcu_dereference(*ptask); + if (!task) + return NULL; + + probe_kernel_address(&task->sighand, sighand); + + /* + * Pairs with atomic_dec_and_test() in put_task_struct(). If this task + * was already freed we can not miss the preceding update of this + * pointer. + */ + smp_rmb(); + if (unlikely(task != READ_ONCE(*ptask))) + goto retry; + + /* + * We've re-checked that "task == *ptask", now we have two different + * cases: + * + * 1. This is actually the same task/task_struct. In this case + * sighand != NULL tells us it is still alive. + * + * 2. This is another task which got the same memory for task_struct. + * We can't know this of course, and we can not trust + * sighand != NULL. + * + * In this case we actually return a random value, but this is + * correct. + * + * If we return NULL - we can pretend that we actually noticed that + * *ptask was updated when the previous task has exited. Or pretend + * that probe_slab_address(&sighand) reads NULL. + * + * If we return the new task (because sighand is not NULL for any + * reason) - this is fine too. This (new) task can't go away before + * another gp pass. + * + * And note: We could even eliminate the false positive if re-read + * task->sighand once again to avoid the falsely NULL. But this case + * is very unlikely so we don't care. + */ + if (!sighand) + return NULL; + + return task; +} + +struct task_struct *try_get_task_struct(struct task_struct **ptask) +{ + struct task_struct *task; + + rcu_read_lock(); + task = task_rcu_dereference(ptask); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + return task; +} + +/* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are @@ -639,7 +715,7 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } @@ -700,10 +776,14 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* - * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. + * Ensure that all new tsk->pi_lock acquisitions must observe + * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). */ smp_mb(); + /* + * Ensure that we must observe the pi_state in exit_mm() -> + * mm_release() -> exit_pi_state_list(). + */ raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) { @@ -746,7 +826,7 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - exit_thread(); + exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent @@ -768,12 +848,7 @@ void do_exit(long code) TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); proc_exit_connector(tsk); -#ifdef CONFIG_NUMA - task_lock(tsk); - mpol_put(tsk->mempolicy); - tsk->mempolicy = NULL; - task_unlock(tsk); -#endif + mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); @@ -918,17 +993,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p) task_pid_type(p, wo->wo_type) == wo->wo_pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int +eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. (Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) - && !(wo->wo_flags & __WALL)) + + /* + * Wait for all children (clone and not) if __WALL is set or + * if it is traced by us. + */ + if (ptrace || (wo->wo_flags & __WALL)) + return 1; + + /* + * Otherwise, wait for clone children *only* if __WCLONE is set; + * otherwise, wait for non-clone children *only*. + * + * Note: a "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD, or a non-leader thread which + * we can only see if it is traced by us. + */ + if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; @@ -1300,7 +1386,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (unlikely(exit_state == EXIT_DEAD)) return 0; - ret = eligible_child(wo, p); + ret = eligible_child(wo, ptrace, p); if (!ret) return ret; @@ -1524,7 +1610,8 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, enum pid_type type; long ret; - if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| + __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; diff --git a/kernel/fork.c b/kernel/fork.c index d277e83ed3e0..beb31725f7e2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -148,57 +148,49 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack) { } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, - THREAD_SIZE_ORDER); - - if (page) - memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, - 1 << THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack) { - struct page *page = virt_to_page(ti); - - memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, - -(1 << THREAD_SIZE_ORDER)); - __free_kmem_pages(page, THREAD_SIZE_ORDER); + __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); } # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); + return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_info(struct thread_info *ti) +static void free_thread_stack(unsigned long *stack) { - kmem_cache_free(thread_info_cache, ti); + kmem_cache_free(thread_stack_cache, stack); } -void thread_info_cache_init(void) +void thread_stack_cache_init(void) { - thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, NULL); - BUG_ON(thread_info_cache == NULL); + BUG_ON(thread_stack_cache == NULL); } # endif #endif @@ -221,18 +213,24 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(ti)); + /* All stack pages are in the same zone and belong to the same memcg. */ + struct page *first_page = virt_to_page(stack); + + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); - mod_zone_page_state(zone, NR_KERNEL_STACK, account); + memcg_kmem_update_page_stat( + first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); - arch_release_thread_info(tsk->stack); - free_thread_info(tsk->stack); + arch_release_thread_stack(tsk->stack); + free_thread_stack(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -340,26 +338,27 @@ void set_task_stack_end_magic(struct task_struct *tsk) *stackend = STACK_END_MAGIC; /* for overflow detection */ } -static struct task_struct *dup_task_struct(struct task_struct *orig) +static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - struct thread_info *ti; - int node = tsk_fork_get_node(orig); + unsigned long *stack; int err; + if (node == NUMA_NO_NODE) + node = tsk_fork_get_node(orig); tsk = alloc_task_struct_node(node); if (!tsk) return NULL; - ti = alloc_thread_info_node(tsk, node); - if (!ti) + stack = alloc_thread_stack_node(tsk, node); + if (!stack) goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto free_ti; + goto free_stack; - tsk->stack = ti; + tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -391,14 +390,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(ti, 1); + account_kernel_stack(stack, 1); kcov_task_init(tsk); return tsk; -free_ti: - free_thread_info(ti); +free_stack: + free_thread_stack(stack); free_tsk: free_task_struct(tsk); return NULL; @@ -413,7 +412,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) unsigned long charge; uprobe_start_dup_mmap(); - down_write(&oldmm->mmap_sem); + if (down_write_killable(&oldmm->mmap_sem)) { + retval = -EINTR; + goto fail_uprobe_end; + } flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* @@ -525,6 +527,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); +fail_uprobe_end: uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: @@ -699,6 +702,26 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +static inline void __mmput(struct mm_struct *mm) +{ + VM_BUG_ON(atomic_read(&mm->mm_users)); + + uprobe_clear_state(mm); + exit_aio(mm); + ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ + exit_mmap(mm); + set_mm_exe_file(mm, NULL); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (mm->binfmt) + module_put(mm->binfmt->module); + mmdrop(mm); +} + /* * Decrement the use count and release all resources for an mm. */ @@ -706,24 +729,26 @@ void mmput(struct mm_struct *mm) { might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) + __mmput(mm); +} +EXPORT_SYMBOL_GPL(mmput); + +#ifdef CONFIG_MMU +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ if (atomic_dec_and_test(&mm->mm_users)) { - uprobe_clear_state(mm); - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); } } -EXPORT_SYMBOL_GPL(mmput); +#endif /** * set_mm_exe_file - change a reference to the mm's executable file @@ -774,6 +799,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) EXPORT_SYMBOL(get_mm_exe_file); /** + * get_task_exe_file - acquire a reference to the task's executable file + * + * Returns %NULL if task's mm (if any) has no associated executable file or + * this is a kernel thread with borrowed mm (see the comment above get_task_mm). + * User must release file via fput(). + */ +struct file *get_task_exe_file(struct task_struct *task) +{ + struct file *exe_file = NULL; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (!(task->flags & PF_KTHREAD)) + exe_file = get_mm_exe_file(mm); + } + task_unlock(task); + return exe_file; +} +EXPORT_SYMBOL(get_task_exe_file); + +/** * get_task_mm - acquire a reference to the task's mm * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning @@ -888,14 +936,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) deactivate_mm(tsk, mm); /* - * If we're exiting normally, clear a user-space tid field if - * requested. We leave this alone when dying by signal, to leave - * the value intact in a core dump, and to save the unnecessary - * trouble, say, a killed vfork parent shouldn't touch this mm. - * Userland only wants this done for a sys_exit. + * Signal userspace if we're not exiting with a core dump + * because we want to leave the value intact for debugging + * purposes. */ if (tsk->clear_child_tid) { - if (!(tsk->flags & PF_SIGNALED) && + if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) && atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has @@ -1256,7 +1302,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, int __user *child_tidptr, struct pid *pid, int trace, - unsigned long tls) + unsigned long tls, + int node) { int retval; struct task_struct *p; @@ -1308,7 +1355,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; retval = -ENOMEM; - p = dup_task_struct(current); + p = dup_task_struct(current, node); if (!p) goto fork_out; @@ -1378,7 +1425,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1470,7 +1516,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (IS_ERR(pid)) { retval = PTR_ERR(pid); - goto bad_fork_cleanup_io; + goto bad_fork_cleanup_thread; } } @@ -1494,7 +1540,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) - p->sas_ss_sp = p->sas_ss_size = 0; + sas_ss_reset(p); /* * Syscall tracing and stepping should be turned off in the @@ -1530,6 +1576,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed @@ -1630,8 +1677,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p); bad_fork_free_pid: + threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); +bad_fork_cleanup_thread: + exit_thread(p); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); @@ -1660,7 +1710,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); @@ -1684,7 +1733,8 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0); + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, + cpu_to_node(cpu)); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1729,7 +1779,7 @@ long _do_fork(unsigned long clone_flags, } p = copy_process(clone_flags, stack_start, stack_size, - child_tidptr, NULL, trace, tls); + child_tidptr, NULL, trace, tls, NUMA_NO_NODE); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. diff --git a/kernel/freezer.c b/kernel/freezer.c index a8900a3bc27a..6f56a9e219fa 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; - if (test_thread_flag(TIF_MEMDIE)) + if (test_tsk_thread_flag(p, TIF_MEMDIE)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) diff --git a/kernel/futex.c b/kernel/futex.c index c20f06f38ef3..46cb3a301bc1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled; * Futex flags used to encode options to functions and preserve them across * restarts. */ -#define FLAGS_SHARED 0x01 +#ifdef CONFIG_MMU +# define FLAGS_SHARED 0x01 +#else +/* + * NOMMU does not have per process address space. Let the compiler optimize + * code away. + */ +# define FLAGS_SHARED 0x00 +#endif #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 @@ -405,6 +413,16 @@ static void get_futex_key_refs(union futex_key *key) if (!key->both.ptr) return; + /* + * On MMU less systems futexes are always "private" as there is no per + * process address space. We need the smp wmb nevertheless - yes, + * arch/blackfin has MMU less SMP ... + */ + if (!IS_ENABLED(CONFIG_MMU)) { + smp_mb(); /* explicit smp_mb(); (B) */ + return; + } + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: ihold(key->shared.inode); /* implies smp_mb(); (B) */ @@ -436,6 +454,9 @@ static void drop_futex_key_refs(union futex_key *key) return; } + if (!IS_ENABLED(CONFIG_MMU)) + return; + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: iput(key->shared.inode); @@ -469,7 +490,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page; + struct page *page, *tail; struct address_space *mapping; int err, ro = 0; @@ -530,7 +551,15 @@ again: * considered here and page lock forces unnecessarily serialization * From this point on, mapping will be re-verified if necessary and * page lock will be acquired only if it is unavoidable - */ + * + * Mapping checks require the head page for any compound page so the + * head page and mapping is looked up now. For anonymous pages, it + * does not matter if the page splits in the future as the key is + * based on the address. For filesystem-backed pages, the tail is + * required as the index of the page determines the key. For + * base pages, there is no tail page and tail == page. + */ + tail = page; page = compound_head(page); mapping = READ_ONCE(page->mapping); @@ -654,7 +683,7 @@ again: key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.inode = inode; - key->shared.pgoff = basepage_index(page); + key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } @@ -729,7 +758,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) int ret; pagefault_disable(); - ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); + ret = __get_user(*dest, from); pagefault_enable(); return ret ? -EFAULT : 0; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index c92e44855ddd..1276aabaab55 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL config GCOV_PROFILE_ALL bool "Profile entire Kernel" + depends on !COMPILE_TEST depends on GCOV_KERNEL depends on ARCH_HAS_GCOV_PROFILE_ALL default n diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index e25e92fb44fa..6a5c239c7669 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,7 @@ #include <linux/vmalloc.h> #include "gcov.h" -#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 +#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2ee42e95a3ce..1d3ee3169202 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o +obj-$(CONFIG_SMP) += affinity.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c new file mode 100644 index 000000000000..32f6cfcff212 --- /dev/null +++ b/kernel/irq/affinity.c @@ -0,0 +1,63 @@ + +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/cpu.h> + +static int get_first_sibling(unsigned int cpu) +{ + unsigned int ret; + + ret = cpumask_first(topology_sibling_cpumask(cpu)); + if (ret < nr_cpu_ids) + return ret; + return cpu; +} + +/* + * Take a map of online CPUs and the number of available interrupt vectors + * and generate an output cpumask suitable for spreading MSI/MSI-X vectors + * so that they are distributed as good as possible around the CPUs. If + * more vectors than CPUs are available we'll map one to each CPU, + * otherwise we map one to the first sibling of each socket. + * + * If there are more vectors than CPUs we will still only have one bit + * set per CPU, but interrupt code will keep on assigning the vectors from + * the start of the bitmap until we run out of vectors. + */ +struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) +{ + struct cpumask *affinity_mask; + unsigned int max_vecs = *nr_vecs; + + if (max_vecs == 1) + return NULL; + + affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!affinity_mask) { + *nr_vecs = 1; + return NULL; + } + + get_online_cpus(); + if (max_vecs >= num_online_cpus()) { + cpumask_copy(affinity_mask, cpu_online_mask); + *nr_vecs = num_online_cpus(); + } else { + unsigned int vecs = 0, cpu; + + for_each_online_cpu(cpu) { + if (cpu == get_first_sibling(cpu)) { + cpumask_set_cpu(cpu, affinity_mask); + vecs++; + } + + if (--max_vecs == 0) + break; + } + *nr_vecs = vecs; + } + put_online_cpus(); + + return affinity_mask; +} diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2f9f2b0e79f2..26ba5654d9d5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -426,6 +426,49 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_simple_irq); +/** + * handle_untracked_irq - Simple and software-decoded IRQs. + * @desc: the interrupt description structure for this irq + * + * Untracked interrupts are sent from a demultiplexing interrupt + * handler when the demultiplexer does not know which device it its + * multiplexed irq domain generated the interrupt. IRQ's handled + * through here are not subjected to stats tracking, randomness, or + * spurious interrupt detection. + * + * Note: Like handle_simple_irq, the caller is expected to handle + * the ack, clear, mask and unmask issues if necessary. + */ +void handle_untracked_irq(struct irq_desc *desc) +{ + unsigned int flags = 0; + + raw_spin_lock(&desc->lock); + + if (!irq_may_run(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + goto out_unlock; + } + + desc->istate &= ~IRQS_PENDING; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); + raw_spin_unlock(&desc->lock); + + __handle_irq_event_percpu(desc, &flags); + + raw_spin_lock(&desc->lock); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + +out_unlock: + raw_spin_unlock(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_untracked_irq); + /* * Called unconditionally from handle_level_irq() and only for oneshot * interrupts from handle_fasteoi_irq() @@ -777,6 +820,21 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, desc->name = name; if (handle != handle_bad_irq && is_chained) { + unsigned int type = irqd_get_trigger_type(&desc->irq_data); + + /* + * We're about to start this interrupt immediately, + * hence the need to set the trigger configuration. + * But the .set_type callback may have overridden the + * flow handler, ignoring that we're dealing with a + * chained interrupt. Reset it immediately because we + * do know better. + */ + if (type != IRQ_TYPE_NONE) { + __irq_set_trigger(desc, type); + desc->handle_irq = handle; + } + irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); @@ -1093,3 +1151,43 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return 0; } + +/** + * irq_chip_pm_get - Enable power for an IRQ chip + * @data: Pointer to interrupt specific data + * + * Enable the power to the IRQ chip referenced by the interrupt data + * structure. + */ +int irq_chip_pm_get(struct irq_data *data) +{ + int retval; + + if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) { + retval = pm_runtime_get_sync(data->chip->parent_device); + if (retval < 0) { + pm_runtime_put_noidle(data->chip->parent_device); + return retval; + } + } + + return 0; +} + +/** + * irq_chip_pm_put - Disable power for an IRQ chip + * @data: Pointer to interrupt specific data + * + * Disable the power to the IRQ chip referenced by the interrupt data + * structure, belongs. Note that power will only be disabled, once this + * function has been called for all IRQs that have called irq_chip_pm_get(). + */ +int irq_chip_pm_put(struct irq_data *data) +{ + int retval = 0; + + if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) + retval = pm_runtime_put(data->chip->parent_device); + + return (retval < 0) ? retval : 0; +} diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a15b5485b446..d3f24905852c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,10 +132,10 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } -irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags) { irqreturn_t retval = IRQ_NONE; - unsigned int flags = 0, irq = desc->irq_data.irq; + unsigned int irq = desc->irq_data.irq; struct irqaction *action; for_each_action_of_desc(desc, action) { @@ -164,7 +164,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) /* Fall through to add to randomness */ case IRQ_HANDLED: - flags |= action->flags; + *flags |= action->flags; break; default: @@ -174,7 +174,17 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) retval |= res; } - add_interrupt_randomness(irq, flags); + return retval; +} + +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) +{ + irqreturn_t retval; + unsigned int flags = 0; + + retval = __handle_irq_event_percpu(desc, &flags); + + add_interrupt_randomness(desc->irq_data.irq, flags); if (!noirqdebug) note_interrupt(desc, retval); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 09be2c903c6d..bc226e783bd2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -7,6 +7,7 @@ */ #include <linux/irqdesc.h> #include <linux/kernel_stat.h> +#include <linux/pm_runtime.h> #ifdef CONFIG_SPARSE_IRQ # define IRQ_BITMAP_BITS (NR_IRQS + 8196) @@ -83,6 +84,7 @@ extern void irq_mark_irq(unsigned int irq); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); @@ -105,6 +107,8 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +extern bool irq_can_set_affinity_usr(unsigned int irq); + extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 14777af8e097..1a9abc1c8ea0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -19,9 +19,9 @@ * * Allocate a virq that can be used to send IPI to any CPU in dest mask. * - * On success it'll return linux irq number and 0 on failure + * On success it'll return linux irq number and error code on failure */ -unsigned int irq_reserve_ipi(struct irq_domain *domain, +int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest) { unsigned int nr_irqs, offset; @@ -30,18 +30,18 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain, if (!domain ||!irq_domain_is_ipi(domain)) { pr_warn("Reservation on a non IPI domain\n"); - return 0; + return -EINVAL; } if (!cpumask_subset(dest, cpu_possible_mask)) { pr_warn("Reservation is not in possible_cpu_mask\n"); - return 0; + return -EINVAL; } nr_irqs = cpumask_weight(dest); if (!nr_irqs) { pr_warn("Reservation for empty destination mask\n"); - return 0; + return -EINVAL; } if (irq_domain_is_ipi_single(domain)) { @@ -72,18 +72,18 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain, next = cpumask_next(next, dest); if (next < nr_cpu_ids) { pr_warn("Destination mask has holes\n"); - return 0; + return -EINVAL; } } - virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL); if (virq <= 0) { pr_warn("Can't reserve IPI, failed to alloc descs\n"); - return 0; + return -ENOMEM; } virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, - (void *) dest, true); + (void *) dest, true, NULL); if (virq <= 0) { pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); @@ -100,17 +100,20 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain, free_descs: irq_free_descs(virq, nr_irqs); - return 0; + return -EBUSY; } /** * irq_destroy_ipi() - unreserve an IPI that was previously allocated * @irq: linux irq number to be destroyed + * @dest: cpumask of cpus which should have the IPI removed + * + * The IPIs allocated with irq_reserve_ipi() are retuerned to the system + * destroying all virqs associated with them. * - * Return the IPIs allocated with irq_reserve_ipi() to the system destroying - * all virqs associated with them. + * Return 0 on success or error code on failure. */ -void irq_destroy_ipi(unsigned int irq) +int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest) { struct irq_data *data = irq_get_irq_data(irq); struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; @@ -118,23 +121,33 @@ void irq_destroy_ipi(unsigned int irq) unsigned int nr_irqs; if (!irq || !data || !ipimask) - return; + return -EINVAL; domain = data->domain; if (WARN_ON(domain == NULL)) - return; + return -EINVAL; if (!irq_domain_is_ipi(domain)) { pr_warn("Trying to destroy a non IPI domain!\n"); - return; + return -EINVAL; } - if (irq_domain_is_ipi_per_cpu(domain)) - nr_irqs = cpumask_weight(ipimask); - else + if (WARN_ON(!cpumask_subset(dest, ipimask))) + /* + * Must be destroying a subset of CPUs to which this IPI + * was set up to target + */ + return -EINVAL; + + if (irq_domain_is_ipi_per_cpu(domain)) { + irq = irq + cpumask_first(dest) - data->common->ipi_offset; + nr_irqs = cpumask_weight(dest); + } else { nr_irqs = 1; + } irq_domain_free_irqs(irq, nr_irqs); + return 0; } /** diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0ccd028817d7..a623b44f2d4b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -68,9 +68,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) return 0; } -static void desc_smp_init(struct irq_desc *desc, int node) +static void desc_smp_init(struct irq_desc *desc, int node, + const struct cpumask *affinity) { - cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity); + if (!affinity) + affinity = irq_default_affinity; + cpumask_copy(desc->irq_common_data.affinity, affinity); + #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif @@ -82,11 +86,12 @@ static void desc_smp_init(struct irq_desc *desc, int node) #else static inline int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } -static inline void desc_smp_init(struct irq_desc *desc, int node) { } +static inline void +desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, - struct module *owner) + const struct cpumask *affinity, struct module *owner) { int cpu; @@ -107,7 +112,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->owner = owner; for_each_possible_cpu(cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; - desc_smp_init(desc, node); + desc_smp_init(desc, node, affinity); } int nr_irqs = NR_IRQS; @@ -158,7 +163,9 @@ void irq_unlock_sparse(void) mutex_unlock(&sparse_irq_lock); } -static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) +static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, + const struct cpumask *affinity, + struct module *owner) { struct irq_desc *desc; gfp_t gfp = GFP_KERNEL; @@ -178,7 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_rcu_head(&desc->rcu); - desc_set_defaults(irq, desc, node, owner); + desc_set_defaults(irq, desc, node, affinity, owner); + irqd_set(&desc->irq_data, flags); return desc; @@ -223,13 +231,32 @@ static void free_desc(unsigned int irq) } static int alloc_descs(unsigned int start, unsigned int cnt, int node, - struct module *owner) + const struct cpumask *affinity, struct module *owner) { + const struct cpumask *mask = NULL; struct irq_desc *desc; - int i; + unsigned int flags; + int i, cpu = -1; + + if (affinity && cpumask_empty(affinity)) + return -EINVAL; + + flags = affinity ? IRQD_AFFINITY_MANAGED : 0; for (i = 0; i < cnt; i++) { - desc = alloc_desc(start + i, node, owner); + if (affinity) { + cpu = cpumask_next(cpu, affinity); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(affinity); + node = cpu_to_node(cpu); + + /* + * For single allocations we use the caller provided + * mask otherwise we use the mask of the target cpu + */ + mask = cnt == 1 ? affinity : cpumask_of(cpu); + } + desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; mutex_lock(&sparse_irq_lock); @@ -277,7 +304,7 @@ int __init early_irq_init(void) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { - desc = alloc_desc(i, node, NULL); + desc = alloc_desc(i, node, 0, NULL, NULL); set_bit(i, allocated_irqs); irq_insert_desc(i, desc); } @@ -311,7 +338,7 @@ int __init early_irq_init(void) alloc_masks(&desc[i], GFP_KERNEL, node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - desc_set_defaults(i, &desc[i], node, NULL); + desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); } @@ -328,11 +355,12 @@ static void free_desc(unsigned int irq) unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL); + desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); raw_spin_unlock_irqrestore(&desc->lock, flags); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, + const struct cpumask *affinity, struct module *owner) { u32 i; @@ -453,12 +481,15 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) + * @affinity: Optional pointer to an affinity mask which hints where the + * irq descriptors should be allocated and which default + * affinities to use * * Returns the first irq number or error code */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner) + struct module *owner, const struct cpumask *affinity) { int start, ret; @@ -494,7 +525,7 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node, owner); + return alloc_descs(start, cnt, node, affinity, owner); err: mutex_unlock(&sparse_irq_lock); @@ -512,7 +543,7 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs); */ unsigned int irq_alloc_hwirqs(int cnt, int node) { - int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); + int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL); if (irq < 0) return 0; @@ -595,7 +626,8 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) chip_bus_sync_unlock(desc); } -int irq_set_percpu_devid(unsigned int irq) +int irq_set_percpu_devid_partition(unsigned int irq, + const struct cpumask *affinity) { struct irq_desc *desc = irq_to_desc(irq); @@ -610,10 +642,33 @@ int irq_set_percpu_devid(unsigned int irq) if (!desc->percpu_enabled) return -ENOMEM; + if (affinity) + desc->percpu_affinity = affinity; + else + desc->percpu_affinity = cpu_possible_mask; + irq_set_percpu_devid_flags(irq); return 0; } +int irq_set_percpu_devid(unsigned int irq) +{ + return irq_set_percpu_devid_partition(irq, NULL); +} + +int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc || !desc->percpu_enabled) + return -EINVAL; + + if (affinity) + cpumask_copy(affinity, desc->percpu_affinity); + + return 0; +} + void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3a519a01118b..4752b43662e0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -139,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_tree.height); + WARN_ON(!radix_tree_empty(&domain->revmap_tree)); list_del(&domain->link); @@ -243,14 +238,15 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, EXPORT_SYMBOL_GPL(irq_domain_add_legacy); /** - * irq_find_matching_fwnode() - Locates a domain for a given fwnode - * @fwnode: FW descriptor of the interrupt controller + * irq_find_matching_fwspec() - Locates a domain for a given fwspec + * @fwspec: FW specifier for an interrupt * @bus_token: domain-specific data */ -struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, +struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { struct irq_domain *h, *found = NULL; + struct fwnode_handle *fwnode = fwspec->fwnode; int rc; /* We might want to match the legacy controller last since @@ -264,7 +260,9 @@ struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, */ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { - if (h->ops->match) + if (h->ops->select && fwspec->param_count) + rc = h->ops->select(h, fwspec, bus_token); + else if (h->ops->match) rc = h->ops->match(h, to_of_node(fwnode), bus_token); else rc = ((fwnode != NULL) && (h->fwnode == fwnode) && @@ -279,7 +277,7 @@ struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, mutex_unlock(&irq_domain_mutex); return found; } -EXPORT_SYMBOL_GPL(irq_find_matching_fwnode); +EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); /** * irq_set_default_host() - Set a "default" irq domain @@ -483,7 +481,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -569,16 +567,15 @@ static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) { struct irq_domain *domain; + struct irq_data *irq_data; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; int virq; if (fwspec->fwnode) { - domain = irq_find_matching_fwnode(fwspec->fwnode, - DOMAIN_BUS_WIRED); + domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED); if (!domain) - domain = irq_find_matching_fwnode(fwspec->fwnode, - DOMAIN_BUS_ANY); + domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_ANY); } else { domain = irq_default_domain; } @@ -592,15 +589,46 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) if (irq_domain_translate(domain, fwspec, &hwirq, &type)) return 0; - if (irq_domain_is_hierarchy(domain)) { + /* + * WARN if the irqchip returns a type with bits + * outside the sense mask set and clear these bits. + */ + if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) + type &= IRQ_TYPE_SENSE_MASK; + + /* + * If we've already configured this interrupt, + * don't do it again, or hell will break loose. + */ + virq = irq_find_mapping(domain, hwirq); + if (virq) { + /* + * If the trigger type is not specified or matches the + * current trigger type then we are done so return the + * interrupt number. + */ + if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) + return virq; + /* - * If we've already configured this interrupt, - * don't do it again, or hell will break loose. + * If the trigger type has not been set yet, then set + * it now and return the interrupt number. */ - virq = irq_find_mapping(domain, hwirq); - if (virq) + if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { + irq_data = irq_get_irq_data(virq); + if (!irq_data) + return 0; + + irqd_set_trigger_type(irq_data, type); return virq; + } + pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", + hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); + return 0; + } + + if (irq_domain_is_hierarchy(domain)) { virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); if (virq <= 0) return 0; @@ -611,10 +639,18 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) return virq; } - /* Set type if specified and different than the current one */ - if (type != IRQ_TYPE_NONE && - type != irq_get_trigger_type(virq)) - irq_set_irq_type(virq, type); + irq_data = irq_get_irq_data(virq); + if (!irq_data) { + if (irq_domain_is_hierarchy(domain)) + irq_domain_free_irqs(virq, 1); + else + irq_dispose_mapping(virq); + return 0; + } + + /* Store trigger type */ + irqd_set_trigger_type(irq_data, type); + return virq; } EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); @@ -644,8 +680,12 @@ void irq_dispose_mapping(unsigned int virq) if (WARN_ON(domain == NULL)) return; - irq_domain_disassociate(domain, virq); - irq_free_desc(virq); + if (irq_domain_is_hierarchy(domain)) { + irq_domain_free_irqs(virq, 1); + } else { + irq_domain_disassociate(domain, virq); + irq_free_desc(virq); + } } EXPORT_SYMBOL_GPL(irq_dispose_mapping); @@ -839,19 +879,23 @@ const struct irq_domain_ops irq_domain_simple_ops = { EXPORT_SYMBOL_GPL(irq_domain_simple_ops); int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, - int node) + int node, const struct cpumask *affinity) { unsigned int hint; if (virq >= 0) { - virq = irq_alloc_descs(virq, virq, cnt, node); + virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE, + affinity); } else { hint = hwirq % nr_irqs; if (hint == 0) hint++; - virq = irq_alloc_descs_from(hint, cnt, node); - if (virq <= 0 && hint > 1) - virq = irq_alloc_descs_from(1, cnt, node); + virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE, + affinity); + if (virq <= 0 && hint > 1) { + virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE, + affinity); + } } return virq; @@ -1099,6 +1143,7 @@ void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, } irq_domain_free_irqs_parent(domain, virq, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_common); /** * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent @@ -1147,8 +1192,10 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, if (recursive) ret = irq_domain_alloc_irqs_recursive(parent, irq_base, nr_irqs, arg); - if (ret >= 0) - ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); + if (ret < 0) + return ret; + + ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); if (ret < 0 && recursive) irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); @@ -1163,6 +1210,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, * @node: NUMA node id for memory allocation * @arg: domain specific argument * @realloc: IRQ descriptors have already been allocated if true + * @affinity: Optional irq affinity mask for multiqueue devices * * Allocate IRQ numbers and initialized all data structures to support * hierarchy IRQ domains. @@ -1178,7 +1226,7 @@ int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, */ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, unsigned int nr_irqs, int node, void *arg, - bool realloc) + bool realloc, const struct cpumask *affinity) { int i, ret, virq; @@ -1196,7 +1244,8 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, if (realloc && irq_base >= 0) { virq = irq_base; } else { - virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); + virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node, + affinity); if (virq < 0) { pr_debug("cannot allocate IRQ(base %d, count %d)\n", irq_base, nr_irqs); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index cc1cc641d653..9530fcd27704 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -115,12 +115,12 @@ EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; -static int __irq_can_set_affinity(struct irq_desc *desc) +static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) - return 0; - return 1; + return false; + return true; } /** @@ -134,6 +134,21 @@ int irq_can_set_affinity(unsigned int irq) } /** + * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space + * @irq: Interrupt to check + * + * Like irq_can_set_affinity() above, but additionally checks for the + * AFFINITY_MANAGED flag. + */ +bool irq_can_set_affinity_usr(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return __irq_can_set_affinity(desc) && + !irqd_affinity_is_managed(&desc->irq_data); +} + +/** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affitnity changed * @@ -338,10 +353,11 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) return 0; /* - * Preserve an userspace affinity setup, but make sure that - * one of the targets is online. + * Preserve the managed affinity setting and an userspace affinity + * setup, but make sure that one of the targets is online. */ - if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { + if (irqd_affinity_is_managed(&desc->irq_data) || + irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) set = desc->irq_common_data.affinity; @@ -1117,6 +1133,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->irq = irq; /* + * If the trigger type is not specified by the caller, + * then use the default for this interrupt. + */ + if (!(new->flags & IRQF_TRIGGER_MASK)) + new->flags |= irqd_get_trigger_type(&desc->irq_data); + + /* * Check whether the interrupt nests into another interrupt * thread. */ @@ -1407,12 +1430,20 @@ int setup_irq(unsigned int irq, struct irqaction *act) int retval; struct irq_desc *desc = irq_to_desc(irq); - if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) + if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); + if (retval) + irq_chip_pm_put(&desc->irq_data); + return retval; } EXPORT_SYMBOL_GPL(setup_irq); @@ -1506,6 +1537,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); return action; @@ -1648,11 +1680,18 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->dev_id = dev_id; + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) { + kfree(action); + return retval; + } + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(desc); if (retval) { + irq_chip_pm_put(&desc->irq_data); kfree(action->secondary); kfree(action); } @@ -1730,7 +1769,14 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) if (!desc) return; + /* + * If the trigger type is not specified by the caller, then + * use the default for this interrupt. + */ type &= IRQ_TYPE_SENSE_MASK; + if (type == IRQ_TYPE_NONE) + type = irqd_get_trigger_type(&desc->irq_data); + if (type != IRQ_TYPE_NONE) { int ret; @@ -1822,6 +1868,7 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ unregister_handler_proc(irq, action); + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; @@ -1884,10 +1931,18 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) if (!desc || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + return retval; + chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); + if (retval) + irq_chip_pm_put(&desc->irq_data); + return retval; } @@ -1931,12 +1986,20 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->percpu_dev_id = dev_id; + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) { + kfree(action); + return retval; + } + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(desc); - if (retval) + if (retval) { + irq_chip_pm_put(&desc->irq_data); kfree(action); + } return retval; } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 38e89ce7b071..19e9dfbe97fa 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; msi_alloc_info_t arg; struct msi_desc *desc; - int i, ret, virq = -1; + int i, ret, virq; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) @@ -332,13 +332,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, for_each_msi_entry(desc, dev) { ops->set_desc(&arg, desc); - if (info->flags & MSI_FLAG_IDENTITY_MAP) - virq = (int)ops->get_hwirq(info, &arg); - else - virq = -1; - virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, - dev_to_node(dev), &arg, false); + virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, + dev_to_node(dev), &arg, false, + desc->affinity); if (virq < 0) { ret = -ENOSPC; if (ops->handle_error) @@ -356,11 +353,23 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ops->msi_finish(&arg, 0); for_each_msi_entry(desc, dev) { + virq = desc->irq; if (desc->nvec_used == 1) dev_dbg(dev, "irq %d for MSI\n", virq); else dev_dbg(dev, "irq [%d-%d] for MSI\n", virq, virq + desc->nvec_used - 1); + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. + */ + if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { + struct irq_data *irq_data; + + irq_data = irq_domain_get_irq_data(domain, desc->irq); + irq_domain_activate_irq(irq_data); + } } return 0; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4e1b94726818..feaa813b84a9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -96,7 +96,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, cpumask_var_t new_value; int err; - if (!irq_can_set_affinity(irq) || no_irq_affinity) + if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) @@ -311,7 +311,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) !name_unique(irq, action)) return; - memset(name, 0, MAX_NAMELEN); snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ @@ -340,7 +339,6 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) if (desc->dir) goto out_unlock; - memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ @@ -386,7 +384,6 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) #endif remove_proc_entry("spurious", desc->dir); - memset(name, 0, MAX_NAMELEN); sprintf(name, "%u", irq); remove_proc_entry(name, root_irq_dir); } @@ -421,12 +418,8 @@ void init_irq_proc(void) /* * Create entries for all existing IRQs. */ - for_each_irq_desc(irq, desc) { - if (!desc) - continue; - + for_each_irq_desc(irq, desc) register_irq_proc(irq, desc); - } } #ifdef CONFIG_GENERIC_IRQ_SHOW diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..93ad6c1fb9b6 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -14,6 +14,7 @@ #include <linux/err.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> +#include <linux/bug.h> #ifdef HAVE_JUMP_LABEL @@ -56,15 +57,81 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) static void jump_label_update(struct static_key *key); +/* + * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h. + * The use of 'atomic_read()' requires atomic.h and its problematic for some + * kernel headers such as kernel.h and others. Since static_key_count() is not + * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok + * to have it be a function here. Similarly, for 'static_key_enable()' and + * 'static_key_disable()', which require bug.h. This should allow jump_label.h + * to be included from most/all places for HAVE_JUMP_LABEL. + */ +int static_key_count(struct static_key *key) +{ + /* + * -1 means the first static_key_slow_inc() is in progress. + * static_key_enabled() must return true, so return 1 here. + */ + int n = atomic_read(&key->enabled); + + return n >= 0 ? n : 1; +} +EXPORT_SYMBOL_GPL(static_key_count); + +void static_key_enable(struct static_key *key) +{ + int count = static_key_count(key); + + WARN_ON_ONCE(count < 0 || count > 1); + + if (!count) + static_key_slow_inc(key); +} +EXPORT_SYMBOL_GPL(static_key_enable); + +void static_key_disable(struct static_key *key) +{ + int count = static_key_count(key); + + WARN_ON_ONCE(count < 0 || count > 1); + + if (count) + static_key_slow_dec(key); +} +EXPORT_SYMBOL_GPL(static_key_disable); + void static_key_slow_inc(struct static_key *key) { + int v, v1; + STATIC_KEY_CHECK_USE(); - if (atomic_inc_not_zero(&key->enabled)) - return; + + /* + * Careful if we get concurrent static_key_slow_inc() calls; + * later calls must wait for the first one to _finish_ the + * jump_label_update() process. At the same time, however, + * the jump_label_update() call below wants to see + * static_key_enabled(&key) for jumps to be updated properly. + * + * So give a special meaning to negative key->enabled: it sends + * static_key_slow_inc() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). Note that + * atomic_inc_unless_negative() checks >= 0, so roll our own. + */ + for (v = atomic_read(&key->enabled); v > 0; v = v1) { + v1 = atomic_cmpxchg(&key->enabled, v, v + 1); + if (likely(v1 == v)) + return; + } jump_label_lock(); - if (atomic_inc_return(&key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); jump_label_update(key); + atomic_set(&key->enabled, 1); + } else { + atomic_inc(&key->enabled); + } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -72,6 +139,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + /* + * The negative count check is valid even when a negative + * key->enabled is in use by static_key_slow_inc(); a + * __static_key_slow_dec() before the first static_key_slow_inc() + * returns is unbalanced, because all other static_key_slow_inc() + * instances block while the update is in progress. + */ if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); @@ -205,6 +279,18 @@ void __init jump_label_init(void) struct static_key *key = NULL; struct jump_entry *iter; + /* + * Since we are initializing the static_key.enabled field with + * with the 'raw' int values (to avoid pulling in atomic.h) in + * jump_label.h, let's make sure that is safe. There are only two + * cases to check since we initialize to 0 or 1. + */ + BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); + BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); + + if (static_key_initialized) + return; + jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); @@ -254,11 +340,14 @@ static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; + preempt_disable(); mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + preempt_enable(); + if (!mod) return 0; - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); return __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, @@ -422,7 +511,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, return notifier_from_errno(ret); } -struct notifier_block jump_label_module_nb = { +static struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, .priority = 1, /* higher than tracepoints */ }; diff --git a/kernel/kcov.c b/kernel/kcov.c index a02f2dddd1d7..8d44b3fea9d0 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = { static int __init kcov_init(void) { - if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { + /* + * The kcov debugfs file won't ever get removed and thus, + * there is no need to protect it against removal races. The + * use of debugfs_create_file_unsafe() is actually safe here. + */ + if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) { pr_err("failed to create kcov in debugfs\n"); return -ENOMEM; } diff --git a/kernel/kexec.c b/kernel/kexec.c index ee70aef5cd81..980936a90ee6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -48,7 +48,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, if (kexec_on_panic) { /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) + if ((entry < phys_to_boot_phys(crashk_res.start)) || + (entry > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } @@ -103,6 +104,65 @@ out_free_image: return ret; } +static int do_kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment __user *segments, unsigned long flags) +{ + struct kimage **dest_image, *image; + unsigned long i; + int ret; + + if (flags & KEXEC_ON_CRASH) { + dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } else { + dest_image = &kexec_image; + } + + if (nr_segments == 0) { + /* Uninstall image */ + kimage_free(xchg(dest_image, NULL)); + return 0; + } + if (flags & KEXEC_ON_CRASH) { + /* + * Loading another kernel to switch to if this one + * crashes. Free any current crash dump kernel before + * we corrupt it. + */ + kimage_free(xchg(&kexec_crash_image, NULL)); + } + + ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags); + if (ret) + return ret; + + if (flags & KEXEC_PRESERVE_CONTEXT) + image->preserve_context = 1; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + for (i = 0; i < nr_segments; i++) { + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* Install the new kernel and uninstall the old */ + image = xchg(dest_image, image); + +out: + if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + + kimage_free(image); + return ret; +} + /* * Exec Kernel system call: for obvious reasons only root may call it. * @@ -127,7 +187,6 @@ out_free_image: SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) { - struct kimage **dest_image, *image; int result; /* We only trust the superuser with rebooting the system. */ @@ -152,9 +211,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (nr_segments > KEXEC_SEGMENT_MAX) return -EINVAL; - image = NULL; - result = 0; - /* Because we write directly to the reserved memory * region when loading crash kernels we need a mutex here to * prevent multiple crash kernels from attempting to load @@ -166,53 +222,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (!mutex_trylock(&kexec_mutex)) return -EBUSY; - dest_image = &kexec_image; - if (flags & KEXEC_ON_CRASH) - dest_image = &kexec_crash_image; - if (nr_segments > 0) { - unsigned long i; - - if (flags & KEXEC_ON_CRASH) { - /* - * Loading another kernel to switch to if this one - * crashes. Free any current crash dump kernel before - * we corrupt it. - */ - - kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_alloc_init(&image, entry, nr_segments, - segments, flags); - crash_map_reserved_pages(); - } else { - /* Loading another kernel to reboot into. */ - - result = kimage_alloc_init(&image, entry, nr_segments, - segments, flags); - } - if (result) - goto out; - - if (flags & KEXEC_PRESERVE_CONTEXT) - image->preserve_context = 1; - result = machine_kexec_prepare(image); - if (result) - goto out; - - for (i = 0; i < nr_segments; i++) { - result = kimage_load_segment(image, &image->segment[i]); - if (result) - goto out; - } - kimage_terminate(image); - if (flags & KEXEC_ON_CRASH) - crash_unmap_reserved_pages(); - } - /* Install the new kernel, and Uninstall the old */ - image = xchg(dest_image, image); + result = do_kexec_load(entry, nr_segments, segments, flags); -out: mutex_unlock(&kexec_mutex); - kimage_free(image); return result; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1391d3ee3b86..561675589511 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -95,6 +95,12 @@ int kexec_should_crash(struct task_struct *p) return 0; } +int kexec_crash_loaded(void) +{ + return !!kexec_crash_image; +} +EXPORT_SYMBOL_GPL(kexec_crash_loaded); + /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors @@ -140,6 +146,7 @@ int kexec_should_crash(struct task_struct *p) * allocating pages whose destination address we do not care about. */ #define KIMAGE_NO_DEST (-1UL) +#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, @@ -147,8 +154,9 @@ static struct page *kimage_alloc_page(struct kimage *image, int sanity_check_segment_list(struct kimage *image) { - int result, i; + int i; unsigned long nr_segments = image->nr_segments; + unsigned long total_pages = 0; /* * Verify we have good destination addresses. The caller is @@ -163,16 +171,17 @@ int sanity_check_segment_list(struct kimage *image) * simply because addresses are changed to page size * granularity. */ - result = -EADDRNOTAVAIL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; + if (mstart > mend) + return -EADDRNOTAVAIL; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - return result; + return -EADDRNOTAVAIL; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - return result; + return -EADDRNOTAVAIL; } /* Verify our destination addresses do not overlap. @@ -180,7 +189,6 @@ int sanity_check_segment_list(struct kimage *image) * through very weird things can happen with no * easy explanation as one segment stops on another. */ - result = -EINVAL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; unsigned long j; @@ -194,7 +202,7 @@ int sanity_check_segment_list(struct kimage *image) pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - return result; + return -EINVAL; } } @@ -203,12 +211,26 @@ int sanity_check_segment_list(struct kimage *image) * and it is easier to check up front than to be surprised * later on. */ - result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - return result; + return -EINVAL; + } + + /* + * Verify that no more than half of memory will be consumed. If the + * request from userspace is too large, a large amount of time will be + * wasted allocating pages, which can cause a soft lockup. + */ + for (i = 0; i < nr_segments; i++) { + if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2) + return -EINVAL; + + total_pages += PAGE_COUNT(image->segment[i].memsz); } + if (total_pages > totalram_pages / 2) + return -EINVAL; + /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't @@ -220,16 +242,15 @@ int sanity_check_segment_list(struct kimage *image) */ if (image->type == KEXEC_TYPE_CRASH) { - result = -EADDRNOTAVAIL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || - (mend > crashk_res.end)) - return result; + if ((mstart < phys_to_boot_phys(crashk_res.start)) || + (mend > phys_to_boot_phys(crashk_res.end))) + return -EADDRNOTAVAIL; } } @@ -352,7 +373,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; - pfn = page_to_pfn(pages); + pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = epfn << PAGE_SHIFT; @@ -478,7 +499,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) return -ENOMEM; ind_page = page_address(page); - *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); @@ -533,13 +554,13 @@ void kimage_terminate(struct kimage *image) #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION) ? \ - phys_to_virt((entry & PAGE_MASK)) : ptr + 1) + boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; - page = pfn_to_page(entry >> PAGE_SHIFT); + page = boot_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } @@ -633,7 +654,7 @@ static struct page *kimage_alloc_page(struct kimage *image, * have a match. */ list_for_each_entry(page, &image->dest_pages, lru) { - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = page_to_boot_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; @@ -648,12 +669,12 @@ static struct page *kimage_alloc_page(struct kimage *image, if (!page) return NULL; /* If the page cannot be used file it away */ - if (page_to_pfn(page) > + if (page_to_boot_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unusable_pages); continue; } - addr = page_to_pfn(page) << PAGE_SHIFT; + addr = page_to_boot_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) @@ -676,7 +697,7 @@ static struct page *kimage_alloc_page(struct kimage *image, struct page *old_page; old_addr = *old & PAGE_MASK; - old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); @@ -732,7 +753,7 @@ static int kimage_load_normal_segment(struct kimage *image, result = -ENOMEM; goto out; } - result = kimage_add_page(image, page_to_pfn(page) + result = kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; @@ -793,7 +814,7 @@ static int kimage_load_crash_segment(struct kimage *image, char *ptr; size_t uchunk, mchunk; - page = pfn_to_page(maddr >> PAGE_SHIFT); + page = boot_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; @@ -893,6 +914,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ + printk_nmi_flush_on_panic(); __crash_kexec(regs); /* @@ -920,7 +942,7 @@ void __weak crash_free_reserved_phys_range(unsigned long begin, unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); + free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) @@ -953,7 +975,6 @@ int crash_shrink_memory(unsigned long new_size) start = roundup(start, KEXEC_CRASH_MEM_ALIGN); end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - crash_map_reserved_pages(); crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) @@ -967,7 +988,6 @@ int crash_shrink_memory(unsigned long new_size) crashk_res.end = end - 1; insert_resource(&iomem_resource, ram_res); - crash_unmap_reserved_pages(); unlock: mutex_unlock(&kexec_mutex); @@ -1375,7 +1395,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) void __weak arch_crash_save_vmcoreinfo(void) {} -unsigned long __weak paddr_vmcoreinfo_note(void) +phys_addr_t __weak paddr_vmcoreinfo_note(void) { return __pa((unsigned long)(char *)&vmcoreinfo_note); } @@ -1410,7 +1430,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_STRUCT_SIZE(list_head); VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, _refcount); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); @@ -1552,13 +1572,14 @@ int kernel_kexec(void) } /* - * Add and remove page tables for crashkernel memory + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. * * Provide an empty default implementation here -- architecture * code may override this */ -void __weak crash_map_reserved_pages(void) +void __weak arch_kexec_protect_crashkres(void) {} -void __weak crash_unmap_reserved_pages(void) +void __weak arch_kexec_unprotect_crashkres(void) {} diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c72d2ff5896e..037c321c5618 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -274,8 +274,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, return -EBUSY; dest_image = &kexec_image; - if (flags & KEXEC_FILE_ON_CRASH) + if (flags & KEXEC_FILE_ON_CRASH) { dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } if (flags & KEXEC_FILE_UNLOAD) goto exchange; @@ -324,6 +327,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, exchange: image = xchg(dest_image, image); out: + if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + mutex_unlock(&kexec_mutex); kimage_free(image); return ret; @@ -881,7 +887,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, return 0; out: vfree(pi->sechdrs); + pi->sechdrs = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; return ret; } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 152da4a48867..ee1bc1bb8feb 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -101,7 +101,7 @@ KERNEL_ATTR_RO(kexec_loaded); static ssize_t kexec_crash_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", !!kexec_crash_image); + return sprintf(buf, "%d\n", kexec_crash_loaded()); } KERNEL_ATTR_RO(kexec_crash_loaded); @@ -128,8 +128,8 @@ KERNEL_ATTR_RW(kexec_crash_size); static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lx %x\n", - paddr_vmcoreinfo_note(), + phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); + return sprintf(buf, "%pa %x\n", &vmcore_base, (unsigned int)sizeof(vmcoreinfo_note)); } KERNEL_ATTR_RO(vmcoreinfo); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index d68fbf63b083..8bbe50704621 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -28,6 +28,8 @@ #include <linux/list.h> #include <linux/kallsyms.h> #include <linux/livepatch.h> +#include <linux/elf.h> +#include <linux/moduleloader.h> #include <asm/cacheflush.h> /** @@ -204,76 +206,110 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -/* - * external symbols are located outside the parent object (where the parent - * object is either vmlinux or the kmod being patched). - */ -static int klp_find_external_symbol(struct module *pmod, const char *name, - unsigned long *addr) +static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) { - const struct kernel_symbol *sym; - - /* first, check if it's an exported symbol */ - preempt_disable(); - sym = find_symbol(name, NULL, NULL, true, true); - if (sym) { - *addr = sym->value; - preempt_enable(); - return 0; - } - preempt_enable(); + int i, cnt, vmlinux, ret; + char objname[MODULE_NAME_LEN]; + char symname[KSYM_NAME_LEN]; + char *strtab = pmod->core_kallsyms.strtab; + Elf_Rela *relas; + Elf_Sym *sym; + unsigned long sympos, addr; /* - * Check if it's in another .o within the patch module. This also - * checks that the external symbol is unique. + * Since the field widths for objname and symname in the sscanf() + * call are hard-coded and correspond to MODULE_NAME_LEN and + * KSYM_NAME_LEN respectively, we must make sure that MODULE_NAME_LEN + * and KSYM_NAME_LEN have the values we expect them to have. + * + * Because the value of MODULE_NAME_LEN can differ among architectures, + * we use the smallest/strictest upper bound possible (56, based on + * the current definition of MODULE_NAME_LEN) to prevent overflows. */ - return klp_find_object_symbol(pmod->name, name, 0, addr); + BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); + + relas = (Elf_Rela *) relasec->sh_addr; + /* For each rela in this klp relocation section */ + for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { + sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); + if (sym->st_shndx != SHN_LIVEPATCH) { + pr_err("symbol %s is not marked as a livepatch symbol", + strtab + sym->st_name); + return -EINVAL; + } + + /* Format: .klp.sym.objname.symname,sympos */ + cnt = sscanf(strtab + sym->st_name, + ".klp.sym.%55[^.].%127[^,],%lu", + objname, symname, &sympos); + if (cnt != 3) { + pr_err("symbol %s has an incorrectly formatted name", + strtab + sym->st_name); + return -EINVAL; + } + + /* klp_find_object_symbol() treats a NULL objname as vmlinux */ + vmlinux = !strcmp(objname, "vmlinux"); + ret = klp_find_object_symbol(vmlinux ? NULL : objname, + symname, sympos, &addr); + if (ret) + return ret; + + sym->st_value = addr; + } + + return 0; } static int klp_write_object_relocations(struct module *pmod, struct klp_object *obj) { - int ret = 0; - unsigned long val; - struct klp_reloc *reloc; + int i, cnt, ret = 0; + const char *objname, *secname; + char sec_objname[MODULE_NAME_LEN]; + Elf_Shdr *sec; if (WARN_ON(!klp_is_object_loaded(obj))) return -EINVAL; - if (WARN_ON(!obj->relocs)) - return -EINVAL; + objname = klp_is_module(obj) ? obj->name : "vmlinux"; module_disable_ro(pmod); + /* For each klp relocation section */ + for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { + sec = pmod->klp_info->sechdrs + i; + secname = pmod->klp_info->secstrings + sec->sh_name; + if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) + continue; - for (reloc = obj->relocs; reloc->name; reloc++) { - /* discover the address of the referenced symbol */ - if (reloc->external) { - if (reloc->sympos > 0) { - pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n", - reloc->name); - ret = -EINVAL; - goto out; - } - ret = klp_find_external_symbol(pmod, reloc->name, &val); - } else - ret = klp_find_object_symbol(obj->name, - reloc->name, - reloc->sympos, - &val); + /* + * Format: .klp.rela.sec_objname.section_name + * See comment in klp_resolve_symbols() for an explanation + * of the selected field width value. + */ + cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); + if (cnt != 1) { + pr_err("section %s has an incorrectly formatted name", + secname); + ret = -EINVAL; + break; + } + + if (strcmp(objname, sec_objname)) + continue; + + ret = klp_resolve_symbols(sec, pmod); if (ret) - goto out; + break; - ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, - val + reloc->addend); - if (ret) { - pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", - reloc->name, val, ret); - goto out; - } + ret = apply_relocate_add(pmod->klp_info->sechdrs, + pmod->core_kallsyms.strtab, + pmod->klp_info->symndx, i, pmod); + if (ret) + break; } -out: - module_enable_ro(pmod); + module_enable_ro(pmod, true); return ret; } @@ -298,6 +334,19 @@ unlock: rcu_read_unlock(); } +/* + * Convert a function address into the appropriate ftrace location. + * + * Usually this is just the address of the function, but on some architectures + * it's more complicated so allow them to provide a custom behaviour. + */ +#ifndef klp_get_ftrace_location +static unsigned long klp_get_ftrace_location(unsigned long faddr) +{ + return faddr; +} +#endif + static void klp_disable_func(struct klp_func *func) { struct klp_ops *ops; @@ -312,8 +361,14 @@ static void klp_disable_func(struct klp_func *func) return; if (list_is_singular(&ops->func_stack)) { + unsigned long ftrace_loc; + + ftrace_loc = klp_get_ftrace_location(func->old_addr); + if (WARN_ON(!ftrace_loc)) + return; + WARN_ON(unregister_ftrace_function(&ops->fops)); - WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0)); + WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0)); list_del_rcu(&func->stack_node); list_del(&ops->node); @@ -338,6 +393,15 @@ static int klp_enable_func(struct klp_func *func) ops = klp_find_ops(func->old_addr); if (!ops) { + unsigned long ftrace_loc; + + ftrace_loc = klp_get_ftrace_location(func->old_addr); + if (!ftrace_loc) { + pr_err("failed to find location for function '%s'\n", + func->old_name); + return -EINVAL; + } + ops = kzalloc(sizeof(*ops), GFP_KERNEL); if (!ops) return -ENOMEM; @@ -352,7 +416,7 @@ static int klp_enable_func(struct klp_func *func) INIT_LIST_HEAD(&ops->func_stack); list_add_rcu(&func->stack_node, &ops->func_stack); - ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0); + ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0); if (ret) { pr_err("failed to set ftrace filter for function '%s' (%d)\n", func->old_name, ret); @@ -363,7 +427,7 @@ static int klp_enable_func(struct klp_func *func) if (ret) { pr_err("failed to register ftrace handler for function '%s' (%d)\n", func->old_name, ret); - ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); + ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0); goto err; } @@ -683,6 +747,9 @@ static void klp_free_patch(struct klp_patch *patch) static int klp_init_func(struct klp_object *obj, struct klp_func *func) { + if (!func->old_name || !func->new_func) + return -EINVAL; + INIT_LIST_HEAD(&func->stack_node); func->state = KLP_DISABLED; @@ -703,11 +770,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; - if (obj->relocs) { - ret = klp_write_object_relocations(patch->mod, obj); - if (ret) - return ret; - } + ret = klp_write_object_relocations(patch->mod, obj); + if (ret) + return ret; klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, @@ -842,12 +907,18 @@ int klp_register_patch(struct klp_patch *patch) { int ret; - if (!klp_initialized()) - return -ENODEV; - if (!patch || !patch->mod) return -EINVAL; + if (!is_livepatch_module(patch->mod)) { + pr_err("module %s is not marked as a livepatch module", + patch->mod->name); + return -EINVAL; + } + + if (!klp_initialized()) + return -ENODEV; + /* * A reference is taken on the patch module to prevent it from being * unloaded. Right now, we don't allow patch modules to unload since diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 78c1c0ee6dc1..589d763a49b3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -45,6 +45,8 @@ #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/kmemcheck.h> +#include <linux/random.h> +#include <linux/jhash.h> #include <asm/sections.h> @@ -308,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE]; * It's a 64-bit hash, because it's important for the keys to be * unique. */ -#define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ - (key2)) +static inline u64 iterate_chain_key(u64 key, u32 idx) +{ + u32 k0 = key, k1 = key >> 32; + + __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */ + + return k0 | (u64)k1 << 32; +} void lockdep_off(void) { @@ -708,7 +714,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * yet. Otherwise we look it up. We cache the result in the lock object * itself, so actual lookup of the hash should be once per lock object. */ -static inline struct lock_class * +static struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; @@ -3585,7 +3591,35 @@ static int __lock_is_held(struct lockdep_map *lock) return 0; } -static void __lock_pin_lock(struct lockdep_map *lock) +static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) +{ + struct pin_cookie cookie = NIL_COOKIE; + struct task_struct *curr = current; + int i; + + if (unlikely(!debug_locks)) + return cookie; + + for (i = 0; i < curr->lockdep_depth; i++) { + struct held_lock *hlock = curr->held_locks + i; + + if (match_held_lock(hlock, lock)) { + /* + * Grab 16bits of randomness; this is sufficient to not + * be guessable and still allows some pin nesting in + * our u32 pin_count. + */ + cookie.val = 1 + (prandom_u32() >> 16); + hlock->pin_count += cookie.val; + return cookie; + } + } + + WARN(1, "pinning an unheld lock\n"); + return cookie; +} + +static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { struct task_struct *curr = current; int i; @@ -3597,7 +3631,7 @@ static void __lock_pin_lock(struct lockdep_map *lock) struct held_lock *hlock = curr->held_locks + i; if (match_held_lock(hlock, lock)) { - hlock->pin_count++; + hlock->pin_count += cookie.val; return; } } @@ -3605,7 +3639,7 @@ static void __lock_pin_lock(struct lockdep_map *lock) WARN(1, "pinning an unheld lock\n"); } -static void __lock_unpin_lock(struct lockdep_map *lock) +static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { struct task_struct *curr = current; int i; @@ -3620,7 +3654,11 @@ static void __lock_unpin_lock(struct lockdep_map *lock) if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n")) return; - hlock->pin_count--; + hlock->pin_count -= cookie.val; + + if (WARN((int)hlock->pin_count < 0, "pin count corrupted\n")) + hlock->pin_count = 0; + return; } } @@ -3751,24 +3789,44 @@ int lock_is_held(struct lockdep_map *lock) } EXPORT_SYMBOL_GPL(lock_is_held); -void lock_pin_lock(struct lockdep_map *lock) +struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { + struct pin_cookie cookie = NIL_COOKIE; unsigned long flags; if (unlikely(current->lockdep_recursion)) - return; + return cookie; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - __lock_pin_lock(lock); + cookie = __lock_pin_lock(lock); current->lockdep_recursion = 0; raw_local_irq_restore(flags); + + return cookie; } EXPORT_SYMBOL_GPL(lock_pin_lock); -void lock_unpin_lock(struct lockdep_map *lock) +void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + __lock_repin_lock(lock, cookie); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_repin_lock); + +void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; @@ -3779,7 +3837,7 @@ void lock_unpin_lock(struct lockdep_map *lock) check_flags(flags); current->lockdep_recursion = 1; - __lock_unpin_lock(lock); + __lock_unpin_lock(lock, cookie); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ef1919d63b2..f8c5af52a131 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -75,12 +75,7 @@ struct lock_stress_stats { long n_lock_acquired; }; -#if defined(MODULE) -#define LOCKTORTURE_RUNNABLE_INIT 1 -#else -#define LOCKTORTURE_RUNNABLE_INIT 0 -#endif -int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; +int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); @@ -394,12 +389,12 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) if (!rt_task(current)) { /* - * (1) Boost priority once every ~50k operations. When the + * Boost priority once every ~50k operations. When the * task tries to take the lock, the rtmutex it will account * for the new priority, and do any corresponding pi-dance. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * factor))) { + if (trsp && !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { policy = SCHED_FIFO; param.sched_priority = MAX_RT_PRIO - 1; } else /* common case, do nothing */ @@ -748,6 +743,15 @@ static void lock_torture_cleanup(void) if (torture_cleanup_begin()) return; + /* + * Indicates early cleanup, meaning that the test has not run, + * such as when passing bogus args when loading the module. As + * such, only perform the underlying torture-specific cleanups, + * and avoid anything related to locktorture. + */ + if (!cxt.lwsa) + goto end; + if (writer_tasks) { for (i = 0; i < cxt.nrealwriters_stress; i++) torture_stop_kthread(lock_torture_writer, @@ -776,6 +780,7 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); +end: torture_cleanup_end(); } @@ -870,6 +875,7 @@ static int __init lock_torture_init(void) VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; kfree(cxt.lwsa); + cxt.lwsa = NULL; goto unwind; } @@ -878,6 +884,7 @@ static int __init lock_torture_init(void) cxt.lrsa[i].n_lock_acquired = 0; } } + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; + task->blocked_on = waiter; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(waiter->task != task); + DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); + task->blocked_on = NULL; list_del_init(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..57a871ae3c81 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -20,21 +20,21 @@ extern void debug_mutex_wake_waiter(struct mutex *lock, extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #define spin_lock_mutex(lock, flags) \ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index e364b424b019..a70b90db3909 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) if (!hold_ctx) return 0; - if (unlikely(ctx == hold_ctx)) - return -EALREADY; - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { #ifdef CONFIG_DEBUG_MUTEXES @@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, unsigned long flags; int ret; + if (use_ww_ctx) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) + return -EALREADY; + } + preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); @@ -534,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto skip_wait; debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_add_waiter(lock, &waiter, task); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -581,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_remove_waiter(lock, &waiter, task); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -602,7 +605,7 @@ skip_wait: return 0; err: - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..6cd6b8e9efd7 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -13,18 +13,24 @@ do { spin_lock(lock); (void)(flags); } while (0) #define spin_unlock_mutex(lock, flags) \ do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ +#define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * The mutex owner can get read and written to locklessly. + * We should use WRITE_ONCE when writing the owner value to + * avoid store tearing, otherwise, a thread could potentially + * read a partially written and incomplete owner value. + */ static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #else static inline void mutex_set_owner(struct mutex *lock) diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f231e0bb311c..bec0b647f9cc 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } +EXPORT_SYMBOL_GPL(percpu_free_rwsem); /* * This is the fast-path for down_read/up_read. If it succeeds we rely diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fec082338668..19248ddf37ce 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. */ - cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); rspin_until_writer_unlock(lock, cnts); /* diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ce2f75e32ae1..b2caec7315af 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); * therefore increment the cpu number by one. */ -static inline u32 encode_tail(int cpu, int idx) +static inline __pure u32 encode_tail(int cpu, int idx) { u32 tail; @@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx) return tail; } -static inline struct mcs_spinlock *decode_tail(u32 tail) +static inline __pure struct mcs_spinlock *decode_tail(u32 tail) { int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; @@ -267,6 +267,123 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif +/* + * Various notes on spin_is_locked() and spin_unlock_wait(), which are + * 'interesting' functions: + * + * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE + * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, + * PPC). Also qspinlock has a similar issue per construction, the setting of + * the locked byte can be unordered acquiring the lock proper. + * + * This gets to be 'interesting' in the following cases, where the /should/s + * end up false because of this issue. + * + * + * CASE 1: + * + * So the spin_is_locked() correctness issue comes from something like: + * + * CPU0 CPU1 + * + * global_lock(); local_lock(i) + * spin_lock(&G) spin_lock(&L[i]) + * for (i) if (!spin_is_locked(&G)) { + * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); + * return; + * } + * // deal with fail + * + * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such + * that there is exclusion between the two critical sections. + * + * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from + * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) + * /should/ be constrained by the ACQUIRE from spin_lock(&G). + * + * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. + * + * + * CASE 2: + * + * For spin_unlock_wait() there is a second correctness issue, namely: + * + * CPU0 CPU1 + * + * flag = set; + * smp_mb(); spin_lock(&l) + * spin_unlock_wait(&l); if (!flag) + * // add to lockless list + * spin_unlock(&l); + * // iterate lockless list + * + * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 + * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE + * semantics etc..) + * + * Where flag /should/ be ordered against the locked store of l. + */ + +/* + * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before + * issuing an _unordered_ store to set _Q_LOCKED_VAL. + * + * This means that the store can be delayed, but no later than the + * store-release from the unlock. This means that simply observing + * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. + * + * There are two paths that can issue the unordered store: + * + * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 + * + * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 + * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 + * + * However, in both cases we have other !0 state we've set before to queue + * ourseves: + * + * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our + * load is constrained by that ACQUIRE to not pass before that, and thus must + * observe the store. + * + * For (2) we have a more intersting scenario. We enqueue ourselves using + * xchg_tail(), which ends up being a RELEASE. This in itself is not + * sufficient, however that is followed by an smp_cond_acquire() on the same + * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and + * guarantees we must observe that store. + * + * Therefore both cases have other !0 state that is observable before the + * unordered locked byte store comes through. This means we can use that to + * wait for the lock store, and then wait for an unlock. + */ +#ifndef queued_spin_unlock_wait +void queued_spin_unlock_wait(struct qspinlock *lock) +{ + u32 val; + + for (;;) { + val = atomic_read(&lock->val); + + if (!val) /* not locked, we're done */ + goto done; + + if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ + break; + + /* not locked, but pending, wait until we observe the lock */ + cpu_relax(); + } + + /* any unlock is good */ + while (atomic_read(&lock->val) & _Q_LOCKED_MASK) + cpu_relax(); + +done: + smp_acquire__after_ctrl_dep(); +} +EXPORT_SYMBOL(queued_spin_unlock_wait); +#endif + #endif /* _GEN_PV_LOCK_SLOWPATH */ /** @@ -358,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. */ - smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); + smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -395,6 +512,8 @@ queue: * pending stuff. * * p,*,* -> n,*,* + * + * RELEASE, such that the stores to @node must be complete. */ old = xchg_tail(lock, tail); next = NULL; @@ -405,6 +524,15 @@ queue: */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); + /* + * The above xchg_tail() is also a load of @lock which generates, + * through decode_tail(), a pointer. + * + * The address dependency matches the RELEASE of xchg_tail() + * such that the access to @prev must happen after. + */ + smp_read_barrier_depends(); + WRITE_ONCE(prev->next, node); pv_wait_node(node, prev); @@ -434,7 +562,7 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_cond_acquire() call. As the next PV queue head hasn't been + * smp_cond_load_acquire() call. As the next PV queue head hasn't been * designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. @@ -445,7 +573,7 @@ queue: if ((val = pv_wait_head_or_lock(lock, node))) goto locked; - smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); + val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); locked: /* @@ -465,9 +593,9 @@ locked: break; } /* - * The smp_cond_acquire() call above has provided the necessary - * acquire semantics required for locking. At most two - * iterations of this loop may be ran. + * The smp_cond_load_acquire() call above has provided the + * necessary acquire semantics required for locking. At most + * two iterations of this loop may be ran. */ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); if (old == val) @@ -491,7 +619,7 @@ release: /* * release the node */ - this_cpu_dec(mcs_nodes[0].count); + __this_cpu_dec(mcs_nodes[0].count); } EXPORT_SYMBOL(queued_spin_lock_slowpath); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 21ede57f68b3..8a99abf58080 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) { - atomic_set_mask(_Q_PENDING_VAL, &lock->val); + atomic_or(_Q_PENDING_VAL, &lock->val); } static __always_inline void clear_pending(struct qspinlock *lock) { - atomic_clear_mask(_Q_PENDING_VAL, &lock->val); + atomic_andnot(_Q_PENDING_VAL, &lock->val); } static __always_inline int trylock_clear_pending(struct qspinlock *lock) @@ -450,7 +450,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) goto gotlock; } } - WRITE_ONCE(pn->state, vcpu_halted); + WRITE_ONCE(pn->state, vcpu_hashed); qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index d734b7502001..b9d031516254 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, */ if ((counter == qstat_pv_latency_kick) || (counter == qstat_pv_latency_wake)) { - stat = 0; if (kicks) stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); } @@ -191,8 +190,6 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, for (i = 0 ; i < qstat_num; i++) WRITE_ONCE(ptr[i], 0); - for (i = 0 ; i < qstat_num; i++) - WRITE_ONCE(ptr[i], 0); } return count; } @@ -214,10 +211,8 @@ static int __init init_qspinlock_stat(void) struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); int i; - if (!d_qstat) { - pr_warn("Could not create 'qlockstat' debugfs directory\n"); - return 0; - } + if (!d_qstat) + goto out; /* * Create the debugfs files @@ -227,12 +222,20 @@ static int __init init_qspinlock_stat(void) * performance. */ for (i = 0; i < qstat_num; i++) - debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat); + if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, + (void *)(long)i, &fops_qstat)) + goto fail_undo; + + if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, + (void *)(long)qstat_reset_cnts, &fops_qstat)) + goto fail_undo; - debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat); return 0; +fail_undo: + debugfs_remove_recursive(d_qstat); +out: + pr_warn("Could not create 'qlockstat' debugfs entries\n"); + return -ENOMEM; } fs_initcall(init_qspinlock_stat); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3e746607abe5..1ec0f48962b3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { - if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) return 0; return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 3a5048572065..1591f6b3539f 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -191,11 +191,12 @@ int __down_read_trylock(struct rw_semaphore *sem) /* * get a write lock on the semaphore */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +int __sched __down_write_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; struct task_struct *tsk; unsigned long flags; + int ret = 0; raw_spin_lock_irqsave(&sem->wait_lock, flags); @@ -215,21 +216,33 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) */ if (sem->count == 0) break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (signal_pending_state(state, current)) { + ret = -EINTR; + goto out; + } + set_task_state(tsk, state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ sem->count = -1; +out: list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; } void __sched __down_write(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a4d4de05b2d1..447e08de1fab 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif - sem->count = RWSEM_UNLOCKED_VALUE; + atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER @@ -114,12 +114,16 @@ enum rwsem_wake_type { * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) * - there must be someone on the queue - * - the spinlock must be held by the caller + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + * to actually wakeup the blocked task(s) and drop the reference count, + * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if downgrading is false + * - writers are only marked woken if downgrading is false */ static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +__rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) - /* Wake writer at the front of the queue, but do not - * grant it the lock yet as we want other writers - * to be able to steal it. Readers, on the other hand, - * will block as they will notice the queued writer. + if (wake_type == RWSEM_WAKE_ANY) { + /* + * Mark writer at the front of the queue for wakeup. + * Until the task is actually later awoken later by + * the caller, other writers are able to steal it. + * Readers, on the other hand, will block as they + * will notice the queued writer. */ - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); + } goto out; } @@ -146,15 +153,27 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + oldcount = atomic_long_fetch_add(adjustment, &sem->count); + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* A writer stole the lock. Undo our reader grant. */ - if (rwsem_atomic_update(-adjustment, sem) & - RWSEM_ACTIVE_MASK) + /* + * If the count is still less than RWSEM_WAITING_BIAS + * after removing the adjustment, it is assumed that + * a writer has stolen the lock. We have to undo our + * reader grant. + */ + if (atomic_long_add_return(-adjustment, &sem->count) < + RWSEM_WAITING_BIAS) goto out; /* Last active locker left. Retry waking readers. */ goto try_reader_grant; } + /* + * It is not really necessary to set it to reader-owned here, + * but it gives the spinners an early indication that the + * readers now have the lock. + */ + rwsem_set_reader_owned(sem); } /* Grant an infinite number of read locks to the readers at the front @@ -179,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) adjustment -= RWSEM_WAITING_BIAS; if (adjustment) - rwsem_atomic_add(adjustment, sem); + atomic_long_add(adjustment, &sem->count); next = sem->wait_list.next; loop = woken; @@ -187,17 +206,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; + + wake_q_add(wake_q, tsk); /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). + * Ensure that the last operation is setting the reader + * waiter to nil such that rwsem_down_read_failed() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. */ - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); + smp_store_release(&waiter->task, NULL); } while (--loop); sem->wait_list.next = next; @@ -216,11 +233,11 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; + WAKE_Q(wake_q); /* set up my own style of waitqueue */ waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) @@ -228,7 +245,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + count = atomic_long_add_return(adjustment, &sem->count); /* If there are no active locks, wake the front queued process(es). * @@ -238,9 +255,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) if (count == RWSEM_WAITING_BIAS || (count > RWSEM_WAITING_BIAS && adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); /* wait to be given the lock */ while (true) { @@ -255,17 +273,29 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) } EXPORT_SYMBOL(rwsem_down_read_failed); +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + */ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { /* - * Try acquiring the write lock. Check count first in order - * to reduce unnecessary expensive cmpxchg() operations. + * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. + */ + if (count != RWSEM_WAITING_BIAS) + return false; + + /* + * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there + * are other tasks on the wait list, we need to add on WAITING_BIAS. */ - if (count == RWSEM_WAITING_BIAS && - cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, - RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { - if (!list_is_singular(&sem->wait_list)) - rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + count = list_is_singular(&sem->wait_list) ? + RWSEM_ACTIVE_WRITE_BIAS : + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; + + if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) + == RWSEM_WAITING_BIAS) { rwsem_set_owner(sem); return true; } @@ -279,13 +309,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = READ_ONCE(sem->count); + long old, count = atomic_long_read(&sem->count); while (true) { if (!(count == 0 || count == RWSEM_WAITING_BIAS)) return false; - old = cmpxchg_acquire(&sem->count, count, + old = atomic_long_cmpxchg_acquire(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); if (old == count) { rwsem_set_owner(sem); @@ -306,16 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) rcu_read_lock(); owner = READ_ONCE(sem->owner); - if (!owner) { - long count = READ_ONCE(sem->count); + if (!rwsem_owner_is_writer(owner)) { /* - * If sem->owner is not set, yet we have just recently entered the - * slowpath with the lock being active, then there is a possibility - * reader(s) may have the lock. To be safe, bail spinning in these - * situations. + * Don't spin if the rwsem is readers owned. */ - if (count & RWSEM_ACTIVE_MASK) - ret = false; + ret = !rwsem_owner_is_reader(owner); goto done; } @@ -325,10 +350,15 @@ done: return ret; } -static noinline -bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +/* + * Return true only if we can still spin on the owner field of the rwsem. + */ +static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) { - long count; + struct task_struct *owner = READ_ONCE(sem->owner); + + if (!rwsem_owner_is_writer(owner)) + goto out; rcu_read_lock(); while (sem->owner == owner) { @@ -349,22 +379,16 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) cpu_relax_lowlatency(); } rcu_read_unlock(); - - if (READ_ONCE(sem->owner)) - return true; /* new owner, continue spinning */ - +out: /* - * When the owner is not set, the lock could be free or - * held by readers. Check the counter to verify the - * state. + * If there is a new owner or the owner is not set, we continue + * spinning. */ - count = READ_ONCE(sem->count); - return (count == 0 || count == RWSEM_WAITING_BIAS); + return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { - struct task_struct *owner; bool taken = false; preempt_disable(); @@ -376,12 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) if (!osq_lock(&sem->osq)) goto done; - while (true) { - owner = READ_ONCE(sem->owner); - if (owner && !rwsem_spin_on_owner(sem, owner)) - break; - - /* wait_lock will be acquired if write_lock is obtained */ + /* + * Optimistically spin on the owner field and attempt to acquire the + * lock whenever the owner changes. Spinning will be stopped when: + * 1) the owning writer isn't running; or + * 2) readers own the lock as we can't determine if they are + * actively running or not. + */ + while (rwsem_spin_on_owner(sem)) { + /* + * Try to acquire the lock + */ if (rwsem_try_write_lock_unqueued(sem)) { taken = true; break; @@ -393,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * we're an RT task that will live-lock because we won't let * the owner complete. */ - if (!owner && (need_resched() || rt_task(current))) + if (!sem->owner && (need_resched() || rt_task(current))) break; /* @@ -433,15 +462,17 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) /* * Wait until we successfully acquire the write lock */ -__visible -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore * +__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) { long count; bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; + struct rw_semaphore *ret = sem; + WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ - count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); /* do optimistic spinning and steal lock if possible */ if (rwsem_optimistic_spin(sem)) @@ -464,21 +495,32 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ if (waiting) { - count = READ_ONCE(sem->count); + count = atomic_long_read(&sem->count); /* * If there were already threads queued before us and there are * no active writers, the lock must be read owned; so we try to * wake any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + if (count > RWSEM_WAITING_BIAS) { + WAKE_Q(wake_q); + + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + /* + * The wakeup is normally called _after_ the wait_lock + * is released, but given that we are proactively waking + * readers we can deal with the wake_q overhead as it is + * similar to releasing and taking the wait_lock again + * for attempting rwsem_try_write_lock(). + */ + wake_up_q(&wake_q); + } } else - count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); /* wait until we successfully acquire the lock */ - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); while (true) { if (rwsem_try_write_lock(count, sem)) break; @@ -486,21 +528,49 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* Block until there are no active lockers. */ do { + if (signal_pending_state(state, current)) + goto out_nolock; + schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); - } while ((count = sem->count) & RWSEM_ACTIVE_MASK); + set_current_state(state); + } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + return ret; + +out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + else + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); - return sem; + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_write_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_write_failed); +__visible struct rw_semaphore * __sched +rwsem_down_write_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_write_failed_killable); + /* * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here @@ -509,6 +579,7 @@ __visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); /* * If a spinner is present, it is not necessary to do the wakeup. @@ -545,9 +616,10 @@ locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } @@ -562,14 +634,16 @@ __visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 205be0ce34de..45ba475d4be3 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read); @@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_reader_owned(sem); + } return ret; } @@ -55,6 +58,25 @@ void __sched down_write(struct rw_semaphore *sem) EXPORT_SYMBOL(down_write); /* + * lock for writing + */ +int __sched down_write_killable(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable); + +/* * trylock for writing -- returns 1 if successful, 0 if contention */ int down_write_trylock(struct rw_semaphore *sem) @@ -105,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ - rwsem_clear_owner(sem); + rwsem_set_reader_owned(sem); __downgrade_write(sem); } @@ -119,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_nested); @@ -154,6 +177,22 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); +int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable_nested); + void up_read_non_owner(struct rw_semaphore *sem) { __up_read(sem); diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 870ed9a5b426..a699f4048ba1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,14 +1,58 @@ +/* + * The owner field of the rw_semaphore structure will be set to + * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear + * the owner field when it unlocks. A reader, on the other hand, will + * not touch the owner field when it unlocks. + * + * In essence, the owner field now has the following 3 states: + * 1) 0 + * - lock is free or the owner hasn't set the field yet + * 2) RWSEM_READER_OWNED + * - lock is currently or previously owned by readers (lock is free + * or not set by owner yet) + * 3) Other non-zero value + * - a writer owns the lock + */ +#define RWSEM_READER_OWNED ((struct task_struct *)1UL) + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. + */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { - sem->owner = current; + WRITE_ONCE(sem->owner, current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { - sem->owner = NULL; + WRITE_ONCE(sem->owner, NULL); +} + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ + /* + * We check the owner value first to make sure that we will only + * do a write to the rwsem cacheline when it is really necessary + * to minimize cacheline contention. + */ + if (sem->owner != RWSEM_READER_OWNED) + WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); +} + +static inline bool rwsem_owner_is_writer(struct task_struct *owner) +{ + return owner && owner != RWSEM_READER_OWNED; } +static inline bool rwsem_owner_is_reader(struct task_struct *owner) +{ + return owner == RWSEM_READER_OWNED; +} #else static inline void rwsem_set_owner(struct rw_semaphore *sem) { @@ -17,4 +61,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem) static inline void rwsem_clear_owner(struct rw_semaphore *sem) { } + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ +} #endif diff --git a/kernel/memremap.c b/kernel/memremap.c index a6d382312e6f..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -27,6 +27,13 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) } #endif +#ifndef arch_memremap_wb +static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +{ + return (__force void *)ioremap_cache(offset, size); +} +#endif + static void *try_ram_remap(resource_size_t offset, size_t size) { unsigned long pfn = PHYS_PFN(offset); @@ -34,7 +41,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size) /* In the simple case just return the existing linear address */ if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) return __va(offset); - return NULL; /* fallback to ioremap_cache */ + return NULL; /* fallback to arch_memremap_wb */ } /** @@ -90,7 +97,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size); if (!addr) - addr = ioremap_cache(offset, size); + addr = arch_memremap_wb(offset, size); } /* @@ -162,12 +169,6 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) -{ - return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); -} -EXPORT_SYMBOL(phys_to_pfn_t); - #ifdef CONFIG_ZONE_DEVICE static DEFINE_MUTEX(pgmap_lock); static RADIX_TREE(pgmap_radix, GFP_KERNEL); @@ -246,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); @@ -281,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { resource_size_t key, align_start, align_size, align_end; + pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; @@ -301,12 +304,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (is_ram == REGION_INTERSECTS) return __va(res->start); - if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { - dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", - __func__); - return ERR_PTR(-ENXIO); - } - if (!ref) return ERR_PTR(-EINVAL); @@ -356,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, + align_size); + if (error) + goto err_pfn_remap; + error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; @@ -376,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return __va(res->start); err_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + err_pfn_remap: err_radix: pgmap_radix_release(res); devres_free(page_map); @@ -394,7 +398,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) altmap->alloc -= nr_pfns; } -#ifdef CONFIG_SPARSEMEM_VMEMMAP struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) { /* @@ -420,5 +423,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? pgmap->altmap : NULL; } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/module.c b/kernel/module.c index 041200ca4a2d..529efae9f481 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -60,6 +60,7 @@ #include <linux/jump_label.h> #include <linux/pfn.h> #include <linux/bsearch.h> +#include <linux/dynamic_debug.h> #include <uapi/linux/module.h> #include "module-internal.h" @@ -264,7 +265,7 @@ static void module_assert_mutex_or_preempt(void) if (unlikely(!debug_locks)) return; - WARN_ON(!rcu_read_lock_sched_held() && + WARN_ON_ONCE(!rcu_read_lock_sched_held() && !lockdep_is_held(&module_mutex)); #endif } @@ -336,7 +337,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag, * A thread that wants to hold a reference to a module only while it * is running can call this to safely exit. nfsd and lockd use this. */ -void __module_put_and_exit(struct module *mod, long code) +void __noreturn __module_put_and_exit(struct module *mod, long code) { module_put(mod); do_exit(code); @@ -1693,8 +1694,7 @@ static int module_add_modinfo_attrs(struct module *mod) temp_attr = mod->modinfo_attrs; for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { - if (!attr->test || - (attr->test && attr->test(mod))) { + if (!attr->test || attr->test(mod)) { memcpy(temp_attr, attr, sizeof(*temp_attr)); sysfs_attr_init(&temp_attr->attr); error = sysfs_create_file(&mod->mkobj.kobj, @@ -1858,10 +1858,11 @@ static void mod_sysfs_teardown(struct module *mod) * from modification and any data from execution. * * General layout of module is: - * [text] [read-only-data] [writable data] - * text_size -----^ ^ ^ - * ro_size ------------------------| | - * size -------------------------------------------| + * [text] [read-only-data] [ro-after-init] [writable data] + * text_size -----^ ^ ^ ^ + * ro_size ------------------------| | | + * ro_after_init_size -----------------------------| | + * size -----------------------------------------------------------| * * These values are always page-aligned (as is base) */ @@ -1884,14 +1885,24 @@ static void frob_rodata(const struct module_layout *layout, (layout->ro_size - layout->text_size) >> PAGE_SHIFT); } +static void frob_ro_after_init(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); +} + static void frob_writable_data(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_size, - (layout->size - layout->ro_size) >> PAGE_SHIFT); + set_memory((unsigned long)layout->base + layout->ro_after_init_size, + (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } /* livepatching wants to disable read-only so it can frob module. */ @@ -1899,21 +1910,26 @@ void module_disable_ro(const struct module *mod) { frob_text(&mod->core_layout, set_memory_rw); frob_rodata(&mod->core_layout, set_memory_rw); + frob_ro_after_init(&mod->core_layout, set_memory_rw); frob_text(&mod->init_layout, set_memory_rw); frob_rodata(&mod->init_layout, set_memory_rw); } -void module_enable_ro(const struct module *mod) +void module_enable_ro(const struct module *mod, bool after_init) { frob_text(&mod->core_layout, set_memory_ro); frob_rodata(&mod->core_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro); frob_rodata(&mod->init_layout, set_memory_ro); + + if (after_init) + frob_ro_after_init(&mod->core_layout, set_memory_ro); } static void module_enable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_nx); + frob_ro_after_init(&mod->core_layout, set_memory_nx); frob_writable_data(&mod->core_layout, set_memory_nx); frob_rodata(&mod->init_layout, set_memory_nx); frob_writable_data(&mod->init_layout, set_memory_nx); @@ -1922,6 +1938,7 @@ static void module_enable_nx(const struct module *mod) static void module_disable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_x); + frob_ro_after_init(&mod->core_layout, set_memory_x); frob_writable_data(&mod->core_layout, set_memory_x); frob_rodata(&mod->init_layout, set_memory_x); frob_writable_data(&mod->init_layout, set_memory_x); @@ -1964,6 +1981,8 @@ static void disable_ro_nx(const struct module_layout *layout) frob_text(layout, set_memory_rw); frob_rodata(layout, set_memory_rw); frob_rodata(layout, set_memory_x); + frob_ro_after_init(layout, set_memory_rw); + frob_ro_after_init(layout, set_memory_x); frob_writable_data(layout, set_memory_x); } @@ -1973,6 +1992,83 @@ static void module_enable_nx(const struct module *mod) { } static void module_disable_nx(const struct module *mod) { } #endif +#ifdef CONFIG_LIVEPATCH +/* + * Persist Elf information about a module. Copy the Elf header, + * section header table, section string table, and symtab section + * index from info to mod->klp_info. + */ +static int copy_module_elf(struct module *mod, struct load_info *info) +{ + unsigned int size, symndx; + int ret; + + size = sizeof(*mod->klp_info); + mod->klp_info = kmalloc(size, GFP_KERNEL); + if (mod->klp_info == NULL) + return -ENOMEM; + + /* Elf header */ + size = sizeof(mod->klp_info->hdr); + memcpy(&mod->klp_info->hdr, info->hdr, size); + + /* Elf section header table */ + size = sizeof(*info->sechdrs) * info->hdr->e_shnum; + mod->klp_info->sechdrs = kmalloc(size, GFP_KERNEL); + if (mod->klp_info->sechdrs == NULL) { + ret = -ENOMEM; + goto free_info; + } + memcpy(mod->klp_info->sechdrs, info->sechdrs, size); + + /* Elf section name string table */ + size = info->sechdrs[info->hdr->e_shstrndx].sh_size; + mod->klp_info->secstrings = kmalloc(size, GFP_KERNEL); + if (mod->klp_info->secstrings == NULL) { + ret = -ENOMEM; + goto free_sechdrs; + } + memcpy(mod->klp_info->secstrings, info->secstrings, size); + + /* Elf symbol section index */ + symndx = info->index.sym; + mod->klp_info->symndx = symndx; + + /* + * For livepatch modules, core_kallsyms.symtab is a complete + * copy of the original symbol table. Adjust sh_addr to point + * to core_kallsyms.symtab since the copy of the symtab in module + * init memory is freed at the end of do_init_module(). + */ + mod->klp_info->sechdrs[symndx].sh_addr = \ + (unsigned long) mod->core_kallsyms.symtab; + + return 0; + +free_sechdrs: + kfree(mod->klp_info->sechdrs); +free_info: + kfree(mod->klp_info); + return ret; +} + +static void free_module_elf(struct module *mod) +{ + kfree(mod->klp_info->sechdrs); + kfree(mod->klp_info->secstrings); + kfree(mod->klp_info); +} +#else /* !CONFIG_LIVEPATCH */ +static int copy_module_elf(struct module *mod, struct load_info *info) +{ + return 0; +} + +static void free_module_elf(struct module *mod) +{ +} +#endif /* CONFIG_LIVEPATCH */ + void __weak module_memfree(void *module_region) { vfree(module_region); @@ -2011,6 +2107,9 @@ static void free_module(struct module *mod) /* Free any allocated parameters. */ destroy_params(mod->kp, mod->num_kp); + if (is_livepatch_module(mod)) + free_module_elf(mod); + /* Now we can delete it from the lists */ mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ @@ -2126,6 +2225,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) (long)sym[i].st_value); break; + case SHN_LIVEPATCH: + /* Livepatch symbols are resolved by livepatch */ + break; + case SHN_UNDEF: ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. */ @@ -2174,6 +2277,10 @@ static int apply_relocations(struct module *mod, const struct load_info *info) if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) continue; + /* Livepatch relocation sections are applied by livepatch */ + if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) + continue; + if (info->sechdrs[i].sh_type == SHT_REL) err = apply_relocate(info->sechdrs, info->strtab, info->index.sym, i, mod); @@ -2218,6 +2325,7 @@ static void layout_sections(struct module *mod, struct load_info *info) * finder in the two loops below */ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, + { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, { ARCH_SHF_SMALL | SHF_ALLOC, 0 } }; @@ -2249,7 +2357,11 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->core_layout.size = debug_align(mod->core_layout.size); mod->core_layout.ro_size = mod->core_layout.size; break; - case 3: /* whole core */ + case 2: /* RO after init */ + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_after_init_size = mod->core_layout.size; + break; + case 4: /* whole core */ mod->core_layout.size = debug_align(mod->core_layout.size); break; } @@ -2279,7 +2391,14 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->init_layout.size = debug_align(mod->init_layout.size); mod->init_layout.ro_size = mod->init_layout.size; break; - case 3: /* whole init */ + case 2: + /* + * RO after init doesn't apply to init_layout (only + * core_layout), so it just takes the value of ro_size. + */ + mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; + break; + case 4: /* whole init */ mod->init_layout.size = debug_align(mod->init_layout.size); break; } @@ -2469,7 +2588,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Compute total space required for the core symbols' strtab. */ for (ndst = i = 0; i < nsrc; i++) { - if (i == 0 || + if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { strtab_size += strlen(&info->strtab[src[i].st_name])+1; @@ -2528,7 +2647,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; src = mod->kallsyms->symtab; for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { - if (i == 0 || + if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { dst[ndst] = src[i]; @@ -2599,13 +2718,18 @@ static inline void kmemleak_load_module(const struct module *mod, #endif #ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { int err = -ENOKEY; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; const void *mod = info->hdr; - if (info->len > markerlen && + /* + * Require flags == 0, as a module with version information + * removed is no longer the module that was signed + */ + if (flags == 0 && + info->len > markerlen && memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ info->len -= markerlen; @@ -2624,7 +2748,7 @@ static int module_sig_check(struct load_info *info) return err; } #else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { return 0; } @@ -2667,6 +2791,26 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l return 0; } +#ifdef CONFIG_LIVEPATCH +static int find_livepatch_modinfo(struct module *mod, struct load_info *info) +{ + mod->klp = get_modinfo(info, "livepatch") ? true : false; + + return 0; +} +#else /* !CONFIG_LIVEPATCH */ +static int find_livepatch_modinfo(struct module *mod, struct load_info *info) +{ + if (get_modinfo(info, "livepatch")) { + pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", + mod->name); + return -ENOEXEC; + } + + return 0; +} +#endif /* CONFIG_LIVEPATCH */ + /* Sets info->hdr and info->len. */ static int copy_module_from_user(const void __user *umod, unsigned long len, struct load_info *info) @@ -2812,8 +2956,12 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return -ENOEXEC; } - if (!get_modinfo(info, "intree")) + if (!get_modinfo(info, "intree")) { + if (!test_taint(TAINT_OOT_MODULE)) + pr_warn("%s: loading out-of-tree module taints kernel.\n", + mod->name); add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); + } if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); @@ -2821,6 +2969,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) "is unknown, you have been warned.\n", mod->name); } + err = find_livepatch_modinfo(mod, info); + if (err) + return err; + /* Set up license info based on the info section */ set_license(mod, get_modinfo(info, "license")); @@ -2978,6 +3130,8 @@ static int move_module(struct module *mod, struct load_info *info) static int check_module_license_and_versions(struct module *mod) { + int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE); + /* * ndiswrapper is under GPL by itself, but loads proprietary modules. * Don't use add_taint_module(), as it would prevent ndiswrapper from @@ -2996,6 +3150,9 @@ static int check_module_license_and_versions(struct module *mod) add_taint_module(mod, TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); + if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE)) + pr_warn("%s: module license taints kernel.\n", mod->name); + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -3043,16 +3200,41 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } +/* module_blacklist is a comma-separated list of module names */ +static char *module_blacklist; +static bool blacklisted(char *module_name) +{ + const char *p; + size_t len; + + if (!module_blacklist) + return false; + + for (p = module_blacklist; *p; p += len) { + len = strcspn(p, ","); + if (strlen(module_name) == len && !memcmp(module_name, p, len)) + return true; + if (p[len] == ',') + len++; + } + return false; +} +core_param(module_blacklist, module_blacklist, charp, 0400); + static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; + unsigned int ndx; int err; mod = setup_load_info(info, flags); if (IS_ERR(mod)) return mod; + if (blacklisted(mod->name)) + return ERR_PTR(-EPERM); + err = check_modinfo(mod, info, flags); if (err) return ERR_PTR(err); @@ -3066,6 +3248,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* We will do a special allocation for per-cpu sections later. */ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; + /* + * Mark ro_after_init section with SHF_RO_AFTER_INIT so that + * layout_sections() can put it in the right place. + * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set. + */ + ndx = find_sec(info, ".data..ro_after_init"); + if (ndx) + info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any special cases for the architectures. */ @@ -3232,12 +3423,14 @@ static noinline int do_init_module(struct module *mod) /* Switch to core kallsyms now init is done: kallsyms may be walking! */ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif + module_enable_ro(mod, true); mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; mod->init_layout.ro_size = 0; + mod->init_layout.ro_after_init_size = 0; mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be @@ -3329,8 +3522,7 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions */ - module_enable_ro(mod); + module_enable_ro(mod, false); module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, @@ -3386,7 +3578,7 @@ static int load_module(struct load_info *info, const char __user *uargs, long err; char *after_dashes; - err = module_sig_check(info); + err = module_sig_check(info, flags); if (err) goto free_copy; @@ -3494,6 +3686,12 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err < 0) goto coming_cleanup; + if (is_livepatch_module(mod)) { + err = copy_module_elf(mod, info); + if (err < 0) + goto sysfs_cleanup; + } + /* Get rid of temporary copy. */ free_copy(info); @@ -3502,11 +3700,12 @@ static int load_module(struct load_info *info, const char __user *uargs, return do_init_module(mod); + sysfs_cleanup: + mod_sysfs_teardown(mod); coming_cleanup: blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); - bug_cleanup: /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 64b9dead4a07..937c844bee4a 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -12,7 +12,7 @@ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> -#include <keys/system_keyring.h> +#include <linux/verification.h> #include <crypto/public_key.h> #include "module-internal.h" @@ -80,6 +80,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) return -EBADMSG; } - return system_verify_data(mod, modlen, mod + modlen, sig_len, - VERIFYING_MODULE_SIGNATURE); + return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + NULL, VERIFYING_MODULE_SIGNATURE, + NULL, NULL); } diff --git a/kernel/padata.c b/kernel/padata.c index b38bea9c466a..993278895ccc 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -607,33 +607,6 @@ out_replace: } /** - * padata_set_cpumasks - Set both parallel and serial cpumasks. The first - * one is used by parallel workers and the second one - * by the wokers doing serialization. - * - * @pinst: padata instance - * @pcpumask: the cpumask to use for parallel workers - * @cbcpumask: the cpumsak to use for serial workers - */ -int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, - cpumask_var_t cbcpumask) -{ - int err; - - mutex_lock(&pinst->lock); - get_online_cpus(); - - err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); - - put_online_cpus(); - mutex_unlock(&pinst->lock); - - return err; - -} -EXPORT_SYMBOL(padata_set_cpumasks); - -/** * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value * equivalent to @cpumask. * @@ -674,6 +647,43 @@ out: } EXPORT_SYMBOL(padata_set_cpumask); +/** + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +int padata_start(struct padata_instance *pinst) +{ + int err = 0; + + mutex_lock(&pinst->lock); + + if (pinst->flags & PADATA_INVALID) + err = -EINVAL; + + __padata_start(pinst); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_start); + +/** + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + mutex_lock(&pinst->lock); + __padata_stop(pinst); + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_stop); + +#ifdef CONFIG_HOTPLUG_CPU + static int __padata_add_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd; @@ -694,42 +704,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) return 0; } - /** - * padata_add_cpu - add a cpu to one or both(parallel and serial) - * padata cpumasks. - * - * @pinst: padata instance - * @cpu: cpu to add - * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. - * The @mask may be any combination of the following flags: - * PADATA_CPU_SERIAL - serial cpumask - * PADATA_CPU_PARALLEL - parallel cpumask - */ - -int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) -{ - int err; - - if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) - return -EINVAL; - - mutex_lock(&pinst->lock); - - get_online_cpus(); - if (mask & PADATA_CPU_SERIAL) - cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); - if (mask & PADATA_CPU_PARALLEL) - cpumask_set_cpu(cpu, pinst->cpumask.pcpu); - - err = __padata_add_cpu(pinst, cpu); - put_online_cpus(); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_add_cpu); - static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd = NULL; @@ -789,43 +763,6 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) } EXPORT_SYMBOL(padata_remove_cpu); -/** - * padata_start - start the parallel processing - * - * @pinst: padata instance to start - */ -int padata_start(struct padata_instance *pinst) -{ - int err = 0; - - mutex_lock(&pinst->lock); - - if (pinst->flags & PADATA_INVALID) - err =-EINVAL; - - __padata_start(pinst); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_start); - -/** - * padata_stop - stop the parallel processing - * - * @pinst: padata instance to stop - */ -void padata_stop(struct padata_instance *pinst) -{ - mutex_lock(&pinst->lock); - __padata_stop(pinst); - mutex_unlock(&pinst->lock); -} -EXPORT_SYMBOL(padata_stop); - -#ifdef CONFIG_HOTPLUG_CPU - static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) { return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || @@ -1091,7 +1028,6 @@ err_free_inst: err: return NULL; } -EXPORT_SYMBOL(padata_alloc); /** * padata_free - free a padata instance diff --git a/kernel/panic.c b/kernel/panic.c index 535c96510a44..ca8cea1ef673 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -108,6 +108,7 @@ void panic(const char *fmt, ...) long i, i_next = 0; int state = 0; int old_cpu, this_cpu; + bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; /* * Disable local interrupts. This will prevent panic_smp_self_stop @@ -160,8 +161,10 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (!crash_kexec_post_notifiers) + if (!_crash_kexec_post_notifiers) { + printk_nmi_flush_on_panic(); __crash_kexec(NULL); + } /* * Note smp_send_stop is the usual smp shutdown function, which @@ -176,6 +179,8 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + /* Call flush even twice. It tries harder with a single online CPU */ + printk_nmi_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* @@ -187,7 +192,7 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (crash_kexec_post_notifiers) + if (_crash_kexec_post_notifiers) __crash_kexec(NULL); bust_spinlocks(0); @@ -567,13 +572,7 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); - -static int __init setup_crash_kexec_post_notifiers(char *s) -{ - crash_kexec_post_notifiers = true; - return 0; -} -early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers); +core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); static int __init oops_setup(char *s) { diff --git a/kernel/pid.c b/kernel/pid.c index 4d73a834c7e6..f66162f2359b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -311,7 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); - if (IS_ERR_VALUE(nr)) { + if (nr < 0) { retval = nr; goto out_free; } diff --git a/kernel/power/Makefile b/kernel/power/Makefile index cb880a14cc39..eb4f717705ba 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,6 +1,8 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG +KASAN_SANITIZE_snapshot.o := n + obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o diff --git a/kernel/power/console.c b/kernel/power/console.c index aba9c545a0e3..0e781798b0b3 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -126,17 +126,17 @@ out: return ret; } -int pm_prepare_console(void) +void pm_prepare_console(void) { if (!pm_vt_switch()) - return 0; + return; orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); if (orig_fgconsole < 0) - return 1; + return; orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); - return 0; + return; } void pm_restore_console(void) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fca9254280ee..33c79b6105c5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -52,6 +52,7 @@ enum { #ifdef CONFIG_SUSPEND HIBERNATION_SUSPEND, #endif + HIBERNATION_TEST_RESUME, /* keep last */ __HIBERNATION_AFTER_LAST }; @@ -299,12 +300,12 @@ static int create_image(int platform_mode) save_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + /* Restore control flow magically appears here */ + restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); - /* Restore control flow magically appears here */ - restore_processor_state(); if (!in_suspend) events_check_enabled = false; @@ -409,6 +410,11 @@ int hibernation_snapshot(int platform_mode) goto Close; } +int __weak hibernate_resume_nonboot_cpu_disable(void) +{ + return disable_nonboot_cpus(); +} + /** * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. @@ -433,7 +439,7 @@ static int resume_target_kernel(bool platform_mode) if (error) goto Cleanup; - error = disable_nonboot_cpus(); + error = hibernate_resume_nonboot_cpu_disable(); if (error) goto Enable_cpus; @@ -642,12 +648,39 @@ static void power_down(void) cpu_relax(); } +static int load_image_and_restore(void) +{ + int error; + unsigned int flags; + + pr_debug("PM: Loading hibernation image.\n"); + + lock_device_hotplug(); + error = create_basic_memory_bitmaps(); + if (error) + goto Unlock; + + error = swsusp_read(&flags); + swsusp_close(FMODE_READ); + if (!error) + hibernation_restore(flags & SF_PLATFORM_MODE); + + printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); + swsusp_free(); + free_basic_memory_bitmaps(); + Unlock: + unlock_device_hotplug(); + + return error; +} + /** * hibernate - Carry out system hibernation, including saving the image. */ int hibernate(void) { - int error; + int error, nr_calls = 0; + bool snapshot_test = false; if (!hibernation_available()) { pr_debug("PM: Hibernation not available.\n"); @@ -662,9 +695,11 @@ int hibernate(void) } pm_prepare_console(); - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Exit; + } printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); @@ -697,8 +732,12 @@ int hibernate(void) pr_debug("PM: writing image.\n"); error = swsusp_write(flags); swsusp_free(); - if (!error) - power_down(); + if (!error) { + if (hibernation_mode == HIBERNATION_TEST_RESUME) + snapshot_test = true; + else + power_down(); + } in_suspend = 0; pm_restore_gfp_mask(); } else { @@ -709,12 +748,18 @@ int hibernate(void) free_basic_memory_bitmaps(); Thaw: unlock_device_hotplug(); + if (snapshot_test) { + pr_debug("PM: Checking hibernation image\n"); + error = swsusp_check(); + if (!error) + error = load_image_and_restore(); + } thaw_processes(); /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: - pm_notifier_call_chain(PM_POST_HIBERNATION); + __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL); pm_restore_console(); atomic_inc(&snapshot_device_available); Unlock: @@ -740,8 +785,7 @@ int hibernate(void) */ static int software_resume(void) { - int error; - unsigned int flags; + int error, nr_calls = 0; /* * If the user said "noresume".. bail out early. @@ -827,35 +871,20 @@ static int software_resume(void) } pm_prepare_console(); - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Close_Finish; + } pr_debug("PM: Preparing processes for restore.\n"); error = freeze_processes(); if (error) goto Close_Finish; - - pr_debug("PM: Loading hibernation image.\n"); - - lock_device_hotplug(); - error = create_basic_memory_bitmaps(); - if (error) - goto Thaw; - - error = swsusp_read(&flags); - swsusp_close(FMODE_READ); - if (!error) - hibernation_restore(flags & SF_PLATFORM_MODE); - - printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); - swsusp_free(); - free_basic_memory_bitmaps(); - Thaw: - unlock_device_hotplug(); + error = load_image_and_restore(); thaw_processes(); Finish: - pm_notifier_call_chain(PM_POST_RESTORE); + __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ @@ -878,6 +907,7 @@ static const char * const hibernation_modes[] = { #ifdef CONFIG_SUSPEND [HIBERNATION_SUSPEND] = "suspend", #endif + [HIBERNATION_TEST_RESUME] = "test_resume", }; /* @@ -924,6 +954,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif + case HIBERNATION_TEST_RESUME: break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -970,6 +1001,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif + case HIBERNATION_TEST_RESUME: hibernation_mode = mode; break; case HIBERNATION_PLATFORM: @@ -1115,13 +1147,16 @@ static int __init resume_offset_setup(char *str) static int __init hibernate_setup(char *str) { - if (!strncmp(str, "noresume", 8)) + if (!strncmp(str, "noresume", 8)) { noresume = 1; - else if (!strncmp(str, "nocompress", 10)) + } else if (!strncmp(str, "nocompress", 10)) { nocompress = 1; - else if (!strncmp(str, "no", 2)) { + } else if (!strncmp(str, "no", 2)) { noresume = 1; nohibernate = 1; + } else if (IS_ENABLED(CONFIG_DEBUG_RODATA) + && !strncmp(str, "protect_image", 13)) { + enable_restore_image_protection(); } return 1; } @@ -1154,11 +1189,6 @@ static int __init nohibernate_setup(char *str) return 1; } -static int __init kaslr_nohibernate_setup(char *str) -{ - return nohibernate_setup(str); -} - static int __init page_poison_nohibernate_setup(char *str) { #ifdef CONFIG_PAGE_POISONING_ZERO @@ -1182,5 +1212,4 @@ __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); -__setup("kaslr", kaslr_nohibernate_setup); __setup("page_poison=", page_poison_nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 27946975eff0..5ea50b1b7595 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -38,12 +38,19 @@ int unregister_pm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_pm_notifier); -int pm_notifier_call_chain(unsigned long val) +int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls) { - int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); + int ret; + + ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL, + nr_to_call, nr_calls); return notifier_to_errno(ret); } +int pm_notifier_call_chain(unsigned long val) +{ + return __pm_notifier_call_chain(val, -1, NULL); +} /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; diff --git a/kernel/power/power.h b/kernel/power/power.h index efe1b3b17c88..242d8b827dd5 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) } #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ +extern int hibernate_resume_nonboot_cpu_disable(void); + /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] @@ -59,6 +61,13 @@ extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); +#ifdef CONFIG_DEBUG_RODATA +/* kernel/power/snapshot.c */ +extern void enable_restore_image_protection(void); +#else +static inline void enable_restore_image_protection(void) {} +#endif /* CONFIG_DEBUG_RODATA */ + #else /* !CONFIG_HIBERNATION */ static inline void hibernate_reserved_size_init(void) {} @@ -200,6 +209,8 @@ static inline void suspend_test_finish(const char *label) {} #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ +extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call, + int *nr_calls); extern int pm_notifier_call_chain(unsigned long val); #endif diff --git a/kernel/power/process.c b/kernel/power/process.c index df058bed53ce..8f27d5a8adf6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -89,6 +89,9 @@ static int try_to_freeze_tasks(bool user_only) elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); + if (wq_busy) + show_workqueue_state(); + if (!wakeup) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { @@ -146,6 +149,18 @@ int freeze_processes(void) if (!error && !oom_killer_disable()) error = -EBUSY; + /* + * There is a hard to fix race between oom_reaper kernel thread + * and oom_killer_disable. oom_reaper calls exit_oom_victim + * before the victim reaches exit_mm so try to freeze all the tasks + * again and catch such a left over task. + */ + if (!error) { + pr_info("Double checking all user space processes after OOM killer disable... "); + error = try_to_freeze_tasks(true); + pr_cont("\n"); + } + if (error) thaw_processes(); return error; diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..168ff442ebde 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - cancel_delayed_work_sync(&req->work); + /* + * This function may be called very early during boot, for example, + * from of_clk_init(), where irq needs to stay disabled. + * cancel_delayed_work_sync() assumes that irq is enabled on + * invocation and re-enables it on return. Avoid calling it until + * workqueue is initialized. + */ + if (keventd_up()) + cancel_delayed_work_sync(&req->work); + __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3a970604308f..b02228411d57 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -38,6 +38,43 @@ #include "power.h" +#ifdef CONFIG_DEBUG_RODATA +static bool hibernate_restore_protection; +static bool hibernate_restore_protection_active; + +void enable_restore_image_protection(void) +{ + hibernate_restore_protection = true; +} + +static inline void hibernate_restore_protection_begin(void) +{ + hibernate_restore_protection_active = hibernate_restore_protection; +} + +static inline void hibernate_restore_protection_end(void) +{ + hibernate_restore_protection_active = false; +} + +static inline void hibernate_restore_protect_page(void *page_address) +{ + if (hibernate_restore_protection_active) + set_memory_ro((unsigned long)page_address, 1); +} + +static inline void hibernate_restore_unprotect_page(void *page_address) +{ + if (hibernate_restore_protection_active) + set_memory_rw((unsigned long)page_address, 1); +} +#else +static inline void hibernate_restore_protection_begin(void) {} +static inline void hibernate_restore_protection_end(void) {} +static inline void hibernate_restore_protect_page(void *page_address) {} +static inline void hibernate_restore_unprotect_page(void *page_address) {} +#endif /* CONFIG_DEBUG_RODATA */ + static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); @@ -67,25 +104,32 @@ void __init hibernate_image_size_init(void) image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; } -/* List of PBEs needed for restoring the pages that were allocated before +/* + * List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been * allocated by the "resume" kernel, so their contents cannot be written * directly to their "original" page frames. */ struct pbe *restore_pblist; -/* Pointer to an auxiliary buffer (1 page) */ -static void *buffer; +/* struct linked_page is used to build chains of pages */ -/** - * @safe_needed - on resume, for storing the PBE list and the image, - * we can only use memory pages that do not conflict with the pages - * used before suspend. The unsafe pages have PageNosaveFree set - * and we count them using unsafe_pages. - * - * Each allocated image page is marked as PageNosave and PageNosaveFree - * so that swsusp_free() can release it. +#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) + +struct linked_page { + struct linked_page *next; + char data[LINKED_PAGE_DATA_SIZE]; +} __packed; + +/* + * List of "safe" pages (ie. pages that were not used by the image kernel + * before hibernation) that may be used as temporary storage for image kernel + * memory contents. */ +static struct linked_page *safe_pages_list; + +/* Pointer to an auxiliary buffer (1 page) */ +static void *buffer; #define PG_ANY 0 #define PG_SAFE 1 @@ -94,6 +138,19 @@ static void *buffer; static unsigned int allocated_unsafe_pages; +/** + * get_image_page - Allocate a page for a hibernation image. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages that were not used before hibernation (restore only) + * + * During image restoration, for storing the PBE list and the image data, we can + * only use memory pages that do not conflict with the pages used before + * hibernation. The "unsafe" pages have PageNosaveFree set and we count them + * using allocated_unsafe_pages. + * + * Each allocated image page is marked as PageNosave and PageNosaveFree so that + * swsusp_free() can release it. + */ static void *get_image_page(gfp_t gfp_mask, int safe_needed) { void *res; @@ -113,9 +170,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) return res; } +static void *__get_safe_page(gfp_t gfp_mask) +{ + if (safe_pages_list) { + void *ret = safe_pages_list; + + safe_pages_list = safe_pages_list->next; + memset(ret, 0, PAGE_SIZE); + return ret; + } + return get_image_page(gfp_mask, PG_SAFE); +} + unsigned long get_safe_page(gfp_t gfp_mask) { - return (unsigned long)get_image_page(gfp_mask, PG_SAFE); + return (unsigned long)__get_safe_page(gfp_mask); } static struct page *alloc_image_page(gfp_t gfp_mask) @@ -130,11 +199,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask) return page; } +static void recycle_safe_page(void *page_address) +{ + struct linked_page *lp = page_address; + + lp->next = safe_pages_list; + safe_pages_list = lp; +} + /** - * free_image_page - free page represented by @addr, allocated with - * get_image_page (page flags set by it must be cleared) + * free_image_page - Free a page allocated for hibernation image. + * @addr: Address of the page to free. + * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page. + * + * The page to free should have been allocated by get_image_page() (page flags + * set by it are affected). */ - static inline void free_image_page(void *addr, int clear_nosave_free) { struct page *page; @@ -150,17 +230,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free) __free_page(page); } -/* struct linked_page is used to build chains of pages */ - -#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) - -struct linked_page { - struct linked_page *next; - char data[LINKED_PAGE_DATA_SIZE]; -} __packed; - -static inline void -free_list_of_pages(struct linked_page *list, int clear_page_nosave) +static inline void free_list_of_pages(struct linked_page *list, + int clear_page_nosave) { while (list) { struct linked_page *lp = list->next; @@ -170,30 +241,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave) } } -/** - * struct chain_allocator is used for allocating small objects out of - * a linked list of pages called 'the chain'. - * - * The chain grows each time when there is no room for a new object in - * the current page. The allocated objects cannot be freed individually. - * It is only possible to free them all at once, by freeing the entire - * chain. - * - * NOTE: The chain allocator may be inefficient if the allocated objects - * are not much smaller than PAGE_SIZE. - */ - +/* + * struct chain_allocator is used for allocating small objects out of + * a linked list of pages called 'the chain'. + * + * The chain grows each time when there is no room for a new object in + * the current page. The allocated objects cannot be freed individually. + * It is only possible to free them all at once, by freeing the entire + * chain. + * + * NOTE: The chain allocator may be inefficient if the allocated objects + * are not much smaller than PAGE_SIZE. + */ struct chain_allocator { struct linked_page *chain; /* the chain */ unsigned int used_space; /* total size of objects allocated out - * of the current page - */ + of the current page */ gfp_t gfp_mask; /* mask for allocating pages */ int safe_needed; /* if set, only "safe" pages are allocated */ }; -static void -chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) +static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask, + int safe_needed) { ca->chain = NULL; ca->used_space = LINKED_PAGE_DATA_SIZE; @@ -208,7 +277,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { struct linked_page *lp; - lp = get_image_page(ca->gfp_mask, ca->safe_needed); + lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) : + get_image_page(ca->gfp_mask, PG_ANY); if (!lp) return NULL; @@ -222,44 +292,44 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) } /** - * Data types related to memory bitmaps. + * Data types related to memory bitmaps. * - * Memory bitmap is a structure consiting of many linked lists of - * objects. The main list's elements are of type struct zone_bitmap - * and each of them corresonds to one zone. For each zone bitmap - * object there is a list of objects of type struct bm_block that - * represent each blocks of bitmap in which information is stored. + * Memory bitmap is a structure consiting of many linked lists of + * objects. The main list's elements are of type struct zone_bitmap + * and each of them corresonds to one zone. For each zone bitmap + * object there is a list of objects of type struct bm_block that + * represent each blocks of bitmap in which information is stored. * - * struct memory_bitmap contains a pointer to the main list of zone - * bitmap objects, a struct bm_position used for browsing the bitmap, - * and a pointer to the list of pages used for allocating all of the - * zone bitmap objects and bitmap block objects. + * struct memory_bitmap contains a pointer to the main list of zone + * bitmap objects, a struct bm_position used for browsing the bitmap, + * and a pointer to the list of pages used for allocating all of the + * zone bitmap objects and bitmap block objects. * - * NOTE: It has to be possible to lay out the bitmap in memory - * using only allocations of order 0. Additionally, the bitmap is - * designed to work with arbitrary number of zones (this is over the - * top for now, but let's avoid making unnecessary assumptions ;-). + * NOTE: It has to be possible to lay out the bitmap in memory + * using only allocations of order 0. Additionally, the bitmap is + * designed to work with arbitrary number of zones (this is over the + * top for now, but let's avoid making unnecessary assumptions ;-). * - * struct zone_bitmap contains a pointer to a list of bitmap block - * objects and a pointer to the bitmap block object that has been - * most recently used for setting bits. Additionally, it contains the - * pfns that correspond to the start and end of the represented zone. + * struct zone_bitmap contains a pointer to a list of bitmap block + * objects and a pointer to the bitmap block object that has been + * most recently used for setting bits. Additionally, it contains the + * PFNs that correspond to the start and end of the represented zone. * - * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bitmap) - * It also contains the pfns that correspond to the start and end of - * the represented memory area. + * struct bm_block contains a pointer to the memory page in which + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. * - * The memory bitmap is organized as a radix tree to guarantee fast random - * access to the bits. There is one radix tree for each zone (as returned - * from create_mem_extents). + * The memory bitmap is organized as a radix tree to guarantee fast random + * access to the bits. There is one radix tree for each zone (as returned + * from create_mem_extents). * - * One radix tree is represented by one struct mem_zone_bm_rtree. There are - * two linked lists for the nodes of the tree, one for the inner nodes and - * one for the leave nodes. The linked leave nodes are used for fast linear - * access of the memory bitmap. + * One radix tree is represented by one struct mem_zone_bm_rtree. There are + * two linked lists for the nodes of the tree, one for the inner nodes and + * one for the leave nodes. The linked leave nodes are used for fast linear + * access of the memory bitmap. * - * The struct rtree_node represents one node of the radix tree. + * The struct rtree_node represents one node of the radix tree. */ #define BM_END_OF_MAP (~0UL) @@ -305,9 +375,8 @@ struct bm_position { struct memory_bitmap { struct list_head zones; struct linked_page *p_list; /* list of pages used to store zone - * bitmap objects and bitmap block - * objects - */ + bitmap objects and bitmap block + objects */ struct bm_position cur; /* most recently used bit position */ }; @@ -321,12 +390,12 @@ struct memory_bitmap { #endif #define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) -/* - * alloc_rtree_node - Allocate a new node and add it to the radix tree. +/** + * alloc_rtree_node - Allocate a new node and add it to the radix tree. * - * This function is used to allocate inner nodes as well as the - * leave nodes of the radix tree. It also adds the node to the - * corresponding linked list passed in by the *list parameter. + * This function is used to allocate inner nodes as well as the + * leave nodes of the radix tree. It also adds the node to the + * corresponding linked list passed in by the *list parameter. */ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, struct chain_allocator *ca, @@ -347,12 +416,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, return node; } -/* - * add_rtree_block - Add a new leave node to the radix tree +/** + * add_rtree_block - Add a new leave node to the radix tree. * - * The leave nodes need to be allocated in order to keep the leaves - * linked list in order. This is guaranteed by the zone->blocks - * counter. + * The leave nodes need to be allocated in order to keep the leaves + * linked list in order. This is guaranteed by the zone->blocks + * counter. */ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, int safe_needed, struct chain_allocator *ca) @@ -417,17 +486,18 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, int clear_nosave_free); -/* - * create_zone_bm_rtree - create a radix tree for one zone +/** + * create_zone_bm_rtree - Create a radix tree for one zone. * - * Allocated the mem_zone_bm_rtree structure and initializes it. - * This function also allocated and builds the radix tree for the - * zone. + * Allocated the mem_zone_bm_rtree structure and initializes it. + * This function also allocated and builds the radix tree for the + * zone. */ -static struct mem_zone_bm_rtree * -create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, - struct chain_allocator *ca, - unsigned long start, unsigned long end) +static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask, + int safe_needed, + struct chain_allocator *ca, + unsigned long start, + unsigned long end) { struct mem_zone_bm_rtree *zone; unsigned int i, nr_blocks; @@ -454,12 +524,12 @@ create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, return zone; } -/* - * free_zone_bm_rtree - Free the memory of the radix tree +/** + * free_zone_bm_rtree - Free the memory of the radix tree. * - * Free all node pages of the radix tree. The mem_zone_bm_rtree - * structure itself is not freed here nor are the rtree_node - * structs. + * Free all node pages of the radix tree. The mem_zone_bm_rtree + * structure itself is not freed here nor are the rtree_node + * structs. */ static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, int clear_nosave_free) @@ -492,8 +562,8 @@ struct mem_extent { }; /** - * free_mem_extents - free a list of memory extents - * @list - list of extents to empty + * free_mem_extents - Free a list of memory extents. + * @list: List of extents to free. */ static void free_mem_extents(struct list_head *list) { @@ -506,10 +576,11 @@ static void free_mem_extents(struct list_head *list) } /** - * create_mem_extents - create a list of memory extents representing - * contiguous ranges of PFNs - * @list - list to put the extents into - * @gfp_mask - mask to use for memory allocations + * create_mem_extents - Create a list of memory extents. + * @list: List to put the extents into. + * @gfp_mask: Mask to use for memory allocations. + * + * The extents represent contiguous ranges of PFNs. */ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { @@ -565,10 +636,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) } /** - * memory_bm_create - allocate memory for a memory bitmap - */ -static int -memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) + * memory_bm_create - Allocate memory for a memory bitmap. + */ +static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, + int safe_needed) { struct chain_allocator ca; struct list_head mem_extents; @@ -607,8 +678,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) } /** - * memory_bm_free - free memory occupied by the memory bitmap @bm - */ + * memory_bm_free - Free memory occupied by the memory bitmap. + * @bm: Memory bitmap. + */ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { struct mem_zone_bm_rtree *zone; @@ -622,14 +694,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) } /** - * memory_bm_find_bit - Find the bit for pfn in the memory - * bitmap + * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap. * - * Find the bit in the bitmap @bm that corresponds to given pfn. - * The cur.zone, cur.block and cur.node_pfn member of @bm are - * updated. - * It walks the radix tree to find the page which contains the bit for - * pfn and returns the bit position in **addr and *bit_nr. + * Find the bit in memory bitmap @bm that corresponds to the given PFN. + * The cur.zone, cur.block and cur.node_pfn members of @bm are updated. + * + * Walk the radix tree to find the page containing the bit that represents @pfn + * and return the position of the bit in @addr and @bit_nr. */ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) @@ -658,10 +729,9 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, zone_found: /* - * We have a zone. Now walk the radix tree to find the leave - * node for our pfn. + * We have found the zone. Now walk the radix tree to find the leaf node + * for our PFN. */ - node = bm->cur.node; if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) goto node_found; @@ -754,20 +824,20 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) } /* - * rtree_next_node - Jumps to the next leave node + * rtree_next_node - Jump to the next leaf node. * - * Sets the position to the beginning of the next node in the - * memory bitmap. This is either the next node in the current - * zone's radix tree or the first node in the radix tree of the - * next zone. + * Set the position to the beginning of the next node in the + * memory bitmap. This is either the next node in the current + * zone's radix tree or the first node in the radix tree of the + * next zone. * - * Returns true if there is a next node, false otherwise. + * Return true if there is a next node, false otherwise. */ static bool rtree_next_node(struct memory_bitmap *bm) { - bm->cur.node = list_entry(bm->cur.node->list.next, - struct rtree_node, list); - if (&bm->cur.node->list != &bm->cur.zone->leaves) { + if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) { + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); bm->cur.node_pfn += BM_BITS_PER_BLOCK; bm->cur.node_bit = 0; touch_softlockup_watchdog(); @@ -775,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /* No more nodes, goto next zone */ - bm->cur.zone = list_entry(bm->cur.zone->list.next, + if (!list_is_last(&bm->cur.zone->list, &bm->zones)) { + bm->cur.zone = list_entry(bm->cur.zone->list.next, struct mem_zone_bm_rtree, list); - if (&bm->cur.zone->list != &bm->zones) { bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); bm->cur.node_pfn = 0; @@ -790,14 +860,15 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /** - * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm + * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap. + * @bm: Memory bitmap. * - * Starting from the last returned position this function searches - * for the next set bit in the memory bitmap and returns its - * number. If no more bit is set BM_END_OF_MAP is returned. + * Starting from the last returned position this function searches for the next + * set bit in @bm and returns the PFN represented by it. If no more bits are + * set, BM_END_OF_MAP is returned. * - * It is required to run memory_bm_position_reset() before the - * first call to this function. + * It is required to run memory_bm_position_reset() before the first call to + * this function for the given memory bitmap. */ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { @@ -819,11 +890,10 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) return BM_END_OF_MAP; } -/** - * This structure represents a range of page frames the contents of which - * should not be saved during the suspend. +/* + * This structure represents a range of page frames the contents of which + * should not be saved during hibernation. */ - struct nosave_region { struct list_head list; unsigned long start_pfn; @@ -832,15 +902,42 @@ struct nosave_region { static LIST_HEAD(nosave_regions); +static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone) +{ + struct rtree_node *node; + + list_for_each_entry(node, &zone->nodes, list) + recycle_safe_page(node->data); + + list_for_each_entry(node, &zone->leaves, list) + recycle_safe_page(node->data); +} + +static void memory_bm_recycle(struct memory_bitmap *bm) +{ + struct mem_zone_bm_rtree *zone; + struct linked_page *p_list; + + list_for_each_entry(zone, &bm->zones, list) + recycle_zone_bm_rtree(zone); + + p_list = bm->p_list; + while (p_list) { + struct linked_page *lp = p_list; + + p_list = lp->next; + recycle_safe_page(lp); + } +} + /** - * register_nosave_region - register a range of page frames the contents - * of which should not be saved during the suspend (to be used in the early - * initialization code) + * register_nosave_region - Register a region of unsaveable memory. + * + * Register a range of page frames the contents of which should not be saved + * during hibernation (to be used in the early initialization code). */ - -void __init -__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, - int use_kmalloc) +void __init __register_nosave_region(unsigned long start_pfn, + unsigned long end_pfn, int use_kmalloc) { struct nosave_region *region; @@ -857,12 +954,13 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, } } if (use_kmalloc) { - /* during init, this shouldn't fail */ + /* During init, this shouldn't fail */ region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); BUG_ON(!region); - } else + } else { /* This allocation cannot fail */ region = memblock_virt_alloc(sizeof(struct nosave_region), 0); + } region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); @@ -923,10 +1021,12 @@ static void swsusp_unset_page_forbidden(struct page *page) } /** - * mark_nosave_pages - set bits corresponding to the page frames the - * contents of which should not be saved in a given bitmap. + * mark_nosave_pages - Mark pages that should not be saved. + * @bm: Memory bitmap. + * + * Set the bits in @bm that correspond to the page frames the contents of which + * should not be saved. */ - static void mark_nosave_pages(struct memory_bitmap *bm) { struct nosave_region *region; @@ -956,13 +1056,13 @@ static void mark_nosave_pages(struct memory_bitmap *bm) } /** - * create_basic_memory_bitmaps - create bitmaps needed for marking page - * frames that should not be saved and free page frames. The pointers - * forbidden_pages_map and free_pages_map are only modified if everything - * goes well, because we don't want the bits to be used before both bitmaps - * are set up. + * create_basic_memory_bitmaps - Create bitmaps to hold basic page information. + * + * Create bitmaps needed for marking page frames that should not be saved and + * free page frames. The forbidden_pages_map and free_pages_map pointers are + * only modified if everything goes well, because we don't want the bits to be + * touched before both bitmaps are set up. */ - int create_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; @@ -1007,12 +1107,12 @@ int create_basic_memory_bitmaps(void) } /** - * free_basic_memory_bitmaps - free memory bitmaps allocated by - * create_basic_memory_bitmaps(). The auxiliary pointers are necessary - * so that the bitmaps themselves are not referred to while they are being - * freed. + * free_basic_memory_bitmaps - Free memory bitmaps holding basic information. + * + * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The + * auxiliary pointers are necessary so that the bitmaps themselves are not + * referred to while they are being freed. */ - void free_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; @@ -1033,11 +1133,13 @@ void free_basic_memory_bitmaps(void) } /** - * snapshot_additional_pages - estimate the number of additional pages - * be needed for setting up the suspend image data structures for given - * zone (usually the returned value is greater than the exact number) + * snapshot_additional_pages - Estimate the number of extra pages needed. + * @zone: Memory zone to carry out the computation for. + * + * Estimate the number of additional pages needed for setting up a hibernation + * image data structures for @zone (usually, the returned value is greater than + * the exact number). */ - unsigned int snapshot_additional_pages(struct zone *zone) { unsigned int rtree, nodes; @@ -1055,10 +1157,10 @@ unsigned int snapshot_additional_pages(struct zone *zone) #ifdef CONFIG_HIGHMEM /** - * count_free_highmem_pages - compute the total number of free highmem - * pages, system-wide. + * count_free_highmem_pages - Compute the total number of free highmem pages. + * + * The returned number is system-wide. */ - static unsigned int count_free_highmem_pages(void) { struct zone *zone; @@ -1072,11 +1174,12 @@ static unsigned int count_free_highmem_pages(void) } /** - * saveable_highmem_page - Determine whether a highmem page should be - * included in the suspend image. + * saveable_highmem_page - Check if a highmem page is saveable. + * + * Determine whether a highmem page should be included in a hibernation image. * - * We should save the page if it isn't Nosave or NosaveFree, or Reserved, - * and it isn't a part of a free chunk of pages. + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't part of a free chunk of pages. */ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { @@ -1102,10 +1205,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) } /** - * count_highmem_pages - compute the total number of saveable highmem - * pages. + * count_highmem_pages - Compute the total number of saveable highmem pages. */ - static unsigned int count_highmem_pages(void) { struct zone *zone; @@ -1133,12 +1234,14 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p) #endif /* CONFIG_HIGHMEM */ /** - * saveable_page - Determine whether a non-highmem page should be included - * in the suspend image. + * saveable_page - Check if the given page is saveable. + * + * Determine whether a non-highmem page should be included in a hibernation + * image. * - * We should save the page if it isn't Nosave, and is not in the range - * of pages statically defined as 'unsaveable', and it isn't a part of - * a free chunk of pages. + * We should save the page if it isn't Nosave, and is not in the range + * of pages statically defined as 'unsaveable', and it isn't part of + * a free chunk of pages. */ static struct page *saveable_page(struct zone *zone, unsigned long pfn) { @@ -1167,10 +1270,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) } /** - * count_data_pages - compute the total number of saveable non-highmem - * pages. + * count_data_pages - Compute the total number of saveable non-highmem pages. */ - static unsigned int count_data_pages(void) { struct zone *zone; @@ -1190,7 +1291,8 @@ static unsigned int count_data_pages(void) return n; } -/* This is needed, because copy_page and memcpy are not usable for copying +/* + * This is needed, because copy_page and memcpy are not usable for copying * task structs. */ static inline void do_copy_page(long *dst, long *src) @@ -1201,12 +1303,12 @@ static inline void do_copy_page(long *dst, long *src) *dst++ = *src++; } - /** - * safe_copy_page - check if the page we are going to copy is marked as - * present in the kernel page tables (this always is the case if - * CONFIG_DEBUG_PAGEALLOC is not set and in that case - * kernel_page_present() always returns 'true'). + * safe_copy_page - Copy a page in a safe way. + * + * Check if the page we are going to copy is marked as present in the kernel + * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set + * and in that case kernel_page_present() always returns 'true'). */ static void safe_copy_page(void *dst, struct page *s_page) { @@ -1219,10 +1321,8 @@ static void safe_copy_page(void *dst, struct page *s_page) } } - #ifdef CONFIG_HIGHMEM -static inline struct page * -page_is_saveable(struct zone *zone, unsigned long pfn) +static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn) { return is_highmem(zone) ? saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); @@ -1243,7 +1343,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) kunmap_atomic(src); } else { if (PageHighMem(d_page)) { - /* Page pointed to by src may contain some kernel + /* + * The page pointed to by src may contain some kernel * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); @@ -1265,8 +1366,8 @@ static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) } #endif /* CONFIG_HIGHMEM */ -static void -copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) +static void copy_data_pages(struct memory_bitmap *copy_bm, + struct memory_bitmap *orig_bm) { struct zone *zone; unsigned long pfn; @@ -1315,12 +1416,11 @@ static struct memory_bitmap orig_bm; static struct memory_bitmap copy_bm; /** - * swsusp_free - free pages allocated for the suspend. + * swsusp_free - Free pages allocated for hibernation image. * - * Suspend pages are alocated before the atomic copy is made, so we - * need to release them after the resume. + * Image pages are alocated before snapshot creation, so they need to be + * released after resume. */ - void swsusp_free(void) { unsigned long fb_pfn, fr_pfn; @@ -1351,6 +1451,7 @@ loop: memory_bm_clear_current(forbidden_pages_map); memory_bm_clear_current(free_pages_map); + hibernate_restore_unprotect_page(page_address(page)); __free_page(page); goto loop; } @@ -1362,6 +1463,7 @@ out: buffer = NULL; alloc_normal = 0; alloc_highmem = 0; + hibernate_restore_protection_end(); } /* Helper functions used for the shrinking of memory. */ @@ -1369,7 +1471,7 @@ out: #define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) /** - * preallocate_image_pages - Allocate a number of pages for hibernation image + * preallocate_image_pages - Allocate a number of pages for hibernation image. * @nr_pages: Number of page frames to allocate. * @mask: GFP flags to use for the allocation. * @@ -1419,7 +1521,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages) } /** - * __fraction - Compute (an approximation of) x * (multiplier / base) + * __fraction - Compute (an approximation of) x * (multiplier / base). */ static unsigned long __fraction(u64 x, u64 multiplier, u64 base) { @@ -1429,8 +1531,8 @@ static unsigned long __fraction(u64 x, u64 multiplier, u64 base) } static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, - unsigned long highmem, - unsigned long total) + unsigned long highmem, + unsigned long total) { unsigned long alloc = __fraction(nr_pages, highmem, total); @@ -1443,15 +1545,15 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) } static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, - unsigned long highmem, - unsigned long total) + unsigned long highmem, + unsigned long total) { return 0; } #endif /* CONFIG_HIGHMEM */ /** - * free_unnecessary_pages - Release preallocated pages not needed for the image + * free_unnecessary_pages - Release preallocated pages not needed for the image. */ static unsigned long free_unnecessary_pages(void) { @@ -1505,7 +1607,7 @@ static unsigned long free_unnecessary_pages(void) } /** - * minimum_image_size - Estimate the minimum acceptable size of an image + * minimum_image_size - Estimate the minimum acceptable size of an image. * @saveable: Number of saveable pages in the system. * * We want to avoid attempting to free too much memory too hard, so estimate the @@ -1525,17 +1627,17 @@ static unsigned long minimum_image_size(unsigned long saveable) unsigned long size; size = global_page_state(NR_SLAB_RECLAIMABLE) - + global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_FILE) - - global_page_state(NR_FILE_MAPPED); + + global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE) + - global_node_page_state(NR_FILE_MAPPED); return saveable <= size ? 0 : saveable - size; } /** - * hibernate_preallocate_memory - Preallocate memory for hibernation image + * hibernate_preallocate_memory - Preallocate memory for hibernation image. * * To create a hibernation image it is necessary to make a copy of every page * frame in use. We also need a number of page frames to be free during @@ -1708,10 +1810,11 @@ int hibernate_preallocate_memory(void) #ifdef CONFIG_HIGHMEM /** - * count_pages_for_highmem - compute the number of non-highmem pages - * that will be necessary for creating copies of highmem pages. - */ - + * count_pages_for_highmem - Count non-highmem pages needed for copying highmem. + * + * Compute the number of non-highmem pages that will be necessary for creating + * copies of highmem pages. + */ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; @@ -1724,15 +1827,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) return nr_highmem; } #else -static unsigned int -count_pages_for_highmem(unsigned int nr_highmem) { return 0; } +static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; } #endif /* CONFIG_HIGHMEM */ /** - * enough_free_mem - Make sure we have enough free memory for the - * snapshot image. + * enough_free_mem - Check if there is enough free memory for the image. */ - static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) { struct zone *zone; @@ -1751,10 +1851,11 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) #ifdef CONFIG_HIGHMEM /** - * get_highmem_buffer - if there are some highmem pages in the suspend - * image, we may need the buffer to copy them and/or load their data. + * get_highmem_buffer - Allocate a buffer for highmem pages. + * + * If there are some highmem pages in the hibernation image, we may need a + * buffer to copy them and/or load their data. */ - static inline int get_highmem_buffer(int safe_needed) { buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); @@ -1762,13 +1863,13 @@ static inline int get_highmem_buffer(int safe_needed) } /** - * alloc_highmem_image_pages - allocate some highmem pages for the image. - * Try to allocate as many pages as needed, but if the number of free - * highmem pages is lesser than that, allocate them all. + * alloc_highmem_image_pages - Allocate some highmem pages for the image. + * + * Try to allocate as many pages as needed, but if the number of free highmem + * pages is less than that, allocate them all. */ - -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, + unsigned int nr_highmem) { unsigned int to_alloc = count_free_highmem_pages(); @@ -1787,25 +1888,24 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) #else static inline int get_highmem_buffer(int safe_needed) { return 0; } -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, + unsigned int n) { return 0; } #endif /* CONFIG_HIGHMEM */ /** - * swsusp_alloc - allocate memory for the suspend image + * swsusp_alloc - Allocate memory for hibernation image. * - * We first try to allocate as many highmem pages as there are - * saveable highmem pages in the system. If that fails, we allocate - * non-highmem pages for the copies of the remaining highmem ones. + * We first try to allocate as many highmem pages as there are + * saveable highmem pages in the system. If that fails, we allocate + * non-highmem pages for the copies of the remaining highmem ones. * - * In this approach it is likely that the copies of highmem pages will - * also be located in the high memory, because of the way in which - * copy_data_pages() works. + * In this approach it is likely that the copies of highmem pages will + * also be located in the high memory, because of the way in which + * copy_data_pages() works. */ - -static int -swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, - unsigned int nr_pages, unsigned int nr_highmem) +static int swsusp_alloc(struct memory_bitmap *orig_bm, + struct memory_bitmap *copy_bm, + unsigned int nr_pages, unsigned int nr_highmem) { if (nr_highmem > 0) { if (get_highmem_buffer(PG_ANY)) @@ -1855,7 +1955,8 @@ asmlinkage __visible int swsusp_save(void) return -ENOMEM; } - /* During allocating of suspend pagedir, new cold pages may appear. + /* + * During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ drain_local_pages(NULL); @@ -1918,12 +2019,14 @@ static int init_header(struct swsusp_info *info) } /** - * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm - * are stored in the array @buf[] (1 page at a time) + * pack_pfns - Prepare PFNs for saving. + * @bm: Memory bitmap. + * @buf: Memory buffer to store the PFNs in. + * + * PFNs corresponding to set bits in @bm are stored in the area of memory + * pointed to by @buf (1 page at a time). */ - -static inline void -pack_pfns(unsigned long *buf, struct memory_bitmap *bm) +static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; @@ -1937,22 +2040,21 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) } /** - * snapshot_read_next - used for reading the system memory snapshot. + * snapshot_read_next - Get the address to read the next image page from. + * @handle: Snapshot handle to be used for the reading. * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. + * On the first call, @handle should point to a zeroed snapshot_handle + * structure. The structure gets populated then and a pointer to it should be + * passed to this function every next time. * - * On success the function returns a positive number. Then, the caller - * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. + * On success, the function returns a positive number. Then, the caller + * is allowed to read up to the returned number of bytes from the memory + * location computed by the data_of() macro. * - * The function returns 0 to indicate the end of data stream condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. + * The function returns 0 to indicate the end of the data stream condition, + * and negative numbers are returned on errors. If that happens, the structure + * pointed to by @handle is not updated and should not be used any more. */ - int snapshot_read_next(struct snapshot_handle *handle) { if (handle->cur > nr_meta_pages + nr_copy_pages) @@ -1981,7 +2083,8 @@ int snapshot_read_next(struct snapshot_handle *handle) page = pfn_to_page(memory_bm_next_pfn(©_bm)); if (PageHighMem(page)) { - /* Highmem pages are copied to the buffer, + /* + * Highmem pages are copied to the buffer, * because we can't return with a kmapped * highmem page (we may not be called again). */ @@ -1999,53 +2102,41 @@ int snapshot_read_next(struct snapshot_handle *handle) return PAGE_SIZE; } -/** - * mark_unsafe_pages - mark the pages that cannot be used for storing - * the image during resume, because they conflict with the pages that - * had been used before suspend - */ - -static int mark_unsafe_pages(struct memory_bitmap *bm) +static void duplicate_memory_bitmap(struct memory_bitmap *dst, + struct memory_bitmap *src) { - struct zone *zone; - unsigned long pfn, max_zone_pfn; + unsigned long pfn; - /* Clear page flags */ - for_each_populated_zone(zone) { - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) - swsusp_unset_page_free(pfn_to_page(pfn)); + memory_bm_position_reset(src); + pfn = memory_bm_next_pfn(src); + while (pfn != BM_END_OF_MAP) { + memory_bm_set_bit(dst, pfn); + pfn = memory_bm_next_pfn(src); } - - /* Mark pages that correspond to the "original" pfns as "unsafe" */ - memory_bm_position_reset(bm); - do { - pfn = memory_bm_next_pfn(bm); - if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn))) - swsusp_set_page_free(pfn_to_page(pfn)); - else - return -EFAULT; - } - } while (pfn != BM_END_OF_MAP); - - allocated_unsafe_pages = 0; - - return 0; } -static void -duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) +/** + * mark_unsafe_pages - Mark pages that were used before hibernation. + * + * Mark the pages that cannot be used for storing the image during restoration, + * because they conflict with the pages that had been used before hibernation. + */ +static void mark_unsafe_pages(struct memory_bitmap *bm) { unsigned long pfn; - memory_bm_position_reset(src); - pfn = memory_bm_next_pfn(src); + /* Clear the "free"/"unsafe" bit for all PFNs */ + memory_bm_position_reset(free_pages_map); + pfn = memory_bm_next_pfn(free_pages_map); while (pfn != BM_END_OF_MAP) { - memory_bm_set_bit(dst, pfn); - pfn = memory_bm_next_pfn(src); + memory_bm_clear_current(free_pages_map); + pfn = memory_bm_next_pfn(free_pages_map); } + + /* Mark pages that correspond to the "original" PFNs as "unsafe" */ + duplicate_memory_bitmap(free_pages_map, bm); + + allocated_unsafe_pages = 0; } static int check_header(struct swsusp_info *info) @@ -2063,11 +2154,9 @@ static int check_header(struct swsusp_info *info) } /** - * load header - check the image header and copy data from it + * load header - Check the image header and copy the data from it. */ - -static int -load_header(struct swsusp_info *info) +static int load_header(struct swsusp_info *info) { int error; @@ -2081,8 +2170,12 @@ load_header(struct swsusp_info *info) } /** - * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set - * the corresponding bit in the memory bitmap @bm + * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap. + * @bm: Memory bitmap. + * @buf: Area of memory containing the PFNs. + * + * For each element of the array pointed to by @buf (1 page at a time), set the + * corresponding bit in @bm. */ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) { @@ -2095,7 +2188,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) /* Extract and buffer page key for data page (s390 only). */ page_key_memorize(buf + j); - if (memory_bm_pfn_present(bm, buf[j])) + if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) memory_bm_set_bit(bm, buf[j]); else return -EFAULT; @@ -2104,13 +2197,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) return 0; } -/* List of "safe" pages that may be used to store data loaded from the suspend - * image - */ -static struct linked_page *safe_pages_list; - #ifdef CONFIG_HIGHMEM -/* struct highmem_pbe is used for creating the list of highmem pages that +/* + * struct highmem_pbe is used for creating the list of highmem pages that * should be restored atomically during the resume from disk, because the page * frames they have occupied before the suspend are in use. */ @@ -2120,7 +2209,8 @@ struct highmem_pbe { struct highmem_pbe *next; }; -/* List of highmem PBEs needed for restoring the highmem pages that were +/* + * List of highmem PBEs needed for restoring the highmem pages that were * allocated before the suspend and included in the suspend image, but have * also been allocated by the "resume" kernel, so their contents cannot be * written directly to their "original" page frames. @@ -2128,11 +2218,11 @@ struct highmem_pbe { static struct highmem_pbe *highmem_pblist; /** - * count_highmem_image_pages - compute the number of highmem pages in the - * suspend image. The bits in the memory bitmap @bm that correspond to the - * image pages are assumed to be set. + * count_highmem_image_pages - Compute the number of highmem pages in the image. + * @bm: Memory bitmap. + * + * The bits in @bm that correspond to image pages are assumed to be set. */ - static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { unsigned long pfn; @@ -2149,24 +2239,25 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) return cnt; } -/** - * prepare_highmem_image - try to allocate as many highmem pages as - * there are highmem image pages (@nr_highmem_p points to the variable - * containing the number of highmem image pages). The pages that are - * "safe" (ie. will not be overwritten when the suspend image is - * restored) have the corresponding bits set in @bm (it must be - * unitialized). - * - * NOTE: This function should not be called if there are no highmem - * image pages. - */ - static unsigned int safe_highmem_pages; static struct memory_bitmap *safe_highmem_bm; -static int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) +/** + * prepare_highmem_image - Allocate memory for loading highmem data from image. + * @bm: Pointer to an uninitialized memory bitmap structure. + * @nr_highmem_p: Pointer to the number of highmem image pages. + * + * Try to allocate as many highmem pages as there are highmem image pages + * (@nr_highmem_p points to the variable containing the number of highmem image + * pages). The pages that are "safe" (ie. will not be overwritten when the + * hibernation image is restored entirely) have the corresponding bits set in + * @bm (it must be unitialized). + * + * NOTE: This function should not be called if there are no highmem image pages. + */ +static int prepare_highmem_image(struct memory_bitmap *bm, + unsigned int *nr_highmem_p) { unsigned int to_alloc; @@ -2201,39 +2292,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) return 0; } +static struct page *last_highmem_page; + /** - * get_highmem_page_buffer - for given highmem image page find the buffer - * that suspend_write_next() should set for its caller to write to. + * get_highmem_page_buffer - Prepare a buffer to store a highmem image page. * - * If the page is to be saved to its "original" page frame or a copy of - * the page is to be made in the highmem, @buffer is returned. Otherwise, - * the copy of the page is to be made in normal memory, so the address of - * the copy is returned. + * For a given highmem image page get a buffer that suspend_write_next() should + * return to its caller to write to. * - * If @buffer is returned, the caller of suspend_write_next() will write - * the page's contents to @buffer, so they will have to be copied to the - * right location on the next call to suspend_write_next() and it is done - * with the help of copy_last_highmem_page(). For this purpose, if - * @buffer is returned, @last_highmem page is set to the page to which - * the data will have to be copied from @buffer. + * If the page is to be saved to its "original" page frame or a copy of + * the page is to be made in the highmem, @buffer is returned. Otherwise, + * the copy of the page is to be made in normal memory, so the address of + * the copy is returned. + * + * If @buffer is returned, the caller of suspend_write_next() will write + * the page's contents to @buffer, so they will have to be copied to the + * right location on the next call to suspend_write_next() and it is done + * with the help of copy_last_highmem_page(). For this purpose, if + * @buffer is returned, @last_highmem_page is set to the page to which + * the data will have to be copied from @buffer. */ - -static struct page *last_highmem_page; - -static void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +static void *get_highmem_page_buffer(struct page *page, + struct chain_allocator *ca) { struct highmem_pbe *pbe; void *kaddr; if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { - /* We have allocated the "original" page frame and we can + /* + * We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ last_highmem_page = page; return buffer; } - /* The "original" page frame has not been allocated and we have to + /* + * The "original" page frame has not been allocated and we have to * use a "safe" page frame to store the loaded page. */ pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); @@ -2263,11 +2357,12 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) } /** - * copy_last_highmem_page - copy the contents of a highmem image from - * @buffer, where the caller of snapshot_write_next() has place them, - * to the right location represented by @last_highmem_page . + * copy_last_highmem_page - Copy most the most recent highmem image page. + * + * Copy the contents of a highmem image from @buffer, where the caller of + * snapshot_write_next() has stored them, to the right location represented by + * @last_highmem_page . */ - static void copy_last_highmem_page(void) { if (last_highmem_page) { @@ -2294,17 +2389,13 @@ static inline void free_highmem_data(void) free_image_page(buffer, PG_UNSAFE_CLEAR); } #else -static unsigned int -count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } +static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } -static inline int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) -{ - return 0; -} +static inline int prepare_highmem_image(struct memory_bitmap *bm, + unsigned int *nr_highmem_p) { return 0; } -static inline void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +static inline void *get_highmem_page_buffer(struct page *page, + struct chain_allocator *ca) { return ERR_PTR(-EINVAL); } @@ -2314,27 +2405,27 @@ static inline int last_highmem_page_copied(void) { return 1; } static inline void free_highmem_data(void) {} #endif /* CONFIG_HIGHMEM */ +#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) + /** - * prepare_image - use the memory bitmap @bm to mark the pages that will - * be overwritten in the process of restoring the system memory state - * from the suspend image ("unsafe" pages) and allocate memory for the - * image. + * prepare_image - Make room for loading hibernation image. + * @new_bm: Unitialized memory bitmap structure. + * @bm: Memory bitmap with unsafe pages marked. + * + * Use @bm to mark the pages that will be overwritten in the process of + * restoring the system memory state from the suspend image ("unsafe" pages) + * and allocate memory for the image. * - * The idea is to allocate a new memory bitmap first and then allocate - * as many pages as needed for the image data, but not to assign these - * pages to specific tasks initially. Instead, we just mark them as - * allocated and create a lists of "safe" pages that will be used - * later. On systems with high memory a list of "safe" highmem pages is - * also created. + * The idea is to allocate a new memory bitmap first and then allocate + * as many pages as needed for image data, but without specifying what those + * pages will be used for just yet. Instead, we mark them all as allocated and + * create a lists of "safe" pages to be used later. On systems with high + * memory a list of "safe" highmem pages is created too. */ - -#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) - -static int -prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) +static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) { unsigned int nr_pages, nr_highmem; - struct linked_page *sp_list, *lp; + struct linked_page *lp; int error; /* If there is no highmem, the buffer will not be necessary */ @@ -2342,9 +2433,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) buffer = NULL; nr_highmem = count_highmem_image_pages(bm); - error = mark_unsafe_pages(bm); - if (error) - goto Free; + mark_unsafe_pages(bm); error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); if (error) @@ -2357,14 +2446,15 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) if (error) goto Free; } - /* Reserve some safe pages for potential later use. + /* + * Reserve some safe pages for potential later use. * * NOTE: This way we make sure there will be enough safe pages for the * chain_alloc() in get_buffer(). It is a bit wasteful, but * nr_copy_pages cannot be greater than 50% of the memory anyway. + * + * nr_copy_pages cannot be less than allocated_unsafe_pages too. */ - sp_list = NULL; - /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); while (nr_pages > 0) { @@ -2373,12 +2463,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) error = -ENOMEM; goto Free; } - lp->next = sp_list; - sp_list = lp; + lp->next = safe_pages_list; + safe_pages_list = lp; nr_pages--; } /* Preallocate memory for the image */ - safe_pages_list = NULL; nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; while (nr_pages > 0) { lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); @@ -2396,12 +2485,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) swsusp_set_page_free(virt_to_page(lp)); nr_pages--; } - /* Free the reserved safe pages so that chain_alloc() can use them */ - while (sp_list) { - lp = sp_list->next; - free_image_page(sp_list, PG_UNSAFE_CLEAR); - sp_list = lp; - } return 0; Free: @@ -2410,10 +2493,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) } /** - * get_buffer - compute the address that snapshot_write_next() should - * set for its caller to write to. + * get_buffer - Get the address to store the next image data page. + * + * Get the address that snapshot_write_next() should return to its caller to + * write to. */ - static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; @@ -2428,12 +2512,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) return get_highmem_page_buffer(page, ca); if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) - /* We have allocated the "original" page frame and we can + /* + * We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ return page_address(page); - /* The "original" page frame has not been allocated and we have to + /* + * The "original" page frame has not been allocated and we have to * use a "safe" page frame to store the loaded page. */ pbe = chain_alloc(ca, sizeof(struct pbe)); @@ -2450,22 +2536,21 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) } /** - * snapshot_write_next - used for writing the system memory snapshot. + * snapshot_write_next - Get the address to store the next image page. + * @handle: Snapshot handle structure to guide the writing. * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. + * On the first call, @handle should point to a zeroed snapshot_handle + * structure. The structure gets populated then and a pointer to it should be + * passed to this function every next time. * - * On success the function returns a positive number. Then, the caller - * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. + * On success, the function returns a positive number. Then, the caller + * is allowed to write up to the returned number of bytes to the memory + * location computed by the data_of() macro. * - * The function returns 0 to indicate the "end of file" condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. + * The function returns 0 to indicate the "end of file" condition. Negative + * numbers are returned on errors, in which cases the structure pointed to by + * @handle is not updated and should not be used any more. */ - int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; @@ -2491,6 +2576,8 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + safe_pages_list = NULL; + error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); if (error) return error; @@ -2500,6 +2587,7 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + hibernate_restore_protection_begin(); } else if (handle->cur <= nr_meta_pages + 1) { error = unpack_orig_pfns(buffer, ©_bm); if (error) @@ -2522,6 +2610,7 @@ int snapshot_write_next(struct snapshot_handle *handle) copy_last_highmem_page(); /* Restore page key for data page (s390 only). */ page_key_write(handle->buffer); + hibernate_restore_protect_page(handle->buffer); handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); @@ -2533,22 +2622,23 @@ int snapshot_write_next(struct snapshot_handle *handle) } /** - * snapshot_write_finalize - must be called after the last call to - * snapshot_write_next() in case the last page in the image happens - * to be a highmem page and its contents should be stored in the - * highmem. Additionally, it releases the memory that will not be - * used any more. + * snapshot_write_finalize - Complete the loading of a hibernation image. + * + * Must be called after the last call to snapshot_write_next() in case the last + * page in the image happens to be a highmem page and its contents should be + * stored in highmem. Additionally, it recycles bitmap memory that's not + * necessary any more. */ - void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); /* Restore page key for data page (s390 only). */ page_key_write(handle->buffer); page_key_free(); - /* Free only if we have loaded the image entirely */ + hibernate_restore_protect_page(handle->buffer); + /* Do that only if we have loaded the image entirely */ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { - memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); + memory_bm_recycle(&orig_bm); free_highmem_data(); } } @@ -2561,8 +2651,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle) #ifdef CONFIG_HIGHMEM /* Assumes that @buf is ready and points to a "safe" page */ -static inline void -swap_two_pages_data(struct page *p1, struct page *p2, void *buf) +static inline void swap_two_pages_data(struct page *p1, struct page *p2, + void *buf) { void *kaddr1, *kaddr2; @@ -2576,15 +2666,15 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) } /** - * restore_highmem - for each highmem page that was allocated before - * the suspend and included in the suspend image, and also has been - * allocated by the "resume" kernel swap its current (ie. "before - * resume") contents with the previous (ie. "before suspend") one. + * restore_highmem - Put highmem image pages into their original locations. + * + * For each highmem page that was in use before hibernation and is included in + * the image, and also has been allocated by the "restore" kernel, swap its + * current contents with the previous (ie. "before hibernation") ones. * - * If the resume eventually fails, we can call this function once - * again and restore the "before resume" highmem state. + * If the restore eventually fails, we can call this function once again and + * restore the highmem state as seen by the restore kernel. */ - int restore_highmem(void) { struct highmem_pbe *pbe = highmem_pblist; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 5b70d64b871e..0acab9d7f96f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -266,16 +266,18 @@ static int suspend_test(int level) */ static int suspend_prepare(suspend_state_t state) { - int error; + int error, nr_calls = 0; if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); - if (error) + error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls); + if (error) { + nr_calls--; goto Finish; + } trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); @@ -286,7 +288,7 @@ static int suspend_prepare(suspend_state_t state) suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); Finish: - pm_notifier_call_chain(PM_POST_SUSPEND); + __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL); pm_restore_console(); return error; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 12cd989dadf6..a3b1e617bcdc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -37,6 +37,14 @@ #define HIBERNATE_SIG "S1SUSPEND" /* + * When reading an {un,}compressed image, we may restore pages in place, + * in which case some architectures need these pages cleaning before they + * can be executed. We don't know which pages these may be, so clean the lot. + */ +static bool clean_pages_on_read; +static bool clean_pages_on_decompress; + +/* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. @@ -241,6 +249,9 @@ static void hib_end_io(struct bio *bio) if (bio_data_dir(bio) == WRITE) put_page(page); + else if (clean_pages_on_read) + flush_icache_range((unsigned long)page_address(page), + (unsigned long)page_address(page) + PAGE_SIZE); if (bio->bi_error && !hb->error) hb->error = bio->bi_error; @@ -250,7 +261,7 @@ static void hib_end_io(struct bio *bio) bio_put(bio); } -static int hib_submit_io(int rw, pgoff_t page_off, void *addr, +static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, struct hib_bio_batch *hb) { struct page *page = virt_to_page(addr); @@ -260,6 +271,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = hib_resume_bdev; + bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", @@ -272,9 +284,9 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, bio->bi_end_io = hib_end_io; bio->bi_private = hb; atomic_inc(&hb->count); - submit_bio(rw, bio); + submit_bio(bio); } else { - error = submit_bio_wait(rw, bio); + error = submit_bio_wait(bio); bio_put(bio); } @@ -295,7 +307,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -304,8 +317,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, - swsusp_header, NULL); + error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); error = -ENODEV; @@ -337,6 +350,12 @@ static int swsusp_swap_check(void) if (res < 0) blkdev_put(hib_resume_bdev, FMODE_WRITE); + /* + * Update the resume device to the one actually used, + * so the test_resume mode can use it in case it is + * invoked from hibernate() to test the snapshot. + */ + swsusp_resume_device = hib_resume_bdev->bd_dev; return res; } @@ -378,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) } else { src = buf; } - return hib_submit_io(WRITE_SYNC, offset, src, hb); + return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -981,7 +1000,8 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL); + error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, + tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ -1005,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(READ_SYNC, offset, buf, hb); + error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1049,6 +1069,7 @@ static int load_image(struct swap_map_handle *handle, hib_init_batch(&hb); + clean_pages_on_read = true; printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; @@ -1124,6 +1145,10 @@ static int lzo_decompress_threadfn(void *data) d->unc_len = LZO_UNC_SIZE; d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, d->unc, &d->unc_len); + if (clean_pages_on_decompress) + flush_icache_range((unsigned long)d->unc, + (unsigned long)d->unc + d->unc_len); + atomic_set(&d->stop, 1); wake_up(&d->done); } @@ -1189,6 +1214,8 @@ static int load_image_lzo(struct swap_map_handle *handle, } memset(crc, 0, offsetof(struct crc_data, go)); + clean_pages_on_decompress = true; + /* * Start the decompression threads. */ @@ -1507,7 +1534,8 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_submit_io(READ_SYNC, swsusp_resume_block, + error = hib_submit_io(REQ_OP_READ, READ_SYNC, + swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -1515,7 +1543,8 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, + error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; @@ -1559,10 +1588,12 @@ int swsusp_unmark(void) { int error; - hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, + error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); diff --git a/kernel/power/user.c b/kernel/power/user.c index 526e8911460a..35310b627388 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -47,7 +47,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1); static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; - int error; + int error, nr_calls = 0; if (!hibernation_available()) return -EPERM; @@ -74,9 +74,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; data->free_bitmaps = false; - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls); if (error) - pm_notifier_call_chain(PM_POST_HIBERNATION); + __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL); } else { /* * Resuming. We may need to wait for the image device to @@ -86,13 +86,15 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls); if (!error) { error = create_basic_memory_bitmaps(); data->free_bitmaps = !error; - } + } else + nr_calls--; + if (error) - pm_notifier_call_chain(PM_POST_RESTORE); + __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); } if (error) atomic_inc(&snapshot_device_available); diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 85405bdcf2b3..abb0042a427b 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,2 +1,3 @@ obj-y = printk.o +obj-$(CONFIG_PRINTK_NMI) += nmi.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 276762f3a460..d5760c42f042 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -9,10 +9,10 @@ char *_braille_console_setup(char **str, char **brl_options) { - if (!memcmp(*str, "brl,", 4)) { + if (!strncmp(*str, "brl,", 4)) { *brl_options = ""; *str += 4; - } else if (!memcmp(str, "brl=", 4)) { + } else if (!strncmp(*str, "brl=", 4)) { *brl_options = *str + 4; *str = strchr(*brl_options, ','); if (!*str) diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h new file mode 100644 index 000000000000..7fd2838fa417 --- /dev/null +++ b/kernel/printk/internal.h @@ -0,0 +1,57 @@ +/* + * internal.h - printk internal definitions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#include <linux/percpu.h> + +typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); + +int __printf(1, 0) vprintk_default(const char *fmt, va_list args); + +#ifdef CONFIG_PRINTK_NMI + +extern raw_spinlock_t logbuf_lock; + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it temporary stores the strings into a per-CPU buffer. + * The alternative implementation is chosen transparently + * via per-CPU variable. + */ +DECLARE_PER_CPU(printk_func_t, printk_func); +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return this_cpu_read(printk_func)(fmt, args); +} + +extern atomic_t nmi_message_lost; +static inline int get_nmi_message_lost(void) +{ + return atomic_xchg(&nmi_message_lost, 0); +} + +#else /* CONFIG_PRINTK_NMI */ + +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return vprintk_default(fmt, args); +} + +static inline int get_nmi_message_lost(void) +{ + return 0; +} + +#endif /* CONFIG_PRINTK_NMI */ diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c new file mode 100644 index 000000000000..16bab471c7e2 --- /dev/null +++ b/kernel/printk/nmi.c @@ -0,0 +1,268 @@ +/* + * nmi.c - Safe printk in NMI context + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/preempt.h> +#include <linux/spinlock.h> +#include <linux/debug_locks.h> +#include <linux/smp.h> +#include <linux/cpumask.h> +#include <linux/irq_work.h> +#include <linux/printk.h> + +#include "internal.h" + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it uses an alternative implementation that temporary stores + * the strings into a per-CPU buffer. The content of the buffer + * is later flushed into the main ring buffer via IRQ work. + * + * The alternative implementation is chosen transparently + * via @printk_func per-CPU variable. + * + * The implementation allows to flush the strings also from another CPU. + * There are situations when we want to make sure that all buffers + * were handled or when IRQs are blocked. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; +static int printk_nmi_irq_ready; +atomic_t nmi_message_lost; + +#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \ + sizeof(atomic_t) - sizeof(struct irq_work)) + +struct nmi_seq_buf { + atomic_t len; /* length of written data */ + struct irq_work work; /* IRQ work that flushes the buffer */ + unsigned char buffer[NMI_LOG_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + int add = 0; + size_t len; + +again: + len = atomic_read(&s->len); + + if (len >= sizeof(s->buffer)) { + atomic_inc(&nmi_message_lost); + return 0; + } + + /* + * Make sure that all old data have been read before the buffer was + * reseted. This is not needed when we just append data. + */ + if (!len) + smp_rmb(); + + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + + /* + * Do it once again if the buffer has been flushed in the meantime. + * Note that atomic_cmpxchg() is an implicit memory barrier that + * makes sure that the data were written before updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, len + add) != len) + goto again; + + /* Get flushed in a more safe context. */ + if (add && printk_nmi_irq_ready) { + /* Make sure that IRQ work is really initialized. */ + smp_rmb(); + irq_work_queue(&s->work); + } + + return add; +} + +static void printk_nmi_flush_line(const char *text, int len) +{ + /* + * The buffers are flushed in NMI only on panic. The messages must + * go only into the ring buffer at this stage. Consoles will get + * explicitly called later when a crashdump is not generated. + */ + if (in_nmi()) + printk_deferred("%.*s", len, text); + else + printk("%.*s", len, text); + +} + +/* + * printk one line from the temporary buffer from @start index until + * and including the @end index. + */ +static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, + int start, int end) +{ + const char *buf = s->buffer + start; + + printk_nmi_flush_line(buf, (end - start) + 1); +} + +/* + * Flush data from the associated per_CPU buffer. The function + * can be called either via IRQ work or independently. + */ +static void __printk_nmi_flush(struct irq_work *work) +{ + static raw_spinlock_t read_lock = + __RAW_SPIN_LOCK_INITIALIZER(read_lock); + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + unsigned long flags; + size_t len, size; + int i, last_i; + + /* + * The lock has two functions. First, one reader has to flush all + * available message to make the lockless synchronization with + * writers easier. Second, we do not want to mix messages from + * different CPUs. This is especially important when printing + * a backtrace. + */ + raw_spin_lock_irqsave(&read_lock, flags); + + i = 0; +more: + len = atomic_read(&s->len); + + /* + * This is just a paranoid check that nobody has manipulated + * the buffer an unexpected way. If we printed something then + * @len must only increase. + */ + if (i && i >= len) { + const char *msg = "printk_nmi_flush: internal error\n"; + + printk_nmi_flush_line(msg, strlen(msg)); + } + + if (!len) + goto out; /* Someone else has already flushed the buffer. */ + + /* Make sure that data has been written up to the @len */ + smp_rmb(); + + size = min(len, sizeof(s->buffer)); + last_i = i; + + /* Print line by line. */ + for (; i < size; i++) { + if (s->buffer[i] == '\n') { + printk_nmi_flush_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < size) { + printk_nmi_flush_seq_line(s, last_i, size - 1); + printk_nmi_flush_line("\n", strlen("\n")); + } + + /* + * Check that nothing has got added in the meantime and truncate + * the buffer. Note that atomic_cmpxchg() is an implicit memory + * barrier that makes sure that the data were copied before + * updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, 0) != len) + goto more; + +out: + raw_spin_unlock_irqrestore(&read_lock, flags); +} + +/** + * printk_nmi_flush - flush all per-cpu nmi buffers. + * + * The buffers are flushed automatically via IRQ work. This function + * is useful only when someone wants to be sure that all buffers have + * been flushed at some point. + */ +void printk_nmi_flush(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); +} + +/** + * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system + * goes down. + * + * Similar to printk_nmi_flush() but it can be called even in NMI context when + * the system goes down. It does the best effort to get NMI messages into + * the main ring buffer. + * + * Note that it could try harder when there is only one CPU online. + */ +void printk_nmi_flush_on_panic(void) +{ + /* + * Make sure that we could access the main ring buffer. + * Do not risk a double release when more CPUs are up. + */ + if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (num_online_cpus() > 1) + return; + + debug_locks_off(); + raw_spin_lock_init(&logbuf_lock); + } + + printk_nmi_flush(); +} + +void __init printk_nmi_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + + init_irq_work(&s->work, __printk_nmi_flush); + } + + /* Make sure that IRQ works are initialized before enabling. */ + smp_wmb(); + printk_nmi_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. */ + printk_nmi_flush(); +} + +void printk_nmi_enter(void) +{ + this_cpu_write(printk_func, vprintk_nmi); +} + +void printk_nmi_exit(void) +{ + this_cpu_write(printk_func, vprintk_default); +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bfbf284e4218..eea6dbc2d8cf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -26,7 +26,6 @@ #include <linux/nmi.h> #include <linux/module.h> #include <linux/moduleparam.h> -#include <linux/interrupt.h> /* For in_interrupt() */ #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> @@ -48,13 +47,14 @@ #include <linux/uio.h> #include <asm/uaccess.h> -#include <asm-generic/sections.h> +#include <asm/sections.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> #include "console_cmdline.h" #include "braille.h" +#include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ @@ -85,6 +85,111 @@ static struct lockdep_map console_lock_dep_map = { }; #endif +enum devkmsg_log_bits { + __DEVKMSG_LOG_BIT_ON = 0, + __DEVKMSG_LOG_BIT_OFF, + __DEVKMSG_LOG_BIT_LOCK, +}; + +enum devkmsg_log_masks { + DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON), + DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF), + DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK), +}; + +/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */ +#define DEVKMSG_LOG_MASK_DEFAULT 0 + +static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; + +static int __control_devkmsg(char *str) +{ + if (!str) + return -EINVAL; + + if (!strncmp(str, "on", 2)) { + devkmsg_log = DEVKMSG_LOG_MASK_ON; + return 2; + } else if (!strncmp(str, "off", 3)) { + devkmsg_log = DEVKMSG_LOG_MASK_OFF; + return 3; + } else if (!strncmp(str, "ratelimit", 9)) { + devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; + return 9; + } + return -EINVAL; +} + +static int __init control_devkmsg(char *str) +{ + if (__control_devkmsg(str) < 0) + return 1; + + /* + * Set sysctl string accordingly: + */ + if (devkmsg_log == DEVKMSG_LOG_MASK_ON) { + memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); + strncpy(devkmsg_log_str, "on", 2); + } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) { + memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); + strncpy(devkmsg_log_str, "off", 3); + } + /* else "ratelimit" which is set by default. */ + + /* + * Sysctl cannot change it anymore. The kernel command line setting of + * this parameter is to force the setting to be permanent throughout the + * runtime of the system. This is a precation measure against userspace + * trying to be a smarta** and attempting to change it up on us. + */ + devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; + + return 0; +} +__setup("printk.devkmsg=", control_devkmsg); + +char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; + +int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char old_str[DEVKMSG_STR_MAX_SIZE]; + unsigned int old; + int err; + + if (write) { + if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK) + return -EINVAL; + + old = devkmsg_log; + strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE); + } + + err = proc_dostring(table, write, buffer, lenp, ppos); + if (err) + return err; + + if (write) { + err = __control_devkmsg(devkmsg_log_str); + + /* + * Do not accept an unknown string OR a known string with + * trailing crap... + */ + if (err < 0 || (err + 1 != *lenp)) { + + /* ... and restore old setting. */ + devkmsg_log = old; + strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE); + + return -EINVAL; + } + } + + return 0; +} + /* * Number of registered extended console drivers. * @@ -244,7 +349,7 @@ __packed __aligned(4) * within the scheduler's rq lock. It must be released before calling * console_unlock() or anything else that might wake up a process. */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); +DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); @@ -613,6 +718,7 @@ struct devkmsg_user { u64 seq; u32 idx; enum log_flags prev; + struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; }; @@ -622,11 +728,24 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) char *buf, *line; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ + struct file *file = iocb->ki_filp; + struct devkmsg_user *user = file->private_data; size_t len = iov_iter_count(from); ssize_t ret = len; - if (len > LOG_LINE_MAX) + if (!user || len > LOG_LINE_MAX) return -EINVAL; + + /* Ignore when user logging is disabled. */ + if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) + return len; + + /* Ratelimit when not explicitly enabled. */ + if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) { + if (!___ratelimit(&user->rs, current->comm)) + return ret; + } + buf = kmalloc(len+1, GFP_KERNEL); if (buf == NULL) return -ENOMEM; @@ -799,19 +918,24 @@ static int devkmsg_open(struct inode *inode, struct file *file) struct devkmsg_user *user; int err; - /* write-only does not need any file context */ - if ((file->f_flags & O_ACCMODE) == O_WRONLY) - return 0; + if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) + return -EPERM; - err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, - SYSLOG_FROM_READER); - if (err) - return err; + /* write-only does not need any file context */ + if ((file->f_flags & O_ACCMODE) != O_WRONLY) { + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); + if (err) + return err; + } user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); if (!user) return -ENOMEM; + ratelimit_default_init(&user->rs); + ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE); + mutex_init(&user->lock); raw_spin_lock_irq(&logbuf_lock); @@ -830,6 +954,8 @@ static int devkmsg_release(struct inode *inode, struct file *file) if (!user) return 0; + ratelimit_state_exit(&user->rs); + mutex_destroy(&user->lock); kfree(user); return 0; @@ -985,6 +1111,11 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); +static bool suppress_message_printing(int level) +{ + return (level >= console_loglevel && !ignore_loglevel); +} + #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ @@ -1014,7 +1145,7 @@ static void boot_delay_msec(int level) unsigned long timeout; if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) - || (level >= console_loglevel && !ignore_loglevel)) { + || suppress_message_printing(level)) { return; } @@ -1438,8 +1569,6 @@ static void call_console_drivers(int level, trace_console(text, len); - if (level >= console_loglevel && !ignore_loglevel) - return; if (!console_drivers) return; @@ -1616,6 +1745,7 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; int this_cpu; int printed_len = 0; + int nmi_message_lost; bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ static unsigned int logbuf_cpu = UINT_MAX; @@ -1666,6 +1796,15 @@ asmlinkage int vprintk_emit(int facility, int level, strlen(recursion_msg)); } + nmi_message_lost = get_nmi_message_lost(); + if (unlikely(nmi_message_lost)) { + text_len = scnprintf(textbuf, sizeof(textbuf), + "BAD LUCK: lost %d message(s) from NMI context!", + nmi_message_lost); + printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, textbuf, text_len); + } + /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. @@ -1807,14 +1946,6 @@ int vprintk_default(const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(vprintk_default); -/* - * This allows printk to be diverted to another function per cpu. - * This is useful for calling printk functions from within NMI - * without worrying about race conditions that can lock up the - * box. - */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; - /** * printk - print a kernel message * @fmt: format string @@ -1838,21 +1969,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; */ asmlinkage __visible int printk(const char *fmt, ...) { - printk_func_t vprintk_func; va_list args; int r; va_start(args, fmt); - - /* - * If a caller overrides the per_cpu printk_func, then it needs - * to disable preemption when calling printk(). Otherwise - * the printk_func should be set to the default. No need to - * disable preemption here. - */ - vprintk_func = this_cpu_read(printk_func); r = vprintk_func(fmt, args); - va_end(args); return r; @@ -1895,6 +2016,7 @@ static void call_console_drivers(int level, static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } +static bool suppress_message_printing(int level) { return false; } /* Still needs to be defined for users */ DEFINE_PER_CPU(printk_func_t, printk_func); @@ -2174,6 +2296,13 @@ static void console_cont_flush(char *text, size_t size) if (!cont.len) goto out; + if (suppress_message_printing(cont.level)) { + cont.cons = cont.len; + if (cont.flushed) + cont.len = 0; + goto out; + } + /* * We still queue earlier records, likely because the console was * busy. The earlier ones need to be printed before this one, we @@ -2277,10 +2406,13 @@ skip: break; msg = log_from_idx(console_idx); - if (msg->flags & LOG_NOCONS) { + level = msg->level; + if ((msg->flags & LOG_NOCONS) || + suppress_message_printing(level)) { /* * Skip record we have buffered and already printed - * directly to the console when we received it. + * directly to the console when we received it, and + * record that has level above the console loglevel. */ console_idx = log_next(console_idx); console_seq++; @@ -2294,7 +2426,6 @@ skip: goto skip; } - level = msg->level; len += msg_print_text(msg, console_prev, false, text + len, sizeof(text) - len); if (nr_ext_console_drivers) { @@ -3184,9 +3315,8 @@ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); - printk("%stask: %p ti: %p task.ti: %p\n", - log_lvl, current, current_thread_info(), - task_thread_info(current)); + printk("%stask: %p task.stack: %p\n", + log_lvl, current, task_stack_page(current)); } #endif diff --git a/kernel/profile.c b/kernel/profile.c index c2199e9901c9..2dbccf2d806c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -328,68 +328,57 @@ out: put_cpu(); } -static int profile_cpu_callback(struct notifier_block *info, - unsigned long action, void *__cpu) +static int profile_dead_cpu(unsigned int cpu) { - int node, cpu = (unsigned long)__cpu; struct page *page; + int i; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - node = cpu_to_mem(cpu); - per_cpu(cpu_profile_flip, cpu) = 0; - if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO, - 0); - if (!page) - return notifier_from_errno(-ENOMEM); - per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); - } - if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO, - 0); - if (!page) - goto out_free; - per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); - } - break; -out_free: - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - return notifier_from_errno(-ENOMEM); - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - if (prof_cpu_mask != NULL) - cpumask_set_cpu(cpu, prof_cpu_mask); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - if (prof_cpu_mask != NULL) - cpumask_clear_cpu(cpu, prof_cpu_mask); - if (per_cpu(cpu_profile_hits, cpu)[0]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); - per_cpu(cpu_profile_hits, cpu)[0] = NULL; + if (prof_cpu_mask != NULL) + cpumask_clear_cpu(cpu, prof_cpu_mask); + + for (i = 0; i < 2; i++) { + if (per_cpu(cpu_profile_hits, cpu)[i]) { + page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); + per_cpu(cpu_profile_hits, cpu)[i] = NULL; __free_page(page); } - if (per_cpu(cpu_profile_hits, cpu)[1]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); + } + return 0; +} + +static int profile_prepare_cpu(unsigned int cpu) +{ + int i, node = cpu_to_mem(cpu); + struct page *page; + + per_cpu(cpu_profile_flip, cpu) = 0; + + for (i = 0; i < 2; i++) { + if (per_cpu(cpu_profile_hits, cpu)[i]) + continue; + + page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) { + profile_dead_cpu(cpu); + return -ENOMEM; } - break; + per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); + } - return NOTIFY_OK; + return 0; +} + +static int profile_online_cpu(unsigned int cpu) +{ + if (prof_cpu_mask != NULL) + cpumask_set_cpu(cpu, prof_cpu_mask); + + return 0; } + #else /* !CONFIG_SMP */ #define profile_flip_buffers() do { } while (0) #define profile_discard_flip_buffers() do { } while (0) -#define profile_cpu_callback NULL static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { @@ -531,83 +520,43 @@ static const struct file_operations proc_profile_operations = { .llseek = default_llseek, }; -#ifdef CONFIG_SMP -static void profile_nop(void *unused) -{ -} - -static int create_hash_tables(void) +int __ref create_proc_profile(void) { - int cpu; - - for_each_online_cpu(cpu) { - int node = cpu_to_mem(cpu); - struct page *page; - - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, - 0); - if (!page) - goto out_cleanup; - per_cpu(cpu_profile_hits, cpu)[1] - = (struct profile_hit *)page_address(page); - page = __alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, - 0); - if (!page) - goto out_cleanup; - per_cpu(cpu_profile_hits, cpu)[0] - = (struct profile_hit *)page_address(page); - } - return 0; -out_cleanup: - prof_on = 0; - smp_mb(); - on_each_cpu(profile_nop, NULL, 1); - for_each_online_cpu(cpu) { - struct page *page; - - if (per_cpu(cpu_profile_hits, cpu)[0]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); - per_cpu(cpu_profile_hits, cpu)[0] = NULL; - __free_page(page); - } - if (per_cpu(cpu_profile_hits, cpu)[1]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); - per_cpu(cpu_profile_hits, cpu)[1] = NULL; - __free_page(page); - } - } - return -1; -} -#else -#define create_hash_tables() ({ 0; }) + struct proc_dir_entry *entry; +#ifdef CONFIG_SMP + enum cpuhp_state online_state; #endif -int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ -{ - struct proc_dir_entry *entry; int err = 0; if (!prof_on) return 0; - - cpu_notifier_register_begin(); - - if (create_hash_tables()) { - err = -ENOMEM; - goto out; - } - +#ifdef CONFIG_SMP + err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", + profile_prepare_cpu, profile_dead_cpu); + if (err) + return err; + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", + profile_online_cpu, NULL); + if (err < 0) + goto err_state_prep; + online_state = err; + err = 0; +#endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) - goto out; + goto err_state_onl; proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - __hotcpu_notifier(profile_cpu_callback, 0); -out: - cpu_notifier_register_done(); + return err; +err_state_onl: +#ifdef CONFIG_SMP + cpuhp_remove_state(online_state); +err_state_prep: + cpuhp_remove_state(CPUHP_PROFILE_PREPARE); +#endif return err; } subsys_initcall(create_proc_profile); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d49bfa1e53e6..1d3b7665d0be 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -585,8 +585,8 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) return -EINVAL; if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { - if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || - !config_enabled(CONFIG_SECCOMP)) + if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || + !IS_ENABLED(CONFIG_SECCOMP)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 032b2c015beb..18dfc485225c 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -5,6 +5,7 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c new file mode 100644 index 000000000000..d38ab08a3fe7 --- /dev/null +++ b/kernel/rcu/rcuperf.c @@ -0,0 +1,660 @@ +/* + * Read-Copy Update module-based performance-test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2015 + * + * Authors: Paul E. McKenney <paulmck@us.ibm.com> + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/rcupdate.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <asm/byteorder.h> +#include <linux/torture.h> +#include <linux/vmalloc.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); + +#define PERF_FLAG "-perf:" +#define PERFOUT_STRING(s) \ + pr_alert("%s" PERF_FLAG s "\n", perf_type) +#define VERBOSE_PERFOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) +#define VERBOSE_PERFOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) + +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); +torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nwriters, -1, "Number of RCU updater threads"); +torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); + +static char *perf_type = "rcu"; +module_param(perf_type, charp, 0444); +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); + +static int nrealreaders; +static int nrealwriters; +static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *shutdown_task; + +static u64 **writer_durations; +static int *writer_n_durations; +static atomic_t n_rcu_perf_reader_started; +static atomic_t n_rcu_perf_writer_started; +static atomic_t n_rcu_perf_writer_finished; +static wait_queue_head_t shutdown_wq; +static u64 t_rcu_perf_writer_started; +static u64 t_rcu_perf_writer_finished; +static unsigned long b_rcu_perf_writer_started; +static unsigned long b_rcu_perf_writer_finished; + +static int rcu_perf_writer_state; +#define RTWS_INIT 0 +#define RTWS_EXP_SYNC 1 +#define RTWS_SYNC 2 +#define RTWS_IDLE 2 +#define RTWS_STOPPING 3 + +#define MAX_MEAS 10000 +#define MIN_MEAS 100 + +static int perf_runnable = IS_ENABLED(MODULE); +module_param(perf_runnable, int, 0444); +MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_perf_ops { + int ptype; + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + unsigned long (*started)(void); + unsigned long (*completed)(void); + unsigned long (*exp_completed)(void); + void (*sync)(void); + void (*exp_sync)(void); + const char *name; +}; + +static struct rcu_perf_ops *cur_ops; + +/* + * Definitions for rcu perf testing. + */ + +static int rcu_perf_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_perf_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static unsigned long __maybe_unused rcu_no_completed(void) +{ + return 0; +} + +static void rcu_sync_perf_init(void) +{ +} + +static struct rcu_perf_ops rcu_ops = { + .ptype = RCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_perf_read_lock, + .readunlock = rcu_perf_read_unlock, + .started = rcu_batches_started, + .completed = rcu_batches_completed, + .exp_completed = rcu_exp_batches_completed, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .name = "rcu" +}; + +/* + * Definitions for rcu_bh perf testing. + */ + +static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static struct rcu_perf_ops rcu_bh_ops = { + .ptype = RCU_BH_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_bh_perf_read_lock, + .readunlock = rcu_bh_perf_read_unlock, + .started = rcu_batches_started_bh, + .completed = rcu_batches_completed_bh, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, + .name = "rcu_bh" +}; + +/* + * Definitions for srcu perf testing. + */ + +DEFINE_STATIC_SRCU(srcu_ctl_perf); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; + +static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static unsigned long srcu_perf_completed(void) +{ + return srcu_batches_completed(srcu_ctlp); +} + +static void srcu_perf_synchronize(void) +{ + synchronize_srcu(srcu_ctlp); +} + +static void srcu_perf_synchronize_expedited(void) +{ + synchronize_srcu_expedited(srcu_ctlp); +} + +static struct rcu_perf_ops srcu_ops = { + .ptype = SRCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcu" +}; + +/* + * Definitions for sched perf testing. + */ + +static int sched_perf_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_perf_read_unlock(int idx) +{ + preempt_enable(); +} + +static struct rcu_perf_ops sched_ops = { + .ptype = RCU_SCHED_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = sched_perf_read_lock, + .readunlock = sched_perf_read_unlock, + .started = rcu_batches_started_sched, + .completed = rcu_batches_completed_sched, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .name = "sched" +}; + +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks perf testing. + */ + +static int tasks_perf_read_lock(void) +{ + return 0; +} + +static void tasks_perf_read_unlock(int idx) +{ +} + +static struct rcu_perf_ops tasks_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = tasks_perf_read_lock, + .readunlock = tasks_perf_read_unlock, + .started = rcu_no_completed, + .completed = rcu_no_completed, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .name = "tasks" +}; + +#define RCUPERF_TASKS_OPS &tasks_ops, + +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + +#else /* #ifdef CONFIG_TASKS_RCU */ + +#define RCUPERF_TASKS_OPS + +static bool __maybe_unused torturing_tasks(void) +{ + return false; +} + +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + +/* + * If performance tests complete, wait for shutdown to commence. + */ +static void rcu_perf_wait_shutdown(void) +{ + cond_resched_rcu_qs(); + if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) + return; + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); +} + +/* + * RCU perf reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. + */ +static int +rcu_perf_reader(void *arg) +{ + unsigned long flags; + int idx; + long me = (long)arg; + + VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_rcu_perf_reader_started); + + do { + local_irq_save(flags); + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + local_irq_restore(flags); + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_perf_reader"); + return 0; +} + +/* + * RCU perf writer kthread. Repeatedly does a grace period. + */ +static int +rcu_perf_writer(void *arg) +{ + int i = 0; + int i_max; + long me = (long)arg; + struct sched_param sp; + bool started = false, done = false, alldone = false; + u64 t; + u64 *wdp; + u64 *wdpp = writer_durations[me]; + + VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); + WARN_ON(!wdpp); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + sp.sched_priority = 1; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + + if (holdoff) + schedule_timeout_uninterruptible(holdoff * HZ); + + t = ktime_get_mono_fast_ns(); + if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { + t_rcu_perf_writer_started = t; + if (gp_exp) { + b_rcu_perf_writer_started = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_started = + cur_ops->completed(); + } + } + + do { + wdp = &wdpp[i]; + *wdp = ktime_get_mono_fast_ns(); + if (gp_exp) { + rcu_perf_writer_state = RTWS_EXP_SYNC; + cur_ops->exp_sync(); + } else { + rcu_perf_writer_state = RTWS_SYNC; + cur_ops->sync(); + } + rcu_perf_writer_state = RTWS_IDLE; + t = ktime_get_mono_fast_ns(); + *wdp = t - *wdp; + i_max = i; + if (!started && + atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) + started = true; + if (!done && i >= MIN_MEAS) { + done = true; + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, + SCHED_NORMAL, &sp); + pr_alert("%s" PERF_FLAG + "rcu_perf_writer %ld has %d measurements\n", + perf_type, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_perf_writer_finished) >= + nrealwriters) { + schedule_timeout_interruptible(10); + rcu_ftrace_dump(DUMP_ALL); + PERFOUT_STRING("Test complete"); + t_rcu_perf_writer_finished = t; + if (gp_exp) { + b_rcu_perf_writer_finished = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_finished = + cur_ops->completed(); + } + if (shutdown) { + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } + } + } + if (done && !alldone && + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) + alldone = true; + if (started && !alldone && i < MAX_MEAS - 1) + i++; + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + rcu_perf_writer_state = RTWS_STOPPING; + writer_n_durations[me] = i_max; + torture_kthread_stopping("rcu_perf_writer"); + return 0; +} + +static inline void +rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) +{ + pr_alert("%s" PERF_FLAG + "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", + perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); +} + +static void +rcu_perf_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + if (torture_cleanup_begin()) + return; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_perf_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_perf_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + perf_type, PERF_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + perf_type, PERF_FLAG, + t_rcu_perf_writer_started, t_rcu_perf_writer_finished, + t_rcu_perf_writer_finished - + t_rcu_perf_writer_started, + ngps, + b_rcu_perf_writer_finished - + b_rcu_perf_writer_started); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j <= writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + perf_type, PERF_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do flavor-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * Return the number if non-negative. If -1, the number of CPUs. + * If less than -1, that much less than the number of CPUs, but + * at least one. + */ +static int compute_real(int n) +{ + int nr; + + if (n >= 0) { + nr = n; + } else { + nr = num_online_cpus() + 1 + n; + if (nr <= 0) + nr = 1; + } + return nr; +} + +/* + * RCU perf shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_perf_shutdown(void *arg) +{ + do { + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= + nrealwriters); + } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_perf_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +rcu_perf_init(void) +{ + long i; + int firsterr = 0; + static struct rcu_perf_ops *perf_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + RCUPERF_TASKS_OPS + }; + + if (!torture_init_begin(perf_type, verbose, &perf_runnable)) + return -EBUSY; + + /* Process args and tell the world that the perf'er is on the job. */ + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { + cur_ops = perf_ops[i]; + if (strcmp(perf_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(perf_ops)) { + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", + perf_type); + pr_alert("rcu-perf types:"); + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) + pr_alert(" %s", perf_ops[i]->name); + pr_alert("\n"); + firsterr = -EINVAL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + nrealwriters = compute_real(nwriters); + nrealreaders = compute_real(nreaders); + atomic_set(&n_rcu_perf_reader_started, 0); + atomic_set(&n_rcu_perf_writer_started, 0); + atomic_set(&n_rcu_perf_writer_finished, 0); + rcu_perf_print_module_parms(cur_ops, "Start of test"); + + /* Start up the kthreads. */ + + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, + reader_tasks[i]); + if (firsterr) + goto unwind; + } + while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) + schedule_timeout_uninterruptible(1); + writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), + GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), + GFP_KERNEL); + writer_n_durations = + kcalloc(nrealwriters, sizeof(*writer_n_durations), + GFP_KERNEL); + if (!writer_tasks || !writer_durations || !writer_n_durations) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } + if (rcu_gp_is_normal() && gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } + for (i = 0; i < nrealwriters; i++) { + writer_durations[i] = + kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), + GFP_KERNEL); + if (!writer_durations[i]) { + firsterr = -ENOMEM; + goto unwind; + } + firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, + writer_tasks[i]); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + rcu_perf_cleanup(); + return firsterr; +} + +module_init(rcu_perf_init); +module_exit(rcu_perf_cleanup); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 250ea67c1615..971e2b138063 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,8 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; @@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; +static int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); @@ -916,7 +911,7 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { - bool can_expedite = !rcu_gp_is_expedited(); + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; @@ -932,7 +927,7 @@ rcu_torture_writer(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) { pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s,\n", torture_type, cur_ops->name); pr_alert("%s" TORTURE_FLAG " Disabled dynamic grace-period expediting.\n", @@ -1082,17 +1077,6 @@ rcu_torture_fakewriter(void *arg) return 0; } -static void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - ftrace_dump(DUMP_ALL); -} - /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1142,7 +1126,7 @@ static void rcu_torture_timer(unsigned long unused) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1215,7 +1199,7 @@ rcu_torture_reader(void *arg) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1333,7 +1317,7 @@ rcu_torture_stats_print(void) rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } rtcv_snap = rcu_torture_current_version; } @@ -1487,9 +1471,11 @@ static int rcu_torture_barrier_cbs(void *arg) break; /* * The above smp_load_acquire() ensures barrier_phase load - * is ordered before the folloiwng ->call(). + * is ordered before the following ->call(). */ + local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); + local_irq_enable(); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); @@ -1596,7 +1582,7 @@ static int rcutorture_cpu_notify(struct notifier_block *self, { long cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DOWN_FAILED: (void)rcutorture_booster_init(cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a535a86e732..5d80925e7fc8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,6 +102,8 @@ struct rcu_state sname##_state = { \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ + .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ + .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -123,12 +125,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ +/* panic() on RCU Stall sysctl. */ +int sysctl_panic_on_rcu_stall __read_mostly; /* * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU * can assume that there is but one task, allowing RCU to (for example) - * optimize synchronize_sched() to a simple barrier(). When this variable + * optimize synchronize_rcu() to a simple barrier(). When this variable * is one, RCU must actually do all the hard work required to detect real * grace periods. This variable is also used to suppress boot-time false * positives from lockdep-RCU error checking. @@ -157,6 +161,7 @@ static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake); +static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -370,6 +375,21 @@ void rcu_all_qs(void) rcu_momentary_dyntick_idle(); local_irq_restore(flags); } + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { + /* + * Yes, we just checked a per-CPU variable with preemption + * enabled, so we might be migrated to some other CPU at + * this point. That is OK because in that case, the + * migration will supply the needed quiescent state. + * We might end up needlessly disabling preemption and + * invoking rcu_sched_qs() on the destination CPU, but + * the probability and cost are both quite low, so this + * should not be a problem in practice. + */ + preempt_disable(); + rcu_sched_qs(); + preempt_enable(); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -385,9 +405,11 @@ module_param(qlowmark, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; +static bool rcu_kick_kthreads; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +module_param(rcu_kick_kthreads, bool, 0644); /* * How long the grace period must be before we start recruiting @@ -460,6 +482,28 @@ unsigned long rcu_batches_completed_bh(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); /* + * Return the number of RCU expedited batches completed thus far for + * debug & stats. Odd numbers mean that a batch is in progress, even + * numbers mean idle. The value returned will thus be roughly double + * the cumulative batches since boot. + */ +unsigned long rcu_exp_batches_completed(void) +{ + return rcu_state_p->expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); + +/* + * Return the number of RCU-sched expedited batches completed thus far + * for debug & stats. Similar to rcu_exp_batches_completed(). + */ +unsigned long rcu_exp_batches_completed_sched(void) +{ + return rcu_sched_state.expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); + +/* * Force a quiescent state. */ void rcu_force_quiescent_state(void) @@ -637,7 +681,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -799,7 +843,7 @@ static void rcu_eqs_exit_common(long long oldval, int user) trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -1029,11 +1073,11 @@ EXPORT_SYMBOL_GPL(rcu_is_watching); * offline to continue to use RCU for one jiffy after marking itself * offline in the cpu_online_mask. This leniency is necessary given the * non-atomic nature of the online and offline processing, for example, - * the fact that a CPU enters the scheduler after completing the CPU_DYING - * notifiers. + * the fact that a CPU enters the scheduler after completing the teardown + * of the CPU. * - * This is also why RCU internally marks CPUs online during the - * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. + * This is also why RCU internally marks CPUs online during in the + * preparation phase and offline after the CPU has been taken down. * * Disable checking if in an NMI handler because we cannot safely report * errors from NMI handlers anyway. @@ -1224,8 +1268,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); - if (rsp->gp_kthread) + if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); + wake_up_process(rsp->gp_kthread); + } } } @@ -1241,14 +1287,39 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) - dump_cpu_task(rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(struct rcu_state *rsp) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rsp->jiffies_kick_kthreads); + if (time_after(jiffies, j) && rsp->gp_kthread) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rsp->gp_kthread); + WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); + } +} + +static inline void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1260,6 +1331,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* Only let one CPU complain about others per time interval. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1284,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, - rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(rsp, cpu); ndetected++; } } @@ -1323,6 +1398,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rcu_check_gp_kthread_starvation(rsp); + panic_on_rcu_stall(); + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1333,6 +1410,11 @@ static void print_cpu_stall(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1358,6 +1440,8 @@ static void print_cpu_stall(struct rcu_state *rsp) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + panic_on_rcu_stall(); + /* * Attempt to revive the RCU machinery by forcing a context switch. * @@ -1377,8 +1461,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress(rsp)) return; + rcu_stall_kick_kthreads(rsp); j = jiffies; /* @@ -1915,8 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * of the tree within the rsp->node[] array. Note that other CPUs * will access only the leaves of the hierarchy, thus seeing that no * grace period is in progress, at least until the corresponding - * leaf node has been initialized. In addition, we have excluded - * CPU-hotplug operations. + * leaf node has been initialized. * * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. @@ -2117,8 +2202,11 @@ static int __noreturn rcu_gp_kthread(void *arg) } ret = 0; for (;;) { - if (!ret) + if (!ret) { rsp->jiffies_force_qs = jiffies + j; + WRITE_ONCE(rsp->jiffies_kick_kthreads, + jiffies + 3 * j); + } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswait")); @@ -2144,6 +2232,15 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqsend")); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); + ret = 0; /* Force full wait till next FQS. */ + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); @@ -2152,14 +2249,12 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswaitsig")); - } - j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; + ret = 1; /* Keep old FQS timing. */ + j = jiffies; + if (time_after(jiffies, rsp->jiffies_force_qs)) + j = 1; + else + j = rsp->jiffies_force_qs - j; } } @@ -2788,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp, unsigned long *maxj), bool *isidle, unsigned long *maxj) { - unsigned long bit; int cpu; unsigned long flags; unsigned long mask; @@ -2823,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp, continue; } } - cpu = rnp->grplo; - bit = 1; - for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; @@ -3364,508 +3457,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } -/* Wrapper functions for expedited grace periods. */ -static void rcu_exp_gp_seq_start(struct rcu_state *rsp) -{ - rcu_seq_start(&rsp->expedited_sequence); -} -static void rcu_exp_gp_seq_end(struct rcu_state *rsp) -{ - rcu_seq_end(&rsp->expedited_sequence); - smp_mb(); /* Ensure that consecutive grace periods serialize. */ -} -static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) -{ - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - return rcu_seq_snap(&rsp->expedited_sequence); -} -static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) -{ - return rcu_seq_done(&rsp->expedited_sequence, s); -} - -/* - * Reset the ->expmaskinit values in the rcu_node tree to reflect any - * recent CPU-online activity. Note that these masks are not cleared - * when CPUs go offline, so they reflect the union of all CPUs that have - * ever been online. This means that this function normally takes its - * no-work-to-do fastpath. - */ -static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) -{ - bool done; - unsigned long flags; - unsigned long mask; - unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); - struct rcu_node *rnp; - struct rcu_node *rnp_up; - - /* If no new CPUs onlined since last time, nothing to do. */ - if (likely(ncpus == rsp->ncpus_snap)) - return; - rsp->ncpus_snap = ncpus; - - /* - * Each pass through the following loop propagates newly onlined - * CPUs for the current rcu_node structure up the rcu_node tree. - */ - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - continue; /* No new CPUs, nothing to do. */ - } - - /* Update this node's mask, track old value for propagation. */ - oldmask = rnp->expmaskinit; - rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* If was already nonzero, nothing to propagate. */ - if (oldmask) - continue; - - /* Propagate the new CPU up the tree. */ - mask = rnp->grpmask; - rnp_up = rnp->parent; - done = false; - while (rnp_up) { - raw_spin_lock_irqsave_rcu_node(rnp_up, flags); - if (rnp_up->expmaskinit) - done = true; - rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); - if (done) - break; - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - } - } -} - -/* - * Reset the ->expmask values in the rcu_node tree in preparation for - * a new expedited grace period. - */ -static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_node *rnp; - - sync_exp_reset_tree_hotplug(rsp); - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * Return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return rnp->exp_tasks == NULL && - READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold the root rcu_node's exp_funnel_mutex and the - * specified rcu_node structure's ->lock. - */ -static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - if (!rnp->expmask) - rcu_initiate_boost(rnp, flags); - else - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ - swake_up(&rsp->expedited_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ - WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; - } -} - -/* - * Report expedited quiescent state for specified node. This is a - * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - __rcu_report_exp_rnp(rsp, rnp, wake, flags); -} - -/* - * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the root - * rcu_node's exp_funnel_mutex. - */ -static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, - unsigned long mask, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - rnp->expmask &= ~mask; - __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ -} - -/* - * Report expedited quiescent state for specified rcu_data (CPU). - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, - bool wake) -{ - rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); -} - -/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp, - atomic_long_t *stat, unsigned long s) -{ - if (rcu_exp_gp_seq_done(rsp, s)) { - if (rnp) - mutex_unlock(&rnp->exp_funnel_mutex); - else if (rdp) - mutex_unlock(&rdp->exp_funnel_mutex); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(stat); - return true; - } - return false; -} - -/* - * Funnel-lock acquisition for expedited grace periods. Returns a - * pointer to the root rcu_node structure, or NULL if some other - * task did the expedited grace period for us. - */ -static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp0; - struct rcu_node *rnp1 = NULL; - - /* - * First try directly acquiring the root lock in order to reduce - * latency in the common case where expedited grace periods are - * rare. We check mutex_is_locked() to avoid pathological levels of - * memory contention on ->exp_funnel_mutex in the heavy-load case. - */ - rnp0 = rcu_get_root(rsp); - if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { - if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; - return rnp0; - } - } - - /* - * Each pass through the following loop works its way - * up the rcu_node tree, returning if others have done the - * work or otherwise falls through holding the root rnp's - * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure - * can be inexact, as it is just promoting locality and is not - * strictly needed for correctness. - */ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) - return NULL; - mutex_lock(&rdp->exp_funnel_mutex); - rnp0 = rdp->mynode; - for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone2, s)) - return NULL; - mutex_lock(&rnp0->exp_funnel_mutex); - if (rnp1) - mutex_unlock(&rnp1->exp_funnel_mutex); - else - mutex_unlock(&rdp->exp_funnel_mutex); - rnp1 = rnp0; - } - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone3, s)) - return NULL; - return rnp1; -} - -/* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *data) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - struct rcu_state *rsp = data; - - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); - resched_cpu(smp_processor_id()); -} - -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_sched_state; - - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) - return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); - WARN_ON_ONCE(ret); -} - -/* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. - */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) -{ - int cpu; - unsigned long flags; - unsigned long mask; - unsigned long mask_ofl_test; - unsigned long mask_ofl_ipi; - int ret; - struct rcu_node *rnp; - - sync_exp_reset_tree(rsp); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - mask_ofl_test |= rdp->grpmask; - } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(mask_ofl_ipi & mask)) - continue; -retry_ipi: - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with offline. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); - } -} - -static void synchronize_sched_expedited_wait(struct rcu_state *rsp) -{ - int cpu; - unsigned long jiffies_stall; - unsigned long jiffies_start; - unsigned long mask; - int ndetected; - struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(rsp); - int ret; - - jiffies_stall = rcu_jiffies_till_stall_check(); - jiffies_start = jiffies; - - for (;;) { - ret = swait_event_timeout( - rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) - return; - if (ret < 0) { - /* Hit a signal, disable CPU stall warnings. */ - swait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root)); - return; - } - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rsp->name); - ndetected = 0; - rcu_for_each_leaf_node(rsp, rnp) { - ndetected = rcu_print_task_exp_stall(rnp); - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - struct rcu_data *rdp; - - if (!(rnp->expmask & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(rsp->rda, cpu); - pr_cont(" %d-%c%c%c", cpu, - "O."[cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); - } - mask <<= 1; - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rsp->expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (!ndetected) { - pr_err("blocking rcu_node structures:"); - rcu_for_each_node_breadth_first(rsp, rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) - continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - rnp->expmask, - ".T"[!!rnp->exp_tasks]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rsp, rnp) { - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(rnp->expmask & mask)) - continue; - dump_cpu_task(cpu); - } - } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; - } -} - -/** - * synchronize_sched_expedited - Brute-force RCU-sched grace period - * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. - * - * This implementation can be thought of as an application of sequence - * locking to expedited grace periods, but using the sequence counter to - * determine when someone else has already done the work instead of for - * retrying readers. - */ -void synchronize_sched_expedited(void) -{ - unsigned long s; - struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_sched_state; - - /* If only one CPU, this is automatically a grace period. */ - if (rcu_blocking_is_gp()) - return; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_sched); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) - return; /* Someone else did our work for us. */ - - rcu_exp_gp_seq_start(rsp); - sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - synchronize_sched_expedited_wait(rsp); - - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp->exp_funnel_mutex); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -4156,13 +3747,12 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); - rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; - mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -4216,12 +3806,58 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static void rcu_prepare_cpu(int cpu) +int rcutree_prepare_cpu(unsigned int cpu) { struct rcu_state *rsp; for_each_rcu_flavor(rsp) rcu_init_percpu_data(cpu, rsp); + + rcu_prepare_kthreads(cpu); + rcu_spawn_all_nocb_kthreads(cpu); + + return 0; +} + +static void rcutree_affinity_setting(unsigned int cpu, int outgoing) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); + + rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); +} + +int rcutree_online_cpu(unsigned int cpu) +{ + sync_sched_exp_online_cleanup(cpu); + rcutree_affinity_setting(cpu, -1); + return 0; +} + +int rcutree_offline_cpu(unsigned int cpu) +{ + rcutree_affinity_setting(cpu, cpu); + return 0; +} + + +int rcutree_dying_cpu(unsigned int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_cpu(rsp); + return 0; +} + +int rcutree_dead_cpu(unsigned int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rcu_cleanup_dead_cpu(cpu, rsp); + do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); + } + return 0; } #ifdef CONFIG_HOTPLUG_CPU @@ -4240,9 +3876,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ @@ -4264,52 +3897,6 @@ void rcu_report_dead(unsigned int cpu) } #endif -/* - * Handle CPU online/offline notification events. - */ -int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); - struct rcu_node *rnp = rdp->mynode; - struct rcu_state *rsp; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_prepare_cpu(cpu); - rcu_prepare_kthreads(cpu); - rcu_spawn_all_nocb_kthreads(cpu); - break; - case CPU_ONLINE: - case CPU_DOWN_FAILED: - sync_sched_exp_online_cleanup(cpu); - rcu_boost_kthread_setaffinity(rnp, -1); - break; - case CPU_DOWN_PREPARE: - rcu_boost_kthread_setaffinity(rnp, cpu); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: - for_each_rcu_flavor(rsp) - rcu_cleanup_dying_cpu(rsp); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - for_each_rcu_flavor(rsp) { - rcu_cleanup_dead_cpu(cpu, rsp); - do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); - } - break; - default: - break; - } - return NOTIFY_OK; -} - static int rcu_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -4420,10 +4007,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; - static const char * const exp[] = RCU_EXP_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4482,9 +4067,11 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); - mutex_init(&rnp->exp_funnel_mutex); - lockdep_set_class_and_name(&rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + init_waitqueue_head(&rnp->exp_wq[0]); + init_waitqueue_head(&rnp->exp_wq[1]); + init_waitqueue_head(&rnp->exp_wq[2]); + init_waitqueue_head(&rnp->exp_wq[3]); + spin_lock_init(&rnp->exp_lock); } } @@ -4621,10 +4208,10 @@ void __init rcu_init(void) * this is called early in boot, before either interrupts * or the scheduler are operational. */ - cpu_notifier(rcu_cpu_notify, 0); pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) - rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + rcutree_prepare_cpu(cpu); } +#include "tree_exp.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df668c0f9e64..f714f873bf9d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,7 +70,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -79,7 +78,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -89,7 +87,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -100,7 +97,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ @@ -252,10 +248,19 @@ struct rcu_node { /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; - struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; + spinlock_t exp_lock ____cacheline_internodealigned_in_smp; + unsigned long exp_seq_rq; + wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; /* + * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and + * are indexed relative to this interval rather than the global CPU ID space. + * This generates the bit for a CPU in node-local masks. + */ +#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) + +/* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. */ @@ -283,6 +288,14 @@ struct rcu_node { (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* + * Iterate over all possible CPUs in a leaf RCU node. + */ +#define for_each_leaf_node_possible_cpu(rnp, cpu) \ + for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ + cpu <= rnp->grphi; \ + cpu = cpumask_next((cpu), cpu_possible_mask)) + +/* * Union to allow "aggregate OR" operation on the need for a quiescent * state by the normal and expedited grace periods. */ @@ -387,11 +400,9 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - struct mutex exp_funnel_mutex; - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ + atomic_long_t exp_workdone1; /* # done by others #1. */ + atomic_long_t exp_workdone2; /* # done by others #2. */ + atomic_long_t exp_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -505,6 +516,8 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + struct mutex exp_mutex; /* Serialize expedited GP. */ + struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ @@ -513,6 +526,8 @@ struct rcu_state { unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ + unsigned long jiffies_kick_kthreads; /* Time at which to kick */ + /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ unsigned long n_force_qs_lh; /* ~Number of calls leaving */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h new file mode 100644 index 000000000000..6d86ab6ec2c9 --- /dev/null +++ b/kernel/rcu/tree_exp.h @@ -0,0 +1,655 @@ +/* + * RCU expedited grace periods + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2016 + * + * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +/* Wrapper functions for expedited grace periods. */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ + rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ + rcu_seq_end(&rsp->expedited_sequence); + smp_mb(); /* Ensure that consecutive grace periods serialize. */ +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ + unsigned long s; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ + return rcu_seq_done(&rsp->expedited_sequence, s); +} + +/* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity. Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online. This means that this function normally takes its + * no-work-to-do fastpath. + */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ + bool done; + unsigned long flags; + unsigned long mask; + unsigned long oldmask; + int ncpus = READ_ONCE(rsp->ncpus); + struct rcu_node *rnp; + struct rcu_node *rnp_up; + + /* If no new CPUs onlined since last time, nothing to do. */ + if (likely(ncpus == rsp->ncpus_snap)) + return; + rsp->ncpus_snap = ncpus; + + /* + * Each pass through the following loop propagates newly onlined + * CPUs for the current rcu_node structure up the rcu_node tree. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->expmaskinit == rnp->expmaskinitnext) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; /* No new CPUs, nothing to do. */ + } + + /* Update this node's mask, track old value for propagation. */ + oldmask = rnp->expmaskinit; + rnp->expmaskinit = rnp->expmaskinitnext; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* If was already nonzero, nothing to propagate. */ + if (oldmask) + continue; + + /* Propagate the new CPU up the tree. */ + mask = rnp->grpmask; + rnp_up = rnp->parent; + done = false; + while (rnp_up) { + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); + if (rnp_up->expmaskinit) + done = true; + rnp_up->expmaskinit |= mask; + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); + if (done) + break; + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + } + } +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp; + + sync_exp_reset_tree_hotplug(rsp); + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + WARN_ON_ONCE(rnp->expmask); + rnp->expmask = rnp->expmaskinit; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + +/* + * Return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the rcu_state's exp_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return rnp->exp_tasks == NULL && + READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. + */ +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ + swake_up(&rsp->expedited_wq); + } + break; + } + mask = rnp->grpmask; + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ + WARN_ON_ONCE(!(rnp->expmask & mask)); + rnp->expmask &= ~mask; + } +} + +/* + * Report expedited quiescent state for specified node. This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the rcu_state's exp_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!(rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + */ +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, + bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) +{ + if (rcu_exp_gp_seq_done(rsp, s)) { + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); + /* Ensure test happens before caller kfree(). */ + smp_mb__before_atomic(); /* ^^^ */ + atomic_long_inc(stat); + return true; + } + return false; +} + +/* + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. + */ +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. */ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; + + /* + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. + */ + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; + } + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); + } + mutex_lock(&rsp->exp_mutex); +fastpath: + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; + } + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + return false; +} + +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static void sync_sched_exp_handler(void *data) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = data; + + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + resched_cpu(smp_processor_id()); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); + WARN_ON_ONCE(ret); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + int cpu; + unsigned long flags; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + int ret; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + if (!(mask_ofl_ipi & mask)) + continue; +retry_ipi: + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + } +} + +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ + int cpu; + unsigned long jiffies_stall; + unsigned long jiffies_start; + unsigned long mask; + int ndetected; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); + int ret; + + jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_start = jiffies; + + for (;;) { + ret = swait_event_timeout( + rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root), + jiffies_stall); + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + return; + if (ret < 0) { + /* Hit a signal, disable CPU stall warnings. */ + swait_event(rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root)); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", + rsp->name); + ndetected = 0; + rcu_for_each_leaf_node(rsp, rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp; + + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(rnp->expmask & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(rsp->rda, cpu); + pr_cont(" %d-%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + } + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rsp, rnp) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } + } + jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + } +} + +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. + */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_wake_mutex); +} + +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. + */ +void synchronize_sched_expedited(void) +{ + unsigned long s; + struct rcu_state *rsp = &rcu_sched_state; + + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#ifdef CONFIG_PREEMPT_RCU + +/* + * Remote handler for smp_call_function_single(). If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree. Otherwise, immediately + * report the quiescent state. + */ +static void sync_rcu_exp_handler(void *info) +{ + struct rcu_data *rdp; + struct rcu_state *rsp = info; + struct task_struct *t = current; + + /* + * Within an RCU read-side critical section, request that the next + * rcu_read_unlock() report. Unless this RCU read-side critical + * section has already blocked, in which case it is already set + * up for the expedited grace period to wait on it. + */ + if (t->rcu_read_lock_nesting > 0 && + !t->rcu_read_unlock_special.b.blocked) { + t->rcu_read_unlock_special.b.exp_need_qs = true; + return; + } + + /* + * We are either exiting an RCU read-side critical section (negative + * values of t->rcu_read_lock_nesting) or are not in one at all + * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU + * read-side critical section that blocked before this expedited + * grace period started. Either way, we can immediately report + * the quiescent state. + */ + rdp = this_cpu_ptr(rsp->rda); + rcu_report_exp_rdp(rsp, rdp, true); +} + +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it. The basic + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. + */ +void synchronize_rcu_expedited(void) +{ + struct rcu_state *rsp = rcu_state_p; + unsigned long s; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); + + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptible RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index efdf7b61ce12..0082fce402a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) - pr_info("\tRCU torture testing starts during boot.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -681,89 +679,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -/* - * Remote handler for smp_call_function_single(). If there is an - * RCU read-side critical section in effect, request that the - * next rcu_read_unlock() record the quiescent state up the - * ->expmask fields in the rcu_node tree. Otherwise, immediately - * report the quiescent state. - */ -static void sync_rcu_exp_handler(void *info) -{ - struct rcu_data *rdp; - struct rcu_state *rsp = info; - struct task_struct *t = current; - - /* - * Within an RCU read-side critical section, request that the next - * rcu_read_unlock() report. Unless this RCU read-side critical - * section has already blocked, in which case it is already set - * up for the expedited grace period to wait on it. - */ - if (t->rcu_read_lock_nesting > 0 && - !t->rcu_read_unlock_special.b.blocked) { - t->rcu_read_unlock_special.b.exp_need_qs = true; - return; - } - - /* - * We are either exiting an RCU read-side critical section (negative - * values of t->rcu_read_lock_nesting) or are not in one at all - * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU - * read-side critical section that blocked before this expedited - * grace period started. Either way, we can immediately report - * the quiescent state. - */ - rdp = this_cpu_ptr(rsp->rda); - rcu_report_exp_rdp(rsp, rdp, true); -} - -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. - * In fact, if you are using synchronize_rcu_expedited() in a loop, - * please restructure your code to batch your updates, and then Use a - * single synchronize_rcu() instead. - */ -void synchronize_rcu_expedited(void) -{ - struct rcu_node *rnp; - struct rcu_node *rnp_unlock; - struct rcu_state *rsp = rcu_state_p; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - s = rcu_exp_gp_seq_snap(rsp); - - rnp_unlock = exp_funnel_lock(rsp, s); - if (rnp_unlock == NULL) - return; /* Someone else did our work for us. */ - - rcu_exp_gp_seq_start(rsp); - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - - /* Wait for snapshotted ->blkd_tasks lists to drain. */ - rnp = rcu_get_root(rsp); - synchronize_sched_expedited_wait(rsp); - - /* Clean up and exit. */ - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp_unlock->exp_funnel_mutex); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. * @@ -888,16 +803,6 @@ static void rcu_preempt_check_callbacks(void) } /* - * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptible RCU does not exist, map to rcu-sched. - */ -void synchronize_rcu_expedited(void) -{ - synchronize_sched_expedited(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -/* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). */ @@ -1259,8 +1164,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) - if ((mask & 0x1) && cpu != outgoingcpu) + for_each_leaf_node_possible_cpu(rnp, cpu) + if ((mask & leaf_node_cpu_bit(rnp, cpu)) && + cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); if (cpumask_weight(cm) == 0) cpumask_setall(cm); diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 1088e64f01ad..86782f9a4604 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,17 +185,16 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + unsigned long s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->expedited_workdone0); - s1 += atomic_long_read(&rdp->expedited_workdone1); - s2 += atomic_long_read(&rdp->expedited_workdone2); - s3 += atomic_long_read(&rdp->expedited_workdone3); + s1 += atomic_long_read(&rdp->exp_workdone1); + s2 += atomic_long_read(&rdp->exp_workdone2); + s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, + seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ca828b41c938..f0d8322bc3ec 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -67,7 +67,7 @@ static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); #endif /* #ifndef CONFIG_TINY_RCU */ -#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +#ifdef CONFIG_DEBUG_LOCK_ALLOC /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? * @@ -111,7 +111,7 @@ int rcu_read_lock_sched_held(void) return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); + return lockdep_opinion || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif @@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head) debug_object_free(head, &rcuhead_debug_descr); } -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - * Activation is performed internally by call_rcu(). - */ -static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +static bool rcuhead_is_static_object(void *addr) { - struct rcu_head *head = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. We just make sure that it is - * tracked in the object tracker. - */ - debug_object_init(head, &rcuhead_debug_descr); - debug_object_activate(head, &rcuhead_debug_descr); - return 0; - default: - return 1; - } + return true; } /** @@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", - .fixup_activate = rcuhead_fixup_activate, + .is_static_object = rcuhead_is_static_object, }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ @@ -548,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); +static struct task_struct *rcu_tasks_kthread_ptr; /* * Post an RCU-tasks callback. First call must be from process context @@ -557,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; + bool havetask = READ_ONCE(rcu_tasks_kthread_ptr); rhp->next = NULL; rhp->func = func; @@ -565,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) *rcu_tasks_cbs_tail = rhp; rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - if (needwake) { + /* We can't create the thread unless interrupts are enabled. */ + if ((needwake && havetask) || + (!havetask && !irqs_disabled_flags(flags))) { rcu_spawn_tasks_kthread(); wake_up(&rcu_tasks_cbs_wq); } @@ -810,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg) static void rcu_spawn_tasks_kthread(void) { static DEFINE_MUTEX(rcu_tasks_kthread_mutex); - static struct task_struct *rcu_tasks_kthread_ptr; struct task_struct *t; if (READ_ONCE(rcu_tasks_kthread_ptr)) { diff --git a/kernel/relay.c b/kernel/relay.c index 074994bcfa9b..d797502140b9 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -451,6 +451,13 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) if (!dentry) goto free_buf; relay_set_buf_dentry(buf, dentry); + } else { + /* Only retrieve global info, nothing more, nothing less */ + dentry = chan->cb->create_buf_file(NULL, NULL, + S_IRUSR, buf, + &chan->is_global); + if (WARN_ON(dentry)) + goto free_buf; } buf->cpu = cpu; @@ -562,6 +569,10 @@ static int relay_hotcpu_callback(struct notifier_block *nb, * attributes specified. The created channel buffer files * will be named base_filename0...base_filenameN-1. File * permissions will be %S_IRUSR. + * + * If opening a buffer (@parent = NULL) that you later wish to register + * in a filesystem, call relay_late_setup_files() once the @parent dentry + * is available. */ struct rchan *relay_open(const char *base_filename, struct dentry *parent, @@ -614,6 +625,7 @@ free_bufs: kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); + kfree(chan); return NULL; } EXPORT_SYMBOL_GPL(relay_open); @@ -639,8 +651,12 @@ static void __relay_set_buf_dentry(void *info) * * Returns 0 if successful, non-zero otherwise. * - * Use to setup files for a previously buffer-only channel. - * Useful to do early tracing in kernel, before VFS is up, for example. + * Use to setup files for a previously buffer-only channel created + * by relay_open() with a NULL parent dentry. + * + * For example, this is useful for perfomring early tracing in kernel, + * before VFS is up and then exposing the early results once the dentry + * is available. */ int relay_late_setup_files(struct rchan *chan, const char *base_filename, @@ -665,6 +681,20 @@ int relay_late_setup_files(struct rchan *chan, } chan->has_base_filename = 1; chan->parent = parent; + + if (chan->is_global) { + err = -EINVAL; + if (!WARN_ON_ONCE(!chan->buf[0])) { + dentry = relay_create_buf_file(chan, chan->buf[0], 0); + if (dentry && !WARN_ON_ONCE(!chan->is_global)) { + relay_set_buf_dentry(chan->buf[0], dentry); + err = 0; + } + } + mutex_unlock(&relay_channels_mutex); + return err; + } + curr_cpu = get_cpu(); /* * The CPU hotplug notifier ran before us and created buffers with @@ -705,6 +735,7 @@ int relay_late_setup_files(struct rchan *chan, return err; } +EXPORT_SYMBOL_GPL(relay_late_setup_files); /** * relay_switch_subbuf - switch to a new sub-buffer diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 414d9c16da42..5e59b832ae2b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -24,3 +24,4 @@ obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index fedb967a9841..e85a725e5c34 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -318,6 +318,7 @@ u64 sched_clock_cpu(int cpu) return clock; } +EXPORT_SYMBOL_GPL(sched_clock_cpu); void sched_clock_tick(void) { @@ -363,39 +364,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -/* - * As outlined at the top, provides a fast, high resolution, nanosecond - * time source that is monotonic per cpu argument and has bounded drift - * between cpus. - * - * ######################### BIG FAT WARNING ########################## - * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # - * # go backwards !! # - * #################################################################### - */ -u64 cpu_clock(int cpu) -{ - if (!sched_clock_stable()) - return sched_clock_cpu(cpu); - - return sched_clock(); -} - -/* - * Similar to cpu_clock() for the current cpu. Time will only be observed - * to be monotonic if care is taken to only compare timestampt taken on the - * same CPU. - * - * See cpu_clock(). - */ -u64 local_clock(void) -{ - if (!sched_clock_stable()) - return sched_clock_cpu(raw_smp_processor_id()); - - return sched_clock(); -} - #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -410,22 +378,8 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -u64 cpu_clock(int cpu) -{ - return sched_clock(); -} - -u64 local_clock(void) -{ - return sched_clock(); -} - #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -EXPORT_SYMBOL_GPL(cpu_clock); -EXPORT_SYMBOL_GPL(local_clock); - /* * Running clock - returns the time that has elapsed while a guest has been * running. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d1f7149f8704..44817c640e99 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -33,7 +33,7 @@ #include <linux/init.h> #include <linux/uaccess.h> #include <linux/highmem.h> -#include <asm/mmu_context.h> +#include <linux/mmu_context.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/completion.h> @@ -74,6 +74,7 @@ #include <linux/context_tracking.h> #include <linux/compiler.h> #include <linux/frame.h> +#include <linux/prefetch.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -170,6 +171,71 @@ static struct rq *this_rq_lock(void) return rq; } +/* + * __task_rq_lock - lock the rq @p resides on. + */ +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, rf->flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + /* + * move_queued_task() task_rq_lock() + * + * ACQUIRE (rq->lock) + * [S] ->on_rq = MIGRATING [L] rq = task_rq() + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); + * [S] ->cpu = new_cpu [L] task_rq() + * [L] ->on_rq + * RELEASE (rq->lock) + * + * If we observe the old cpu in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * + * If we observe the new cpu in task_rq_lock, the acquire will + * pair with the WMB to ensure we must then also see migrating. + */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -249,29 +315,6 @@ void hrtick_start(struct rq *rq, u64 delay) } } -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - hrtick_clear(cpu_rq(cpu)); - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ - hotcpu_notifier(hotplug_hrtick, 0); -} #else /* * Called to set the hrtick timer state. @@ -288,10 +331,6 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED); } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) @@ -315,10 +354,6 @@ static inline void hrtick_clear(struct rq *rq) static inline void init_rq_hrtick(struct rq *rq) { } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SCHED_HRTICK */ /* @@ -400,7 +435,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * wakeup due to that. * * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_list(). + * barrier implied by the wakeup in wake_up_q(). */ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; @@ -499,7 +534,10 @@ int get_nohz_timer_target(void) rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { + if (cpu == i) + continue; + + if (!idle_cpu(i) && is_housekeeping_cpu(i)) { cpu = i; goto unlock; } @@ -1085,12 +1123,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, bool check) { - unsigned long flags; - struct rq *rq; + const struct cpumask *cpu_valid_mask = cpu_active_mask; unsigned int dest_cpu; + struct rq_flags rf; + struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); + + if (p->flags & PF_KTHREAD) { + /* + * Kernel threads are allowed on online && !active CPUs + */ + cpu_valid_mask = cpu_online_mask; + } /* * Must re-check here, to close a race against __kthread_bind(), @@ -1104,22 +1150,32 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_active_mask)) { + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { ret = -EINVAL; goto out; } do_set_cpus_allowed(p, new_mask); + if (p->flags & PF_KTHREAD) { + /* + * For kernel threads that do indeed end up on online && + * !active we want to ensure they are strict per-cpu threads. + */ + WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && + !cpumask_intersects(new_mask, cpu_active_mask) && + p->nr_cpus_allowed != 1); + } + /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; @@ -1128,12 +1184,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = move_queued_task(rq, p, dest_cpu); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } out: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ret; } @@ -1317,8 +1373,8 @@ out: */ unsigned long wait_task_inactive(struct task_struct *p, long match_state) { - unsigned long flags; int running, queued; + struct rq_flags rf; unsigned long ncsw; struct rq *rq; @@ -1353,14 +1409,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * If it changed from the expected state, bail out now. @@ -1434,6 +1490,25 @@ EXPORT_SYMBOL_GPL(kick_process); /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * + * A few notes on cpu_active vs cpu_online: + * + * - cpu_active must be a subset of cpu_online + * + * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + * see __set_cpus_allowed_ptr(). At this point the newly online + * cpu isn't yet part of the sched domains, and balancing will not + * see it. + * + * - on cpu-down we clear cpu_active() to mask the sched domains and + * avoid the load balancer to place new tasks on the to be removed + * cpu. Existing tasks will remain running there and will be taken + * off. + * + * This means that fallback selection must not select !active CPUs. + * And can assume that any active CPU must be online. Conversely + * select_task_rq() below may allow selection of !active CPUs in order + * to satisfy the above rules. */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@ -1452,8 +1527,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) @@ -1464,9 +1537,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) + if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) continue; - if (!cpu_active(dest_cpu)) + if (!cpu_online(dest_cpu)) continue; goto out; } @@ -1515,8 +1588,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + else + cpu = cpumask_any(tsk_cpus_allowed(p)); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1604,8 +1679,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl /* * Mark the task runnable and perform wakeup-preemption. */ -static void -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -1617,9 +1692,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (rq->idle_stamp) { @@ -1637,17 +1712,23 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) } static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { + int en_flags = ENQUEUE_WAKEUP; + lockdep_assert_held(&rq->lock); #ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; + + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_activate(rq, p, en_flags); + ttwu_do_wakeup(rq, p, wake_flags, cookie); } /* @@ -1658,17 +1739,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) */ static int ttwu_remote(struct task_struct *p, int wake_flags) { + struct rq_flags rf; struct rq *rq; int ret = 0; - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); ret = 1; } - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); return ret; } @@ -1678,6 +1760,7 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); + struct pin_cookie cookie; struct task_struct *p; unsigned long flags; @@ -1685,15 +1768,21 @@ void sched_ttwu_pending(void) return; raw_spin_lock_irqsave(&rq->lock, flags); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); while (llist) { + int wake_flags = 0; + p = llist_entry(llist, struct task_struct, wake_entry); llist = llist_next(llist); - ttwu_do_activate(rq, p, 0); + + if (p->sched_remote_wakeup) + wake_flags = WF_MIGRATED; + + ttwu_do_activate(rq, p, wake_flags, cookie); } - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1735,10 +1824,12 @@ void scheduler_ipi(void) irq_exit(); } -static void ttwu_queue_remote(struct task_struct *p, int cpu) +static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { if (!set_nr_if_polling(rq->idle)) smp_send_reschedule(cpu); @@ -1777,22 +1868,23 @@ bool cpus_share_cache(int this_cpu, int that_cpu) } #endif /* CONFIG_SMP */ -static void ttwu_queue(struct task_struct *p, int cpu) +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + struct pin_cookie cookie; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ - ttwu_queue_remote(p, cpu); + ttwu_queue_remote(p, cpu, wake_flags); return; } #endif raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, 0); - lockdep_unpin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); + ttwu_do_activate(rq, p, wake_flags, cookie); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); } @@ -1846,7 +1938,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) * chain to provide order. Instead we do: * * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_acquire(!X->on_cpu) + * 2) smp_cond_load_acquire(!X->on_cpu) * * Example: * @@ -1857,7 +1949,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) * sched-out X * smp_store_release(X->on_cpu, 0); * - * smp_cond_acquire(!X->on_cpu); + * smp_cond_load_acquire(&X->on_cpu, !VAL); * X->state = WAKING * set_task_cpu(X,2) * @@ -1883,7 +1975,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) * This means that any means of doing remote wakeups must order the CPU doing * the wakeup against the CPU the task is going to end up running on. This, * however, is already required for the regular Program-Order guarantee above, - * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). * */ @@ -1924,6 +2016,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p); + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] P->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. + */ + smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; @@ -1956,14 +2070,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. */ - smp_cond_acquire(!p->on_cpu); + smp_cond_load_acquire(&p->on_cpu, !VAL); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; - if (p->sched_class->task_waking) - p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; @@ -1971,7 +2082,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu); + ttwu_queue(p, cpu, wake_flags); stat: if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); @@ -1989,7 +2100,7 @@ out: * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p) +static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) { struct rq *rq = task_rq(p); @@ -2006,11 +2117,11 @@ static void try_to_wake_up_local(struct task_struct *p) * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (!(p->state & TASK_NORMAL)) @@ -2021,7 +2132,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, 0); + ttwu_do_wakeup(rq, p, 0, cookie); if (schedstat_enabled()) ttwu_stat(p, smp_processor_id(), 0); out: @@ -2167,9 +2278,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +#ifdef CONFIG_SCHEDSTATS + DEFINE_STATIC_KEY_FALSE(sched_schedstats); +static bool __initdata __sched_schedstats = false; -#ifdef CONFIG_SCHEDSTATS static void set_schedstats(bool enabled) { if (enabled) @@ -2192,11 +2305,16 @@ static int __init setup_schedstats(char *str) if (!str) goto out; + /* + * This code is called before jump labels have been set up, so we can't + * change the static branch directly just yet. Instead set a temporary + * variable so init_schedstats() can do it later. + */ if (!strcmp(str, "enable")) { - set_schedstats(true); + __sched_schedstats = true; ret = 1; } else if (!strcmp(str, "disable")) { - set_schedstats(false); + __sched_schedstats = false; ret = 1; } out: @@ -2207,6 +2325,11 @@ out: } __setup("schedstats=", setup_schedstats); +static void __init init_schedstats(void) +{ + set_schedstats(__sched_schedstats); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_schedstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2227,8 +2350,10 @@ int sysctl_schedstats(struct ctl_table *table, int write, set_schedstats(state); return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#else /* !CONFIG_SCHEDSTATS */ +static inline void init_schedstats(void) {} +#endif /* CONFIG_SCHEDSTATS */ /* * fork()/clone()-time setup: @@ -2240,11 +2365,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2281,8 +2406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); + init_entity_runnable_average(&p->se); /* * The child is not yet in the pid-hash so no cgroup attach races, @@ -2292,7 +2416,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + /* + * We're setting the cpu for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, cpu); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO @@ -2381,7 +2511,8 @@ static int dl_overflow(struct task_struct *p, int policy, u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; int cpus, err = -1; - if (new_bw == p->dl.dl_bw) + /* !deadline task may carry old deadline bandwidth */ + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; /* @@ -2420,22 +2551,25 @@ extern void init_dl_bw(struct dl_bw *dl_b); */ void wake_up_new_task(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* Initialize new task's runnable average */ - init_entity_runnable_average(&p->se); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug + * + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. */ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif + rq = __task_rq_lock(p, &rf); + post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2446,12 +2580,12 @@ void wake_up_new_task(struct task_struct *p) * Nothing relies on rq->lock after this, so its fine to * drop it. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -2713,7 +2847,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, struct pin_cookie cookie) { struct mm_struct *mm, *oldmm; @@ -2733,7 +2867,7 @@ context_switch(struct rq *rq, struct task_struct *prev, atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else - switch_mm(oldmm, mm, next); + switch_mm_irqs_off(oldmm, mm, next); if (!prev->mm) { prev->active_mm = NULL; @@ -2745,7 +2879,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@ -2861,13 +2995,30 @@ EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); /* + * The function fair_sched_class.update_curr accesses the struct curr + * and its field curr->exec_start; when called from task_sched_runtime(), + * we observe a high rate of cache misses in practice. + * Prefetching this data results in improved performance. + */ +static inline void prefetch_curr_exec_start(struct task_struct *p) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *curr = (&p->se)->cfs_rq->curr; +#else + struct sched_entity *curr = (&task_rq(p)->cfs)->curr; +#endif + prefetch(curr); + prefetch(&curr->exec_start); +} + +/* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's * pending runtime that have not been accounted yet. */ unsigned long long task_sched_runtime(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; u64 ns; @@ -2887,18 +3038,19 @@ unsigned long long task_sched_runtime(struct task_struct *p) return p->se.sum_exec_runtime; #endif - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Must be ->curr _and_ ->on_rq. If dequeued, we would * project cycles that may never be accounted to this * thread, breaking clock_gettime(). */ if (task_current(rq, p) && task_on_rq_queued(p)) { + prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); } ns = p->se.sum_exec_runtime; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ns; } @@ -2918,7 +3070,7 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - update_cpu_load_active(rq); + cpu_load_update_active(rq); calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); @@ -2961,6 +3113,20 @@ u64 scheduler_tick_max_deferment(void) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) +/* + * If the value passed in is equal to the current preempt count + * then we just disabled preemption. Start timing the latency. + */ +static inline void preempt_latency_start(int val) +{ + if (preempt_count() == val) { + unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } +} void preempt_count_add(int val) { @@ -2979,17 +3145,21 @@ void preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) { - unsigned long ip = get_lock_parent_ip(); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } + preempt_latency_start(val); } EXPORT_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); +/* + * If the value passed in equals to the current preempt count + * then we just enabled preemption. Stop timing the latency. + */ +static inline void preempt_latency_stop(int val) +{ + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +} + void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT @@ -3006,13 +3176,15 @@ void preempt_count_sub(int val) return; #endif - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); + preempt_latency_stop(val); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); +#else +static inline void preempt_latency_start(int val) { } +static inline void preempt_latency_stop(int val) { } #endif /* @@ -3037,6 +3209,9 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_cont("\n"); } #endif + if (panic_on_warn) + panic("scheduling while atomic\n"); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -3047,7 +3222,8 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(task_stack_end_corrupted(prev)); + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); #endif if (unlikely(in_atomic_preempt_off())) { @@ -3065,7 +3241,7 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { const struct sched_class *class = &fair_sched_class; struct task_struct *p; @@ -3076,20 +3252,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev) */ if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev); + p = fair_sched_class.pick_next_task(rq, prev, cookie); if (unlikely(p == RETRY_TASK)) goto again; /* assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev); + p = idle_sched_class.pick_next_task(rq, prev, cookie); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, cookie); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@ -3143,6 +3319,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + struct pin_cookie cookie; struct rq *rq; int cpu; @@ -3176,7 +3353,7 @@ static void __sched notrace __schedule(bool preempt) */ smp_mb__before_spinlock(); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3198,7 +3375,7 @@ static void __sched notrace __schedule(bool preempt) to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup); + try_to_wake_up_local(to_wakeup, cookie); } } switch_count = &prev->nvcsw; @@ -3207,7 +3384,7 @@ static void __sched notrace __schedule(bool preempt) if (task_on_rq_queued(prev)) update_rq_clock(rq); - next = pick_next_task(rq, prev); + next = pick_next_task(rq, prev, cookie); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -3218,9 +3395,9 @@ static void __sched notrace __schedule(bool preempt) ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the rq */ + rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ } else { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irq(&rq->lock); } @@ -3287,8 +3464,23 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); __schedule(true); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); /* @@ -3340,7 +3532,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) return; do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -3350,6 +3556,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) __schedule(true); exception_exit(prev_ctx); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); } while (need_resched()); } @@ -3406,12 +3613,13 @@ EXPORT_SYMBOL(default_wake_function); void rt_mutex_setprio(struct task_struct *p, int prio) { int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; - struct rq *rq; const struct sched_class *prev_class; + struct rq_flags rf; + struct rq *rq; BUG_ON(prio > MAX_PRIO); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); /* * Idle task boosting is a nono in general. There is one @@ -3487,7 +3695,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) check_class_changed(rq, p, prev_class, oldprio); out_unlock: preempt_disable(); /* avoid rq from going away on us */ - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); balance_callback(rq); preempt_enable(); @@ -3497,7 +3705,7 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, queued; - unsigned long flags; + struct rq_flags rf; struct rq *rq; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -3506,7 +3714,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3537,7 +3745,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_curr(rq); } out_unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@ -3834,11 +4042,11 @@ static int __sched_setscheduler(struct task_struct *p, MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, queued, running; int new_effective_prio, policy = attr->sched_policy; - unsigned long flags; const struct sched_class *prev_class; - struct rq *rq; + struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -3933,13 +4141,13 @@ recheck: * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EINVAL; } @@ -3956,7 +4164,7 @@ recheck: goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return 0; } change: @@ -3970,7 +4178,7 @@ change: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } #endif @@ -3985,7 +4193,7 @@ change: */ if (!cpumask_subset(span, &p->cpus_allowed) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } } @@ -3995,7 +4203,7 @@ change: /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); goto recheck; } @@ -4005,7 +4213,7 @@ change: * is available. */ if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EBUSY; } @@ -4050,7 +4258,7 @@ change: check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); if (pi) rt_mutex_adjust_pi(p); @@ -4595,7 +4803,8 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask * - * Return: 0 on success. An error code otherwise. + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -4903,10 +5112,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; - unsigned long flags; + struct rq_flags rf; + struct timespec t; struct rq *rq; int retval; - struct timespec t; if (pid < 0) return -EINVAL; @@ -4921,11 +5130,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); time_slice = 0; if (p->sched_class->get_rr_interval) time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@ -4992,16 +5201,19 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. */ touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } - touch_all_softlockup_watchdogs(); - #ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); + if (!state_filter) + sysrq_sched_debug_show(); #endif rcu_read_unlock(); /* @@ -5163,6 +5375,8 @@ out: #ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; + #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ int migrate_task_to(struct task_struct *p, int target_cpu) @@ -5188,11 +5402,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - struct rq *rq; - unsigned long flags; bool queued, running; + struct rq_flags rf; + struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -5207,7 +5421,7 @@ void sched_setnuma(struct task_struct *p, int nid) p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -5223,7 +5437,7 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + switch_mm_irqs_off(mm, &init_mm, current); finish_arch_post_lock_switch(); } mmdrop(mm); @@ -5232,13 +5446,15 @@ void idle_task_exit(void) /* * Since this CPU is going 'away' for a while, fold any nr_active delta * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. + * nr_active count is stable. We need to take the teardown thread which + * is calling this into account, so we hand in adjust = 1 to the load + * calculation. * * Also see the comment "Global load-average calculations". */ static void calc_load_migrate(struct rq *rq) { - long delta = calc_load_fold_active(rq); + long delta = calc_load_fold_active(rq, 1); if (delta) atomic_long_add(delta, &calc_load_tasks); } @@ -5271,6 +5487,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; + struct pin_cookie cookie; int dest_cpu; /* @@ -5302,8 +5519,8 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task assumes pinned rq->lock. */ - lockdep_pin_lock(&rq->lock); - next = pick_next_task(rq, &fake_task); + cookie = lockdep_pin_lock(&rq->lock); + next = pick_next_task(rq, &fake_task, cookie); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5316,7 +5533,7 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); @@ -5377,127 +5594,13 @@ static void set_rq_offline(struct rq *rq) } } -/* - * migration_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. - */ -static int -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +static void set_cpu_rq_start_time(unsigned int cpu) { - int cpu = (long)hcpu; - unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - rq->calc_load_update = calc_load_update; - account_reset_rq(rq); - break; - - case CPU_ONLINE: - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - - set_rq_online(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - sched_ttwu_pending(); - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - migrate_tasks(rq); - BUG_ON(rq->nr_running != 1); /* the migration thread */ - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - - case CPU_DEAD: - calc_load_migrate(rq); - break; -#endif - } - - update_max_interval(); - - return NOTIFY_OK; -} - -/* - * Register at high priority so that task migration (migrate_all_tasks) - * happens before everything else. This has to be lower priority than - * the notifier in the perf_event subsystem, though. - */ -static struct notifier_block migration_notifier = { - .notifier_call = migration_call, - .priority = CPU_PRI_MIGRATION, -}; - -static void set_cpu_rq_start_time(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); rq->age_stamp = sched_clock_cpu(cpu); } -static int sched_cpu_active(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - set_cpu_rq_start_time(); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - set_cpu_active(cpu, true); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -static int sched_cpu_inactive(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -static int __init migration_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - /* Initialize migration for the boot CPU */ - err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); - migration_call(&migration_notifier, CPU_ONLINE, cpu); - register_cpu_notifier(&migration_notifier); - - /* Register cpu active notifiers */ - cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); - cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); - - return 0; -} -early_initcall(migration_init); - static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG @@ -6645,10 +6748,10 @@ static void sched_init_numa(void) init_numa_topology_type(); } -static void sched_domains_numa_masks_set(int cpu) +static void sched_domains_numa_masks_set(unsigned int cpu) { - int i, j; int node = cpu_to_node(cpu); + int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { @@ -6658,51 +6761,20 @@ static void sched_domains_numa_masks_set(int cpu) } } -static void sched_domains_numa_masks_clear(int cpu) +static void sched_domains_numa_masks_clear(unsigned int cpu) { int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); } } -/* - * Update sched_domains_numa_masks[level][node] array when new cpus - * are onlined. - */ -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - sched_domains_numa_masks_set(cpu); - break; - - case CPU_DEAD: - sched_domains_numa_masks_clear(cpu); - break; - - default: - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} #else -static inline void sched_init_numa(void) -{ -} - -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - return 0; -} +static inline void sched_init_numa(void) { } +static void sched_domains_numa_masks_set(unsigned int cpu) { } +static void sched_domains_numa_masks_clear(unsigned int cpu) { } #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -7092,13 +7164,9 @@ static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ * If we come here as part of a suspend/resume, don't touch cpusets because we * want to restore it back to its original state upon resume anyway. */ -static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static void cpuset_cpu_active(void) { - switch (action) { - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED_FROZEN: - + if (cpuhp_tasks_frozen) { /* * num_cpus_frozen tracks how many CPUs are involved in suspend * resume sequence. As long as this is not the last online @@ -7108,35 +7176,25 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, num_cpus_frozen--; if (likely(num_cpus_frozen)) { partition_sched_domains(1, NULL, NULL); - break; + return; } - /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ - - case CPU_ONLINE: - cpuset_update_active_cpus(true); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + cpuset_update_active_cpus(true); } -static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int cpuset_cpu_inactive(unsigned int cpu) { unsigned long flags; - long cpu = (long)hcpu; struct dl_bw *dl_b; bool overflow; int cpus; - switch (action) { - case CPU_DOWN_PREPARE: + if (!cpuhp_tasks_frozen) { rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); @@ -7148,18 +7206,118 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, rcu_read_unlock_sched(); if (overflow) - return notifier_from_errno(-EBUSY); + return -EBUSY; cpuset_update_active_cpus(false); - break; - case CPU_DOWN_PREPARE_FROZEN: + } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + return 0; +} + +int sched_cpu_activate(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + set_cpu_active(cpu, true); + + if (sched_smp_initialized) { + sched_domains_numa_masks_set(cpu); + cpuset_cpu_active(); + } + + /* + * Put the rq online, if not already. This happens: + * + * 1) In the early boot process, because we build the real domains + * after all cpus have been brought up. + * + * 2) At runtime, if cpuset_cpu_active() fails to rebuild the + * domains. + */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + update_max_interval(); + + return 0; +} + +int sched_cpu_deactivate(unsigned int cpu) +{ + int ret; + + set_cpu_active(cpu, false); + /* + * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU + * users of this state to go away such that all new such users will + * observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so wait for both. + * + * Do sync before park smpboot threads to take care the rcu boost case. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); + + if (!sched_smp_initialized) + return 0; + + ret = cpuset_cpu_inactive(cpu); + if (ret) { + set_cpu_active(cpu, true); + return ret; + } + sched_domains_numa_masks_clear(cpu); + return 0; +} + +static void sched_rq_cpu_starting(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + rq->calc_load_update = calc_load_update; + update_max_interval(); +} + +int sched_cpu_starting(unsigned int cpu) +{ + set_cpu_rq_start_time(cpu); + sched_rq_cpu_starting(cpu); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +int sched_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Handle pending wakeups and then migrate everything off */ + sched_ttwu_pending(); + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + migrate_tasks(rq); + BUG_ON(rq->nr_running != 1); + raw_spin_unlock_irqrestore(&rq->lock, flags); + calc_load_migrate(rq); + update_max_interval(); + nohz_balance_exit_idle(cpu); + hrtick_clear(rq); + return 0; } +#endif void __init sched_init_smp(void) { @@ -7182,12 +7340,6 @@ void __init sched_init_smp(void) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); - hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); - hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - - init_hrtick(); - /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); @@ -7196,7 +7348,16 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + sched_smp_initialized = true; } + +static int __init migration_init(void) +{ + sched_rq_cpu_starting(smp_processor_id()); + return 0; +} +early_initcall(migration_init); + #else void __init sched_init_smp(void) { @@ -7331,8 +7492,6 @@ void __init sched_init(void) for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; - rq->last_load_update_tick = jiffies; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -7351,12 +7510,13 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON + rq->last_load_update_tick = jiffies; rq->nohz_flags = 0; #endif #ifdef CONFIG_NO_HZ_FULL rq->last_sched_tick = 0; #endif -#endif +#endif /* CONFIG_SMP */ init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); } @@ -7394,10 +7554,12 @@ void __init sched_init(void) if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); - set_cpu_rq_start_time(); + set_cpu_rq_start_time(smp_processor_id()); #endif init_sched_fair_class(); + init_schedstats(); + scheduler_running = 1; } @@ -7602,6 +7764,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); + + online_fair_sched_group(tg); } /* rcu callback to free various structures associated with a task group */ @@ -7630,27 +7794,9 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. - */ -void sched_move_task(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk, int type) { struct task_group *tg; - int queued, running; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(tsk, &flags); - - running = task_current(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); - if (unlikely(running)) - put_prev_task(rq, tsk); /* * All callers are synchronized by task_rq_lock(); we do not use RCU @@ -7663,18 +7809,44 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk); + if (tsk->sched_class->task_change_group) + tsk->sched_class->task_change_group(tsk, type); else #endif set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int queued, running; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + if (queued) + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); + if (unlikely(running)) + put_prev_task(rq, tsk); + + sched_change_group(tsk, TASK_MOVE_GROUP); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); - task_rq_unlock(rq, tsk, &flags); + task_rq_unlock(rq, tsk, &rf); } #endif /* CONFIG_CGROUP_SCHED */ @@ -7894,7 +8066,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) static int sched_rt_global_constraints(void) { unsigned long flags; - int i, ret = 0; + int i; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { @@ -7906,7 +8078,7 @@ static int sched_rt_global_constraints(void) } raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return ret; + return 0; } #endif /* CONFIG_RT_GROUP_SCHED */ @@ -8095,15 +8267,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_free_group(tg); } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. + */ static void cpu_cgroup_fork(struct task_struct *task) { - sched_move_task(task); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(task, &rf); + + sched_change_group(task, TASK_SET_GROUP); + + task_rq_unlock(rq, task, &rf); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -8114,8 +8298,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet. + */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 4a811203c04a..bc0b309c3f19 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,11 +25,20 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +struct cpuacct_usage { + u64 usages[CPUACCT_STAT_NSTATS]; +}; + /* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; + struct cpuacct_usage __percpu *cpuusage; struct kernel_cpustat __percpu *cpustat; }; @@ -49,7 +58,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return css_ca(ca->css.parent); } -static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct = { .cpustat = &kernel_cpustat, .cpuusage = &root_cpuacct_cpuusage, @@ -68,7 +77,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) if (!ca) goto out; - ca->cpuusage = alloc_percpu(u64); + ca->cpuusage = alloc_percpu(struct cpuacct_usage); if (!ca->cpuusage) goto out_free_ca; @@ -96,20 +105,37 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) kfree(ca); } -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, + enum cpuacct_stat_index index) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; + /* + * We allow index == CPUACCT_STAT_NSTATS here to read + * the sum of suages. + */ + BUG_ON(index > CPUACCT_STAT_NSTATS); + #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. */ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; +#endif + + if (index == CPUACCT_STAT_NSTATS) { + int i = 0; + + data = 0; + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + data += cpuusage->usages[i]; + } else { + data = cpuusage->usages[index]; + } + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; #endif return data; @@ -117,98 +143,162 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + int i; #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; +#endif + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + cpuusage->usages[i] = val; + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; #endif } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +static u64 __cpuusage_read(struct cgroup_subsys_state *css, + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; int i; - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); + for_each_possible_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i, index); return totalcpuusage; } +static u64 cpuusage_user_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_STAT_USER); +} + +static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_STAT_SYSTEM); +} + +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_STAT_NSTATS); +} + static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct cpuacct *ca = css_ca(css); - int err = 0; - int i; + int cpu; /* * Only allow '0' here to do a reset. */ - if (val) { - err = -EINVAL; - goto out; - } + if (val) + return -EINVAL; - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); + for_each_possible_cpu(cpu) + cpuacct_cpuusage_write(ca, cpu, 0); -out: - return err; + return 0; } -static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +static int __cpuacct_percpu_seq_show(struct seq_file *m, + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; int i; - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); + for_each_possible_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i, index); seq_printf(m, "%llu ", (unsigned long long) percpu); } seq_printf(m, "\n"); return 0; } -static const char * const cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; +static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER); +} + +static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM); +} + +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); +} + +static int cpuacct_all_seq_show(struct seq_file *m, void *V) +{ + struct cpuacct *ca = css_ca(seq_css(m)); + int index; + int cpu; + + seq_puts(m, "cpu"); + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %s", cpuacct_stat_desc[index]); + seq_puts(m, "\n"); + + for_each_possible_cpu(cpu) { + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + + seq_printf(m, "%d", cpu); + + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit + * platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); +#endif + + seq_printf(m, " %llu", cpuusage->usages[index]); + +#ifndef CONFIG_64BIT + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#endif + } + seq_puts(m, "\n"); + } + return 0; +} static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); + s64 val[CPUACCT_STAT_NSTATS]; int cpu; - s64 val = 0; + int stat; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); - - val = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + memset(val, 0, sizeof(val)); + for_each_possible_cpu(cpu) { + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; + + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { + seq_printf(sf, "%s %lld\n", + cpuacct_stat_desc[stat], + cputime64_to_clock_t(val[stat])); + } return 0; } @@ -220,10 +310,30 @@ static struct cftype files[] = { .write_u64 = cpuusage_write, }, { + .name = "usage_user", + .read_u64 = cpuusage_user_read, + }, + { + .name = "usage_sys", + .read_u64 = cpuusage_sys_read, + }, + { .name = "usage_percpu", .seq_show = cpuacct_percpu_seq_show, }, { + .name = "usage_percpu_user", + .seq_show = cpuacct_percpu_user_seq_show, + }, + { + .name = "usage_percpu_sys", + .seq_show = cpuacct_percpu_sys_seq_show, + }, + { + .name = "usage_all", + .seq_show = cpuacct_all_seq_show, + }, + { .name = "stat", .seq_show = cpuacct_stats_show, }, @@ -238,10 +348,17 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; + int index = CPUACCT_STAT_SYSTEM; + struct pt_regs *regs = task_pt_regs(tsk); + + if (regs && user_mode(regs)) + index = CPUACCT_STAT_USER; rcu_read_lock(); + for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - *this_cpu_ptr(ca->cpuusage) += cputime; + this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; + rcu_read_unlock(); } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5a75b08cfd85..d4184498c9f5 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { + cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) { best_cpu = cpumask_any(later_mask); goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { best_cpu = cpudl_maximum(cp); if (later_mask) @@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) if (old_idx == IDX_INVALID) { cp->size++; - cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].dl = dl; cp->elements[cp->size - 1].cpu = cpu; cp->elements[cpu].idx = cp->size - 1; cpudl_change_key(cp, cp->size - 1, dl); diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 928c4ba32f68..1141954e73b4 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -14,24 +14,50 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); /** - * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. * @cpu: The CPU to set the pointer for. * @data: New pointer value. + * @func: Callback function to set for the CPU. * - * Set and publish the update_util_data pointer for the given CPU. That pointer - * points to a struct update_util_data object containing a callback function - * to call from cpufreq_update_util(). That function will be called from an RCU - * read-side critical section, so it must not sleep. + * Set and publish the update_util_data pointer for the given CPU. * - * Callers must use RCU-sched callbacks to free any memory that might be - * accessed via the old update_util_data pointer or invoke synchronize_sched() - * right after this function to avoid use-after-free. + * The update_util_data pointer of @cpu is set to @data and the callback + * function pointer in the target struct update_util_data is set to @func. + * That function will be called by cpufreq_update_util() from RCU-sched + * read-side critical sections, so it must not sleep. @data will always be + * passed to it as the first argument which allows the function to get to the + * target update_util_data structure and its container. + * + * The update_util_data pointer of @cpu must be NULL when this function is + * called or it will WARN() and return with no effect. */ -void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, + void (*func)(struct update_util_data *data, u64 time, + unsigned long util, unsigned long max)) { - if (WARN_ON(data && !data->func)) + if (WARN_ON(!data || !func)) return; + if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu))) + return; + + data->func = func; rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); } -EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); +EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook); + +/** + * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer. + * @cpu: The CPU to clear the pointer for. + * + * Clear the update_util_data pointer for the given CPU. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_remove_update_util_hook(int cpu) +{ + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL); +} +EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c new file mode 100644 index 000000000000..a84641b222c1 --- /dev/null +++ b/kernel/sched/cpufreq_schedutil.c @@ -0,0 +1,522 @@ +/* + * CPUFreq governor based on scheduler-provided CPU utilization data. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <trace/events/power.h> + +#include "sched.h" + +struct sugov_tunables { + struct gov_attr_set attr_set; + unsigned int rate_limit_us; +}; + +struct sugov_policy { + struct cpufreq_policy *policy; + + struct sugov_tunables *tunables; + struct list_head tunables_hook; + + raw_spinlock_t update_lock; /* For shared policies */ + u64 last_freq_update_time; + s64 freq_update_delay_ns; + unsigned int next_freq; + + /* The next fields are only needed if fast switch cannot be used. */ + struct irq_work irq_work; + struct work_struct work; + struct mutex work_lock; + bool work_in_progress; + + bool need_freq_update; +}; + +struct sugov_cpu { + struct update_util_data update_util; + struct sugov_policy *sg_policy; + + unsigned int cached_raw_freq; + + /* The fields below are only needed when sharing a policy. */ + unsigned long util; + unsigned long max; + u64 last_update; +}; + +static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); + +/************************ Governor internals ***********************/ + +static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) +{ + s64 delta_ns; + + if (sg_policy->work_in_progress) + return false; + + if (unlikely(sg_policy->need_freq_update)) { + sg_policy->need_freq_update = false; + /* + * This happens when limits change, so forget the previous + * next_freq value and force an update. + */ + sg_policy->next_freq = UINT_MAX; + return true; + } + + delta_ns = time - sg_policy->last_freq_update_time; + return delta_ns >= sg_policy->freq_update_delay_ns; +} + +static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = sg_policy->policy; + + sg_policy->last_freq_update_time = time; + + if (policy->fast_switch_enabled) { + if (sg_policy->next_freq == next_freq) { + trace_cpu_frequency(policy->cur, smp_processor_id()); + return; + } + sg_policy->next_freq = next_freq; + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (next_freq == CPUFREQ_ENTRY_INVALID) + return; + + policy->cur = next_freq; + trace_cpu_frequency(next_freq, smp_processor_id()); + } else if (sg_policy->next_freq != next_freq) { + sg_policy->next_freq = next_freq; + sg_policy->work_in_progress = true; + irq_work_queue(&sg_policy->irq_work); + } +} + +/** + * get_next_freq - Compute a new frequency for a given cpufreq policy. + * @sg_cpu: schedutil cpu object to compute the new frequency for. + * @util: Current CPU utilization. + * @max: CPU capacity. + * + * If the utilization is frequency-invariant, choose the new frequency to be + * proportional to it, that is + * + * next_freq = C * max_freq * util / max + * + * Otherwise, approximate the would-be frequency-invariant utilization by + * util_raw * (curr_freq / max_freq) which leads to + * + * next_freq = C * curr_freq * util_raw / max + * + * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. + * + * The lowest driver-supported frequency which is equal or greater than the raw + * next_freq (as calculated above) is returned, subject to policy min/max and + * cpufreq driver limitations. + */ +static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, + unsigned long max) +{ + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int freq = arch_scale_freq_invariant() ? + policy->cpuinfo.max_freq : policy->cur; + + freq = (freq + (freq >> 2)) * util / max; + + if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + return sg_policy->next_freq; + sg_cpu->cached_raw_freq = freq; + return cpufreq_driver_resolve_freq(policy, freq); +} + +static void sugov_update_single(struct update_util_data *hook, u64 time, + unsigned long util, unsigned long max) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int next_f; + + if (!sugov_should_update_freq(sg_policy, time)) + return; + + next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : + get_next_freq(sg_cpu, util, max); + sugov_update_commit(sg_policy, time, next_f); +} + +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, + unsigned long util, unsigned long max) +{ + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int max_f = policy->cpuinfo.max_freq; + u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned int j; + + if (util == ULONG_MAX) + return max_f; + + for_each_cpu(j, policy->cpus) { + struct sugov_cpu *j_sg_cpu; + unsigned long j_util, j_max; + s64 delta_ns; + + if (j == smp_processor_id()) + continue; + + j_sg_cpu = &per_cpu(sugov_cpu, j); + /* + * If the CPU utilization was last updated before the previous + * frequency update and the time elapsed between the last update + * of the CPU utilization and the last frequency update is long + * enough, don't take the CPU into account as it probably is + * idle now. + */ + delta_ns = last_freq_update_time - j_sg_cpu->last_update; + if (delta_ns > TICK_NSEC) + continue; + + j_util = j_sg_cpu->util; + if (j_util == ULONG_MAX) + return max_f; + + j_max = j_sg_cpu->max; + if (j_util * max > j_max * util) { + util = j_util; + max = j_max; + } + } + + return get_next_freq(sg_cpu, util, max); +} + +static void sugov_update_shared(struct update_util_data *hook, u64 time, + unsigned long util, unsigned long max) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int next_f; + + raw_spin_lock(&sg_policy->update_lock); + + sg_cpu->util = util; + sg_cpu->max = max; + sg_cpu->last_update = time; + + if (sugov_should_update_freq(sg_policy, time)) { + next_f = sugov_next_freq_shared(sg_cpu, util, max); + sugov_update_commit(sg_policy, time, next_f); + } + + raw_spin_unlock(&sg_policy->update_lock); +} + +static void sugov_work(struct work_struct *work) +{ + struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + + mutex_lock(&sg_policy->work_lock); + __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, + CPUFREQ_RELATION_L); + mutex_unlock(&sg_policy->work_lock); + + sg_policy->work_in_progress = false; +} + +static void sugov_irq_work(struct irq_work *irq_work) +{ + struct sugov_policy *sg_policy; + + sg_policy = container_of(irq_work, struct sugov_policy, irq_work); + schedule_work_on(smp_processor_id(), &sg_policy->work); +} + +/************************** sysfs interface ************************/ + +static struct sugov_tunables *global_tunables; +static DEFINE_MUTEX(global_tunables_lock); + +static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set) +{ + return container_of(attr_set, struct sugov_tunables, attr_set); +} + +static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->rate_limit_us); +} + +static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, + size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) + sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC; + + return count; +} + +static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); + +static struct attribute *sugov_attributes[] = { + &rate_limit_us.attr, + NULL +}; + +static struct kobj_type sugov_tunables_ktype = { + .default_attrs = sugov_attributes, + .sysfs_ops = &governor_sysfs_ops, +}; + +/********************** cpufreq governor interface *********************/ + +static struct cpufreq_governor schedutil_gov; + +static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + + sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL); + if (!sg_policy) + return NULL; + + sg_policy->policy = policy; + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + INIT_WORK(&sg_policy->work, sugov_work); + mutex_init(&sg_policy->work_lock); + raw_spin_lock_init(&sg_policy->update_lock); + return sg_policy; +} + +static void sugov_policy_free(struct sugov_policy *sg_policy) +{ + mutex_destroy(&sg_policy->work_lock); + kfree(sg_policy); +} + +static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) +{ + struct sugov_tunables *tunables; + + tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); + if (tunables) { + gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook); + if (!have_governor_per_policy()) + global_tunables = tunables; + } + return tunables; +} + +static void sugov_tunables_free(struct sugov_tunables *tunables) +{ + if (!have_governor_per_policy()) + global_tunables = NULL; + + kfree(tunables); +} + +static int sugov_init(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + struct sugov_tunables *tunables; + unsigned int lat; + int ret = 0; + + /* State should be equivalent to EXIT */ + if (policy->governor_data) + return -EBUSY; + + sg_policy = sugov_policy_alloc(policy); + if (!sg_policy) + return -ENOMEM; + + mutex_lock(&global_tunables_lock); + + if (global_tunables) { + if (WARN_ON(have_governor_per_policy())) { + ret = -EINVAL; + goto free_sg_policy; + } + policy->governor_data = sg_policy; + sg_policy->tunables = global_tunables; + + gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook); + goto out; + } + + tunables = sugov_tunables_alloc(sg_policy); + if (!tunables) { + ret = -ENOMEM; + goto free_sg_policy; + } + + tunables->rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) + tunables->rate_limit_us *= lat; + + policy->governor_data = sg_policy; + sg_policy->tunables = tunables; + + ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype, + get_governor_parent_kobj(policy), "%s", + schedutil_gov.name); + if (ret) + goto fail; + + out: + mutex_unlock(&global_tunables_lock); + + cpufreq_enable_fast_switch(policy); + return 0; + + fail: + policy->governor_data = NULL; + sugov_tunables_free(tunables); + + free_sg_policy: + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); + pr_err("initialization failed (error %d)\n", ret); + return ret; +} + +static void sugov_exit(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + struct sugov_tunables *tunables = sg_policy->tunables; + unsigned int count; + + cpufreq_disable_fast_switch(policy); + + mutex_lock(&global_tunables_lock); + + count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); + policy->governor_data = NULL; + if (!count) + sugov_tunables_free(tunables); + + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); +} + +static int sugov_start(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; + sg_policy->last_freq_update_time = 0; + sg_policy->next_freq = UINT_MAX; + sg_policy->work_in_progress = false; + sg_policy->need_freq_update = false; + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + + sg_cpu->sg_policy = sg_policy; + if (policy_is_shared(policy)) { + sg_cpu->util = ULONG_MAX; + sg_cpu->max = 0; + sg_cpu->last_update = 0; + sg_cpu->cached_raw_freq = 0; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_shared); + } else { + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_single); + } + } + return 0; +} + +static void sugov_stop(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + for_each_cpu(cpu, policy->cpus) + cpufreq_remove_update_util_hook(cpu); + + synchronize_sched(); + + irq_work_sync(&sg_policy->irq_work); + cancel_work_sync(&sg_policy->work); +} + +static void sugov_limits(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + + if (!policy->fast_switch_enabled) { + mutex_lock(&sg_policy->work_lock); + cpufreq_policy_apply_limits(policy); + mutex_unlock(&sg_policy->work_lock); + } + + sg_policy->need_freq_update = true; +} + +static struct cpufreq_governor schedutil_gov = { + .name = "schedutil", + .owner = THIS_MODULE, + .init = sugov_init, + .exit = sugov_exit, + .start = sugov_start, + .stop = sugov_stop, + .limits = sugov_limits, +}; + +static int __init sugov_module_init(void) +{ + return cpufreq_register_governor(&schedutil_gov); +} + +static void __exit sugov_module_exit(void) +{ + cpufreq_unregister_governor(&schedutil_gov); +} + +MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>"); +MODULE_DESCRIPTION("Utilization-based CPU frequency selection"); +MODULE_LICENSE("GPL"); + +#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL +struct cpufreq_governor *cpufreq_default_governor(void) +{ + return &schedutil_gov; +} + +fs_initcall(sugov_module_init); +#else +module_init(sugov_module_init); +#endif +module_exit(sugov_module_exit); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..11e9705bf937 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) continue; - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids) continue; if (lowest_mask) { - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c5498d5..a846cf89eb96 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); */ void irqtime_account_irq(struct task_struct *curr) { - unsigned long flags; s64 delta; int cpu; if (!sched_clock_irqtime) return; - local_irq_save(flags); - cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static int irqtime_account_hi_update(void) +static cputime_t irqtime_account_hi_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t irq_cputime; local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; + irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - + cpustat[CPUTIME_IRQ]; + irq_cputime = min(irq_cputime, maxtime); + cpustat[CPUTIME_IRQ] += irq_cputime; local_irq_restore(flags); - return ret; + return irq_cputime; } -static int irqtime_account_si_update(void) +static cputime_t irqtime_account_si_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t softirq_cputime; local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; + softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - + cpustat[CPUTIME_SOFTIRQ]; + softirq_cputime = min(softirq_cputime, maxtime); + cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; local_irq_restore(flags); - return ret; + return softirq_cputime; } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ #define sched_clock_irqtime (0) +static cputime_t irqtime_account_hi_update(cputime_t dummy) +{ + return 0; +} + +static cputime_t irqtime_account_si_update(cputime_t dummy) +{ + return 0; +} + #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ static inline void task_group_account_field(struct task_struct *p, int index, @@ -257,29 +263,47 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } -static __always_inline bool steal_account_process_tick(void) +/* + * When a guest is interrupted for a longer amount of time, missed clock + * ticks are not redelivered later. Due to that, this function may on + * occasion account more time than the calling functions think elapsed. + */ +static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { + cputime_t steal_cputime; u64 steal; - unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; - /* - * steal is in nsecs but our caller is expecting steal - * time in jiffies. Lets cast the result to jiffies - * granularity and account the rest on the next rounds. - */ - steal_jiffies = nsecs_to_jiffies(steal); - this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); + steal_cputime = min(nsecs_to_cputime(steal), maxtime); + account_steal_time(steal_cputime); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); - account_steal_time(jiffies_to_cputime(steal_jiffies)); - return steal_jiffies; + return steal_cputime; } #endif - return false; + return 0; +} + +/* + * Account how much elapsed time was spent in steal, irq, or softirq time. + */ +static inline cputime_t account_other_time(cputime_t max) +{ + cputime_t accounted; + + accounted = steal_account_process_time(max); + + if (accounted < max) + accounted += irqtime_account_hi_update(max - accounted); + + if (accounted < max) + accounted += irqtime_account_si_update(max - accounted); + + return accounted; } /* @@ -342,21 +366,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { - cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); - u64 cputime = (__force u64) cputime_one_jiffy; - u64 *cpustat = kcpustat_this_cpu->cpustat; + u64 cputime = (__force u64) cputime_one_jiffy * ticks; + cputime_t scaled, other; - if (steal_account_process_tick()) + /* + * When returning from idle, many ticks can get accounted at + * once, including some ticks of steal, irq, and softirq time. + * Subtract those ticks from the amount of time accounted to + * idle, or potentially user or system time. Due to rounding, + * other time can exceed ticks occasionally. + */ + other = account_other_time(ULONG_MAX); + if (other >= cputime) return; + cputime -= other; + scaled = cputime_to_scaled(cputime); - cputime *= ticks; - scaled *= ticks; - - if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += cputime; - } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += cputime; - } else if (this_cpu_ksoftirqd() == p) { + if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. @@ -406,6 +432,10 @@ void vtime_common_task_switch(struct task_struct *prev) } #endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement @@ -415,33 +445,16 @@ void vtime_common_task_switch(struct task_struct *prev) * vtime_account(). */ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_common_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq_enter(struct task_struct *tsk) { - if (!in_interrupt()) { - /* - * If we interrupted user, context_tracking_in_user() - * is 1 because the context tracking don't hook - * on irq entry/exit. This way we know if - * we need to flush user time on kernel entry. - */ - if (context_tracking_in_user()) { - vtime_account_user(tsk); - return; - } - - if (is_idle_task(tsk)) { - vtime_account_idle(tsk); - return; - } - } - vtime_account_system(tsk); + if (!in_interrupt() && is_idle_task(tsk)) + vtime_account_idle(tsk); + else + vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; @@ -466,7 +479,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime_t cputime, scaled, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -477,26 +490,21 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - if (steal_account_process_tick()) + cputime = cputime_one_jiffy; + steal = steal_account_process_time(ULONG_MAX); + + if (steal >= cputime) return; + cputime -= steal; + scaled = cputime_to_scaled(cputime); + if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime, scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); + account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); else - account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); + account_idle_time(cputime); } /* @@ -505,13 +513,21 @@ void account_steal_ticks(unsigned long ticks) */ void account_idle_ticks(unsigned long ticks) { + cputime_t cputime, steal; if (sched_clock_irqtime) { irqtime_account_idle_ticks(ticks); return; } - account_idle_time(jiffies_to_cputime(ticks)); + cputime = jiffies_to_cputime(ticks); + steal = steal_account_process_time(ULONG_MAX); + + if (steal >= cputime) + return; + + cputime -= steal; + account_idle_time(cputime); } /* @@ -603,19 +619,25 @@ static void cputime_adjust(struct task_cputime *curr, stime = curr->stime; utime = curr->utime; - if (utime == 0) { - stime = rtime; + /* + * If either stime or both stime and utime are 0, assume all runtime is + * userspace. Once a task gets some ticks, the monotonicy code at + * 'update' will ensure things converge to the observed ratio. + */ + if (stime == 0) { + utime = rtime; goto update; } - if (stime == 0) { - utime = rtime; + if (utime == 0) { + stime = rtime; goto update; } stime = scale_stime((__force u64)stime, (__force u64)rtime, (__force u64)(stime + utime)); +update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. @@ -638,7 +660,6 @@ static void cputime_adjust(struct task_cputime *curr, stime = rtime - utime; } -update: prev->stime = stime; prev->utime = utime; out: @@ -681,12 +702,21 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - unsigned long delta = now - tsk->vtime_snap; + cputime_t delta, other; + /* + * Unlike tick based timing, vtime based timing never has lost + * ticks, and no need for steal time accounting to make up for + * lost ticks. Vtime accounts a rounded version of actual + * elapsed time. Limit account_other_time to prevent rounding + * errors from causing elapsed vtime to go negative. + */ + delta = jiffies_to_cputime(now - tsk->vtime_snap); + other = account_other_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; - return jiffies_to_cputime(delta); + return delta - other; } static void __vtime_account_system(struct task_struct *tsk) @@ -706,16 +736,6 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_end(&tsk->vtime_seqcount); } -void vtime_gen_account_irq_exit(struct task_struct *tsk) -{ - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) - __vtime_account_system(tsk); - if (context_tracking_in_user()) - tsk->vtime_snap_whence = VTIME_USER; - write_seqcount_end(&tsk->vtime_seqcount); -} - void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 686ec8adf952..1ce8867283dc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory++; update_dl_migration(dl_rq); @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory--; update_dl_migration(dl_rq); @@ -591,10 +591,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); - unsigned long flags; + struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The task might have changed its scheduling policy to something @@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * * XXX figure out if select_task_rq_dl() deals with offline cpus. */ - if (unlikely(!rq->online)) + if (unlikely(!rq->online)) { + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = dl_task_offline_migration(rq, p); + rf.cookie = lockdep_pin_lock(&rq->lock); + } /* * Queueing this task back might have overloaded rq, check if we need @@ -670,14 +673,14 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Nothing relies on rq->lock after this, so its safe to drop * rq->lock. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); push_dl_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * This can free the task_struct, including this hrtimer, do not touch @@ -717,10 +720,6 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); - /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent @@ -736,6 +735,10 @@ static void update_curr_dl(struct rq *rq) return; } + /* kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -966,7 +969,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_dl_entity(&p->dl, pi_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -1040,9 +1043,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * try to make it stay here, it might be important. */ if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + (tsk_nr_cpus_allowed(p) > 1)) { int target = find_later_rq(p); if (target != -1 && @@ -1063,7 +1066,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) return; @@ -1071,7 +1074,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 && + if (tsk_nr_cpus_allowed(p) != 1 && cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; @@ -1125,7 +1128,8 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) +struct task_struct * +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -1140,9 +1144,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); pull_dl_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to @@ -1185,7 +1189,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -1286,7 +1290,7 @@ static int find_later_rq(struct task_struct *task) if (unlikely(!later_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* @@ -1392,7 +1396,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, - &task->cpus_allowed) || + tsk_cpus_allowed(task)) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { @@ -1432,7 +1436,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); @@ -1471,7 +1475,7 @@ retry: */ if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && - rq->curr->nr_cpus_allowed > 1) { + tsk_nr_cpus_allowed(rq->curr) > 1) { resched_curr(rq); return 0; } @@ -1618,9 +1622,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && dl_task(rq->curr) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } @@ -1724,7 +1728,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) + if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); #else if (dl_task(rq->curr)) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4fbc3bd5ff60..2a0a9995256d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -427,19 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); -#ifdef CONFIG_SCHEDSTATS - if (schedstat_enabled()) { - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.statistics.wait_sum), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); - } -#else + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - 0LL, 0L, + SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - 0LL, 0L); -#endif + SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); + #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif @@ -626,15 +619,16 @@ do { \ #undef P #undef PN -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); -#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - #ifdef CONFIG_SMP +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P64(avg_idle); P64(max_idle_balance_cost); +#undef P64 #endif +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + if (schedstat_enabled()) { P(yld_count); P(sched_count); @@ -644,7 +638,6 @@ do { \ } #undef P -#undef P64 #endif spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); @@ -886,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; -#ifdef CONFIG_SCHEDSTATS P(se.nr_migrations); +#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e7dd0ec169be..039de34f1521 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -204,7 +204,7 @@ static void __update_inv_weight(struct load_weight *lw) * OR * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT * - * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case * we're guaranteed shift stays positive because inv_weight is guaranteed to * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. * @@ -682,18 +682,100 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); -#else +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); + +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. + * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the left utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, a simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap. + */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = now; + return; + } + } + + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); +} + +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } -#endif +void post_init_entity_util_avg(struct sched_entity *se) +{ +} +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. @@ -1254,6 +1336,8 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); + if (p) + get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1321,31 +1405,11 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; - bool assigned = false; rcu_read_lock(); - - raw_spin_lock_irq(&dst_rq->lock); - cur = dst_rq->curr; - /* - * No need to move the exiting task or idle task. - */ - if ((cur->flags & PF_EXITING) || is_idle_task(cur)) + cur = task_rcu_dereference(&dst_rq->curr); + if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; - else { - /* - * The task_struct must be protected here to protect the - * p->numa_faults access in the task_weight since the - * numa_faults could already be freed in the following path: - * finish_task_switch() - * --> put_task_struct() - * --> __put_task_struct() - * --> task_numa_free() - */ - get_task_struct(cur); - } - - raw_spin_unlock_irq(&dst_rq->lock); /* * Because we have preemption enabled we can get migrated around and @@ -1428,7 +1492,6 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; - put_task_struct(cur); cur = NULL; goto assign; } @@ -1454,16 +1517,9 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: - assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); - /* - * The dst_rq->curr isn't assigned. The protection for task_struct is - * finished. - */ - if (cur && !assigned) - put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, @@ -2437,37 +2493,33 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); +#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } +#endif cfs_rq->nr_running--; } #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { - long tg_weight; + long tg_weight, load, shares; /* - * Use this CPU's real-time load instead of the last load contribution - * as the updating of the contribution is delayed, and we will use the - * the real-time load to calc the share. See update_tg_load_avg(). + * This really should be: cfs_rq->avg.load_avg, but instead we use + * cfs_rq->load.weight, which is its upper bound. This helps ramp up + * the shares for small weight interactive tasks. */ - tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq->load.weight; - - return tg_weight; -} + load = scale_load_down(cfs_rq->load.weight); -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - long tg_weight, load, shares; + tg_weight = atomic_long_read(&tg->load_avg); - tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + /* Ensure tg_weight >= load */ + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += load; shares = (tg->shares * load); if (tg_weight) @@ -2486,6 +2538,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return tg->shares; } # endif /* CONFIG_SMP */ + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -2550,6 +2603,16 @@ static const u32 runnable_avg_yN_sum[] = { }; /* + * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to + * lower integers. See Documentation/scheduler/sched-avg.txt how these + * were generated: + */ +static const u32 __accumulated_sum_N32[] = { + 0, 23371, 35056, 40899, 43820, 45281, + 46011, 46376, 46559, 46650, 46696, 46719, +}; + +/* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ @@ -2597,22 +2660,13 @@ static u32 __compute_runnable_contrib(u64 n) else if (unlikely(n >= LOAD_AVG_MAX_N)) return LOAD_AVG_MAX; - /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ - do { - contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ - contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; - - n -= LOAD_AVG_PERIOD; - } while (n > LOAD_AVG_PERIOD); - + /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ + contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; + n %= LOAD_AVG_PERIOD; contrib = decay_load(contrib, n); return contrib + runnable_avg_yN_sum[n]; } -#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 -#error "load tracking assumes 2^10 as unit" -#endif - #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* @@ -2819,25 +2873,87 @@ void set_task_rq_fair(struct sched_entity *se, static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } +} + +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed utilization. It is expected + * that one calls update_tg_load_avg() on this condition, but after you've + * modified the cfs_rq avg (attach/detach), such that we propagate the new + * avg up. + */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed = 0; + int decayed, removed_load = 0, removed_util = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); - sa->load_avg = max_t(long, sa->load_avg - r, 0); - sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); - removed = 1; + sub_positive(&sa->load_avg, r); + sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); + removed_load = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); - sa->util_avg = max_t(long, sa->util_avg - r, 0); - sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->util_avg, r); + sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); + removed_util = 1; } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2848,7 +2964,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - return decayed || removed; + if (update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); + + return decayed || removed_load; } /* Update task and its cfs_rq load average */ @@ -2867,33 +2986,18 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) update_tg_load_avg(cfs_rq, 0); - - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { - unsigned long max = rq->cpu_capacity_orig; - - /* - * There are a few boundary cases this might miss but it should - * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. - * - * It will not get called when we go idle, because the idle - * thread is a different class (!fair), nor will the utilization - * number include things like RT tasks. - * - * As is, the util number is not freq-invariant (we'd have to - * implement arch_scale_freq_capacity() for that). - * - * See cpu_util(). - */ - cpufreq_update_util(rq_clock(rq), - min(cfs_rq->avg.util_avg, max), max); - } } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!sched_feat(ATTACH_AGE_LOAD)) @@ -2902,6 +3006,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * If we got migrated (either between CPUs or between cgroups) we'll * have aged the average right before clearing @last_update_time. + * + * Or we're fresh through post_init_entity_util_avg(). */ if (se->avg.last_update_time) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -2919,18 +3025,30 @@ skip_aging: cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + + cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); - cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); - cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); - cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -2948,7 +3066,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; @@ -3003,11 +3121,14 @@ void remove_entity_load_avg(struct sched_entity *se) u64 last_update_time; /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; last_update_time = cfs_rq_last_update_time(cfs_rq); @@ -3030,6 +3151,12 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ + return 0; +} + static inline void update_load_avg(struct sched_entity *se, int not_used) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -3177,7 +3304,7 @@ static inline void check_schedstat_required(void) trace_sched_stat_iowait_enabled() || trace_sched_stat_blocked_enabled() || trace_sched_stat_runtime_enabled()) { - pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " "stat_blocked and stat_runtime require the " "kernel parameter schedstats=enabled or " "kernel.sched_schedstats=1\n"); @@ -3185,20 +3312,61 @@ static inline void check_schedstat_required(void) #endif } + +/* + * MIGRATION + * + * dequeue + * update_curr() + * update_min_vruntime() + * vruntime -= min_vruntime + * + * enqueue + * update_curr() + * update_min_vruntime() + * vruntime += min_vruntime + * + * this way the vruntime transition between RQs is done when both + * min_vruntime are up-to-date. + * + * WAKEUP (remote) + * + * ->migrate_task_rq_fair() (p->state == TASK_WAKING) + * vruntime -= min_vruntime + * + * enqueue + * update_curr() + * update_min_vruntime() + * vruntime += min_vruntime + * + * this way we don't have the most up-to-date min_vruntime on the originating + * CPU and an up-to-date min_vruntime on the destination CPU. + */ + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); + bool curr = cfs_rq->curr == se; + /* - * Update the normalized vruntime before updating min_vruntime - * through calling update_curr(). + * If we're the current task, we must renormalise before calling + * update_curr(). */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) + if (renorm && curr) se->vruntime += cfs_rq->min_vruntime; + update_curr(cfs_rq); + /* - * Update run-time statistics of the 'current'. + * Otherwise, renormalise after, such that we're placed at the current + * moment in time, instead of some random moment in the past. Being + * placed in the past could significantly boost this task to the + * fairness detriment of existing tasks. */ - update_curr(cfs_rq); + if (renorm && !curr) + se->vruntime += cfs_rq->min_vruntime; + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3214,7 +3382,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_enqueue(cfs_rq, se); check_spread(cfs_rq, se); } - if (se != cfs_rq->curr) + if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3578,7 +3746,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task; + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; } @@ -3716,13 +3884,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; cfs_rq->throttle_count--; -#ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; } -#endif return 0; } @@ -4089,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } +static void sync_throttle(struct task_group *tg, int cpu) +{ + struct cfs_rq *pcfs_rq, *cfs_rq; + + if (!cfs_bandwidth_used()) + return; + + if (!tg->parent) + return; + + cfs_rq = tg->cfs_rq[cpu]; + pcfs_rq = tg->parent->cfs_rq[cpu]; + + cfs_rq->throttle_count = pcfs_rq->throttle_count; + cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); +} + /* conditionally throttle active cfs_rq's from put_prev_entity() */ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4228,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) @@ -4336,7 +4520,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -4390,15 +4574,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && parent_entity(se)) - set_next_buddy(parent_entity(se)); - - /* avoid re-evaluating load for this entity */ - se = parent_entity(se); + if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + set_next_buddy(se); break; } flags |= DEQUEUE_SLEEP; @@ -4422,7 +4605,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP - +#ifdef CONFIG_NO_HZ_COMMON /* * per rq 'load' arrray crap; XXX kill this. */ @@ -4488,13 +4671,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) } return load; } +#endif /* CONFIG_NO_HZ_COMMON */ /** - * __update_cpu_load - update the rq->cpu_load[] statistics + * __cpu_load_update - update the rq->cpu_load[] statistics * @this_rq: The rq to update statistics for * @this_load: The current load * @pending_updates: The number of missed updates - * @active: !0 for NOHZ_FULL * * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). @@ -4523,12 +4706,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * load[i]_n = (1 - 1/2^i)^n * load[i]_0 * * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra - * term. See the @active paramter. + * term. */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates, int active) +static void cpu_load_update(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) { - unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; + unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; int i, scale; this_rq->nr_load_updates++; @@ -4541,6 +4724,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ old_load = this_rq->cpu_load[i]; +#ifdef CONFIG_NO_HZ_COMMON old_load = decay_load_missed(old_load, pending_updates - 1, i); if (tickless_load) { old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); @@ -4551,6 +4735,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, */ old_load += tickless_load; } +#endif new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4573,10 +4758,23 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON -static void __update_cpu_load_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load, - int active) +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we need to avoid the delta approach from the regular tick when + * possible since that would seriously skew the load calculation. This is why we + * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on + * jiffies deltas for updates happening while in nohz mode (idle ticks, idle + * loop exit, nohz_idle_balance, nohz full exit...) + * + * This means we might still be one tick off for nohz periods. + */ + +static void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) { unsigned long pending_updates; @@ -4588,28 +4786,15 @@ static void __update_cpu_load_nohz(struct rq *this_rq, * In the NOHZ_FULL case, we were non-idle, we should consider * its weighted load. */ - __update_cpu_load(this_rq, load, pending_updates, active); + cpu_load_update(this_rq, load, pending_updates); } } /* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_cpu_load_idle(struct rq *this_rq) +static void cpu_load_update_idle(struct rq *this_rq) { /* * bail if there's load or we're actually up-to-date. @@ -4617,38 +4802,71 @@ static void update_cpu_load_idle(struct rq *this_rq) if (weighted_cpuload(cpu_of(this_rq))) return; - __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); } /* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + * Record CPU load on nohz entry so we know the tickless load to account + * on nohz exit. cpu_load[0] happens then to be updated more frequently + * than other cpu_load[idx] but it should be fine as cpu_load readers + * shouldn't rely into synchronized cpu_load[*] updates. */ -void update_cpu_load_nohz(int active) +void cpu_load_update_nohz_start(void) { struct rq *this_rq = this_rq(); + + /* + * This is all lockless but should be fine. If weighted_cpuload changes + * concurrently we'll exit nohz. And cpu_load write can race with + * cpu_load_update_idle() but both updater would be writing the same. + */ + this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); +} + +/* + * Account the tickless load in the end of a nohz frame. + */ +void cpu_load_update_nohz_stop(void) +{ unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; + struct rq *this_rq = this_rq(); + unsigned long load; if (curr_jiffies == this_rq->last_load_update_tick) return; + load = weighted_cpuload(cpu_of(this_rq)); raw_spin_lock(&this_rq->lock); - __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); + update_rq_clock(this_rq); + cpu_load_update_nohz(this_rq, curr_jiffies, load); raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ +static inline void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) { } +#endif /* CONFIG_NO_HZ_COMMON */ + +static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) +{ +#ifdef CONFIG_NO_HZ_COMMON + /* See the mess around cpu_load_update_nohz(). */ + this_rq->last_load_update_tick = READ_ONCE(jiffies); +#endif + cpu_load_update(this_rq, load, 1); +} /* * Called from scheduler_tick() */ -void update_cpu_load_active(struct rq *this_rq) +void cpu_load_update_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); - /* - * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1, 1); + + if (tick_nohz_tick_stopped()) + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); + else + cpu_load_update_periodic(this_rq, load); } /* @@ -4706,46 +4924,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } -static void record_wakee(struct task_struct *p) -{ - /* - * Rough decay (wiping) for cost saving, don't worry - * about the boundary, really active task won't care - * about the loss. - */ - if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { - current->wakee_flips >>= 1; - current->wakee_flip_decay_ts = jiffies; - } - - if (current->last_wakee != p) { - current->last_wakee = p; - current->wakee_flips++; - } -} - -static void task_waking_fair(struct task_struct *p) -{ - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy = cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime = cfs_rq->min_vruntime; - } while (min_vruntime != min_vruntime_copy); -#else - min_vruntime = cfs_rq->min_vruntime; -#endif - - se->vruntime -= min_vruntime; - record_wakee(p); -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -4805,19 +4983,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long w, W; + struct cfs_rq *cfs_rq = se->my_q; + long W, w = cfs_rq_load_avg(cfs_rq); - tg = se->my_q->tg; + tg = cfs_rq->tg; /* * W = @wg + \Sum rw_j */ - W = wg + calc_tg_weight(tg, se->my_q); + W = wg + atomic_long_read(&tg->load_avg); + + /* Ensure \Sum rw_j >= rw_i */ + W -= cfs_rq->tg_load_avg_contrib; + W += w; /* * w = rw_i + @wl */ - w = cfs_rq_load_avg(se->my_q) + wl; + w += wl; /* * wl = S * s'_i; see (2) @@ -4861,17 +5044,39 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +static void record_wakee(struct task_struct *p) +{ + /* + * Only decay a single time; tasks that have less then 1 wakeup per + * jiffy will not have built up many flips. + */ + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { + current->wakee_flips >>= 1; + current->wakee_flip_decay_ts = jiffies; + } + + if (current->last_wakee != p) { + current->last_wakee = p; + current->wakee_flips++; + } +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. + * * A waker of many should wake a different task than the one last awakened - * at a frequency roughly N times higher than one of its wakees. In order - * to determine whether we should let the load spread vs consolodating to - * shared cache, we look for a minimum 'flip' frequency of llc_size in one - * partner, and a factor of lls_size higher frequency in the other. With - * both conditions met, we can be relatively sure that the relationship is - * non-monogamous, with partner count exceeding socket size. Waker/wakee - * being client/server, worker/dispatcher, interrupt source or whatever is - * irrelevant, spread criteria is apparent partner count exceeds socket size. + * at a frequency roughly N times higher than one of its wakees. + * + * In order to determine whether we should let the load spread vs consolidating + * to shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of lls_size higher frequency in the other. + * + * With both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. + * + * Waker/wakee being client/server, worker/dispatcher, interrupt source or + * whatever is irrelevant, spread criteria is apparent partner count exceeds + * socket size. */ static int wake_wide(struct task_struct *p) { @@ -5176,8 +5381,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + } rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5257,6 +5464,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f static void migrate_task_rq_fair(struct task_struct *p) { /* + * As blocked tasks retain absolute vruntime the migration needs to + * deal with this by subtracting the old and adding the new + * min_vruntime -- the latter is done by enqueue_entity() when placing + * the task on the new runqueue. + */ + if (p->state == TASK_WAKING) { + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; + +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; + + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; + } + + /* * We are supposed to update the task to "current" time, then its up to date * and ready to go to new CPU/cfs_rq. But we have difficulty in getting * what current time is, so simply throw away the out-of-date time. This @@ -5439,7 +5672,7 @@ preempt: } static struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev) +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -5552,9 +5785,9 @@ idle: * further scheduler activity on it and we're being very careful to * re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); new_tasks = idle_balance(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -5653,7 +5886,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * W_i,0 = \Sum_j w_i,j (2) * * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight - * is derived from the nice value as per prio_to_weight[]. + * is derived from the nice value as per sched_prio_to_weight[]. * * The weight average is an exponential decay average of the instantaneous * weight: @@ -6155,7 +6388,7 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -6216,7 +6449,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6625,6 +6858,9 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (!(env->sd->flags & SD_ASYM_PACKING)) return true; + /* No ASYM_PACKING if target cpu is already busy */ + if (env->idle == CPU_NOT_IDLE) + return true; /* * ASYM_PACKING needs to move all the work to the lowest * numbered CPUs in the group, therefore mark all groups @@ -6634,7 +6870,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (!sds->busiest) return true; - if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + /* Prefer to move from highest possible cpu's work */ + if (group_first_cpu(sds->busiest) < group_first_cpu(sg)) return true; } @@ -6780,6 +7017,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (!(env->sd->flags & SD_ASYM_PACKING)) return 0; + if (env->idle == CPU_NOT_IDLE) + return 0; + if (!sds->busiest) return 0; @@ -6888,9 +7128,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its cpu_capacity, while calculating max_load..) + * Avg load of busiest sg can be less and avg load of local sg can + * be greater than avg load across all sgs of sd because avg load + * factors in sg capacity and sgs with smaller group_type are + * skipped when updating the busiest sg: */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { @@ -6903,11 +7144,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_nr_running * - SCHED_LOAD_SCALE; - if (load_above_capacity > busiest->group_capacity) + load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; + if (load_above_capacity > busiest->group_capacity) { load_above_capacity -= busiest->group_capacity; - else + load_above_capacity *= NICE_0_LOAD; + load_above_capacity /= busiest->group_capacity; + } else load_above_capacity = ~0UL; } @@ -6915,9 +7157,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to * reduce the max loaded cpu below the average load. At the same time, - * we also don't want to reduce the group load below the group capacity - * (so that we can implement power-savings policies etc). Thus we look - * for the minimum possible imbalance. + * we also don't want to reduce the group load below the group + * capacity. Thus we look for the minimum possible imbalance. */ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); @@ -6941,10 +7182,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain - * if there is an imbalance. If there isn't an imbalance, and - * the user has opted for power-savings, it returns a group whose - * CPUs can be put to idle by rebalancing those tasks elsewhere, if - * such a group exists. + * if there is an imbalance. * * Also calculates the amount of weighted load which should be moved * to restore balance. @@ -6952,9 +7190,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * @env: The load balancing environment. * * Return: - The busiest group if imbalance exists. - * - If no imbalance and user has opted for power-savings balance, - * return the least loaded group whose CPUs can be - * put to idle by rebalancing its tasks onto our group. */ static struct sched_group *find_busiest_group(struct lb_env *env) { @@ -6972,8 +7207,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest = &sds.busiest_stat; /* ASYM feature bypasses nice load balance check */ - if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && - check_asym_packing(env, &sds)) + if (check_asym_packing(env, &sds)) return sds.busiest; /* There is no busy sibling group to pull tasks from */ @@ -7398,10 +7632,7 @@ more_balance: &busiest->active_balance_work); } - /* - * We've kicked active balancing, reset the failure - * counter. - */ + /* We've kicked active balancing, force task migration. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } } else @@ -7636,10 +7867,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); - else + /* Active balancing done, reset the failure counter. */ + sd->nr_balance_failed = 0; + } else { schedstat_inc(sd, alb_failed); + } } rcu_read_unlock(); out_unlock: @@ -7710,7 +7944,7 @@ static void nohz_balancer_kick(void) return; } -static inline void nohz_balance_exit_idle(int cpu) +void nohz_balance_exit_idle(unsigned int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* @@ -7783,18 +8017,6 @@ void nohz_balance_enter_idle(int cpu) atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } - -static int sched_ilb_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DYING: - nohz_balance_exit_idle(smp_processor_id()); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} #endif static DEFINE_SPINLOCK(balancing); @@ -7956,7 +8178,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_cpu_load_idle(rq); + cpu_load_update_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } @@ -8139,31 +8361,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. - */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -8176,8 +8384,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* @@ -8233,6 +8440,8 @@ static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (!vruntime_normalized(p)) { /* @@ -8244,13 +8453,18 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8261,7 +8475,10 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -8321,6 +8538,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -8333,6 +8558,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -8352,8 +8590,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8368,6 +8607,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -8391,6 +8632,23 @@ err: return 0; } +void online_fair_sched_group(struct task_group *tg) +{ + struct sched_entity *se; + struct rq *rq; + int i; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + se = tg->se[i]; + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + sync_throttle(tg, i); + raw_spin_unlock_irq(&rq->lock); + } +} + void unregister_fair_sched_group(struct task_group *tg) { unsigned long flags; @@ -8495,6 +8753,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } +void online_fair_sched_group(struct task_group *tg) { } + void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8537,7 +8797,6 @@ const struct sched_class fair_sched_class = { .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, - .task_waking = task_waking_fair, .task_dead = task_dead_fair, .set_cpus_allowed = set_cpus_allowed_common, #endif @@ -8555,7 +8814,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif }; @@ -8599,7 +8858,6 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); - cpu_notifier(sched_ilb_notifier, 0); #endif #endif /* SMP */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index bd12c6c714ec..9fb873cfc75c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -127,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ static void cpuidle_idle_call(void) { - struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_device *dev = cpuidle_get_device(); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; @@ -201,6 +201,8 @@ exit_idle: */ static void cpu_idle_loop(void) { + int cpu = smp_processor_id(); + while (1) { /* * If the arch has a polling bit, we maintain an invariant: @@ -219,7 +221,7 @@ static void cpu_idle_loop(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) { + if (cpu_is_offline(cpu)) { cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 47ce94931f1b..2ce5458bbe1d 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl } static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev) +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { put_prev_task(rq, prev); diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index ef7159012cf3..a2d6eb71f06b 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) loads[2] = (avenrun[2] + offset) << shift; } -long calc_load_fold_active(struct rq *this_rq) +long calc_load_fold_active(struct rq *this_rq, long adjust) { long nr_active, delta = 0; - nr_active = this_rq->nr_running; + nr_active = this_rq->nr_running - adjust; nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { @@ -99,10 +99,13 @@ long calc_load_fold_active(struct rq *this_rq) static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; } #ifdef CONFIG_NO_HZ_COMMON @@ -185,7 +188,7 @@ void calc_load_enter_idle(void) * We're going into NOHZ mode, if there's any pending delta, fold it * into the pending idle delta. */ - delta = calc_load_fold_active(this_rq); + delta = calc_load_fold_active(this_rq, 0); if (delta) { int idx = calc_load_write_idx(); @@ -386,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq) if (time_before(jiffies, this_rq->calc_load_update)) return; - delta = calc_load_fold_active(this_rq); + delta = calc_load_fold_active(this_rq, 0); if (delta) atomic_long_add(delta, &calc_load_tasks); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ec4f538d4396..d5690b722691 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -334,7 +334,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -351,7 +351,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -953,14 +953,14 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); - delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1324,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1413,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || curr->prio <= p->prio)) { int target = find_lowest_rq(p); @@ -1437,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; @@ -1445,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 + if (tsk_nr_cpus_allowed(p) != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1524,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } static struct task_struct * -pick_next_task_rt(struct rq *rq, struct task_struct *prev) +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; @@ -1536,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); pull_rt_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a dl or stop task can slip in, in which case we need @@ -1579,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1629,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1762,7 +1762,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); @@ -2122,9 +2122,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -2197,7 +2197,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) queue_push_tasks(rq); #else if (p->prio < rq->curr->prio) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec2e8d23527e..c64fc5114004 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -28,12 +28,12 @@ extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); -extern long calc_load_fold_active(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust); #ifdef CONFIG_SMP -extern void update_cpu_load_active(struct rq *this_rq); +extern void cpu_load_update_active(struct rq *this_rq); #else -static inline void update_cpu_load_active(struct rq *this_rq) { } +static inline void cpu_load_update_active(struct rq *this_rq) { } #endif /* @@ -49,25 +49,32 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } * and does not change the user-interface for setting shares/weights. * * We increase resolution only if we have enough bits to allow this increased - * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution - * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the - * increased costs. + * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are + * pretty high and the returns do not justify the increased costs. + * + * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to + * increase coverage and consistency always enable it on 64bit platforms. */ -#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ -# define SCHED_LOAD_RESOLUTION 10 -# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) -# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#ifdef CONFIG_64BIT +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT) #else -# define SCHED_LOAD_RESOLUTION 0 +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) (w) # define scale_load_down(w) (w) #endif -#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) -#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) - -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT +/* + * Task weight (visible to users) and its load (invisible to users) have + * independent resolution, but they should be well calibrated. We use + * scale_load() and scale_load_down(w) to convert between them. The + * following must be true: + * + * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * + */ +#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) /* * Single value that decides SCHED_DEADLINE internal math precision. @@ -314,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, @@ -585,11 +593,13 @@ struct rq { #endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_load_update_tick; +#endif /* CONFIG_SMP */ u64 nohz_stamp; unsigned long nohz_flags; -#endif +#endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif @@ -854,7 +864,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { atomic_t ref; /* - * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity + * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. */ unsigned int capacity; @@ -1104,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * In particular, the load of prev->state in finish_task_switch() must * happen before this. * - * Pairs with the smp_cond_acquire() in try_to_wake_up(). + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); #endif @@ -1159,7 +1169,7 @@ extern const u32 sched_prio_to_wmult[40]; * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) - * ENQUEUE_WAKING - sched_class::task_waking was called + * ENQUEUE_MIGRATED - the task was migrated during wakeup * */ @@ -1174,9 +1184,9 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_HEAD 0x08 #define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x20 +#define ENQUEUE_MIGRATED 0x20 #else -#define ENQUEUE_WAKING 0x00 +#define ENQUEUE_MIGRATED 0x00 #endif #define RETRY_TASK ((void *)-1UL) @@ -1200,14 +1210,14 @@ struct sched_class { * tasks. */ struct task_struct * (*pick_next_task) (struct rq *rq, - struct task_struct *prev); + struct task_struct *prev, + struct pin_cookie cookie); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p); - void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, @@ -1237,8 +1247,11 @@ struct sched_class { void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p); + void (*task_change_group) (struct task_struct *p, int type); #endif }; @@ -1313,6 +1326,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct sched_entity *se); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); @@ -1448,86 +1462,32 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -/* - * __task_rq_lock - lock the rq @p resides on. - */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - struct rq *rq; - - lockdep_assert_held(&p->pi_lock); - - for (;;) { - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - lockdep_pin_lock(&rq->lock); - return rq; - } - raw_spin_unlock(&rq->lock); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +}; -/* - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. - */ -static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - rq = task_rq(p); - raw_spin_lock(&rq->lock); - /* - * move_queued_task() task_rq_lock() - * - * ACQUIRE (rq->lock) - * [S] ->on_rq = MIGRATING [L] rq = task_rq() - * WMB (__set_task_cpu()) ACQUIRE (rq->lock); - * [S] ->cpu = new_cpu [L] task_rq() - * [L] ->on_rq - * RELEASE (rq->lock) - * - * If we observe the old cpu in task_rq_lock, the acquire of - * the old rq->lock will fully serialize against the stores. - * - * If we observe the new cpu in task_rq_lock, the acquire will - * pair with the WMB to ensure we must then also see migrating. - */ - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - lockdep_pin_lock(&rq->lock); - return rq; - } - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} + __acquires(rq->lock); -static inline void __task_rq_unlock(struct rq *rq) +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf->cookie); raw_spin_unlock(&rq->lock); } static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf->cookie); raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } #ifdef CONFIG_SMP @@ -1743,6 +1703,10 @@ enum rq_nohz_flag_bits { }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) + +extern void nohz_balance_exit_idle(unsigned int cpu); +#else +static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -1842,15 +1806,10 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo static inline void cpufreq_trigger_update(u64 time) {} #endif /* CONFIG_CPU_FREQ */ -static inline void account_reset_rq(struct rq *rq) -{ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - rq->prev_irq_time = 0; +#ifdef arch_scale_freq_capacity +#ifndef arch_scale_freq_invariant +#define arch_scale_freq_invariant() (true) #endif -#ifdef CONFIG_PARAVIRT - rq->prev_steal_time = 0; +#else /* arch_scale_freq_capacity */ +#define arch_scale_freq_invariant() (false) #endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - rq->prev_steal_time_rq = 0; -#endif -} diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 70b3b6a20fb0..78955cbea31c 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -33,6 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) + #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -47,6 +49,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_set(var, val) do { } while (0) +# define schedstat_val(rq, field) 0 #endif #ifdef CONFIG_SCHED_INFO diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index cbc67da10954..604297a08b3a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) } static struct task_struct * -pick_next_task_stop(struct rq *rq, struct task_struct *prev) +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct task_struct *stop = rq->stop; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e1e5a354854e..0db7c8a2afe2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -173,7 +173,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. */ -static u32 seccomp_run_filters(struct seccomp_data *sd) +static u32 seccomp_run_filters(const struct seccomp_data *sd) { struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; @@ -347,7 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; - const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE); + const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE); if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); @@ -513,24 +513,17 @@ static void seccomp_send_sigsys(int syscall, int reason) * To be fully secure this must be combined with rlimit * to limit the stack allocations too. */ -static int mode1_syscalls[] = { +static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, 0, /* null terminated */ }; -#ifdef CONFIG_COMPAT -static int mode1_syscalls_32[] = { - __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, - 0, /* null terminated */ -}; -#endif - static void __secure_computing_strict(int this_syscall) { - int *syscall_whitelist = mode1_syscalls; + const int *syscall_whitelist = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) - syscall_whitelist = mode1_syscalls_32; + syscall_whitelist = get_compat_mode1_syscalls(); #endif do { if (*syscall_whitelist == this_syscall) @@ -549,7 +542,7 @@ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; - if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return; @@ -561,20 +554,10 @@ void secure_computing_strict(int this_syscall) BUG(); } #else -int __secure_computing(void) -{ - u32 phase1_result = seccomp_phase1(NULL); - - if (likely(phase1_result == SECCOMP_PHASE1_OK)) - return 0; - else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) - return -1; - else - return seccomp_phase2(phase1_result); -} #ifdef CONFIG_SECCOMP_FILTER -static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, + const bool recheck_after_trace) { u32 filter_ret, action; int data; @@ -606,10 +589,50 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) goto skip; case SECCOMP_RET_TRACE: - return filter_ret; /* Save the rest for phase 2. */ + /* We've been put in this state by the ptracer already. */ + if (recheck_after_trace) + return 0; + + /* ENOSYS these calls if there is no tracer attached. */ + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, + task_pt_regs(current), + -ENOSYS, 0); + goto skip; + } + + /* Allow the BPF to provide the event message */ + ptrace_event(PTRACE_EVENT_SECCOMP, data); + /* + * The delivery of a fatal signal during event + * notification may silently skip tracer notification, + * which could leave us with a potentially unmodified + * syscall that the tracer would have liked to have + * changed. Since the process is about to die, we just + * force the syscall to be skipped and let the signal + * kill the process and correctly handle any tracer exit + * notifications. + */ + if (fatal_signal_pending(current)) + goto skip; + /* Check if the tracer forced the syscall to be skipped. */ + this_syscall = syscall_get_nr(current, task_pt_regs(current)); + if (this_syscall < 0) + goto skip; + + /* + * Recheck the syscall, since it may have changed. This + * intentionally uses a NULL struct seccomp_data to force + * a reload of all registers. This does not goto skip since + * a skip would have already been reported. + */ + if (__seccomp_filter(this_syscall, NULL, true)) + return -1; + + return 0; case SECCOMP_RET_ALLOW: - return SECCOMP_PHASE1_OK; + return 0; case SECCOMP_RET_KILL: default: @@ -621,96 +644,38 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) skip: audit_seccomp(this_syscall, 0, action); - return SECCOMP_PHASE1_SKIP; + return -1; +} +#else +static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, + const bool recheck_after_trace) +{ + BUG(); } #endif -/** - * seccomp_phase1() - run fast path seccomp checks on the current syscall - * @arg sd: The seccomp_data or NULL - * - * This only reads pt_regs via the syscall_xyz helpers. The only change - * it will make to pt_regs is via syscall_set_return_value, and it will - * only do that if it returns SECCOMP_PHASE1_SKIP. - * - * If sd is provided, it will not read pt_regs at all. - * - * It may also call do_exit or force a signal; these actions must be - * safe. - * - * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should - * be processed normally. - * - * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be - * invoked. In this case, seccomp_phase1 will have set the return value - * using syscall_set_return_value. - * - * If it returns anything else, then the return value should be passed - * to seccomp_phase2 from a context in which ptrace hooks are safe. - */ -u32 seccomp_phase1(struct seccomp_data *sd) +int __secure_computing(const struct seccomp_data *sd) { int mode = current->seccomp.mode; - int this_syscall = sd ? sd->nr : - syscall_get_nr(current, task_pt_regs(current)); + int this_syscall; - if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) - return SECCOMP_PHASE1_OK; + return 0; + + this_syscall = sd ? sd->nr : + syscall_get_nr(current, task_pt_regs(current)); switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ - return SECCOMP_PHASE1_OK; -#ifdef CONFIG_SECCOMP_FILTER + return 0; case SECCOMP_MODE_FILTER: - return __seccomp_phase1_filter(this_syscall, sd); -#endif + return __seccomp_filter(this_syscall, sd, false); default: BUG(); } } - -/** - * seccomp_phase2() - finish slow path seccomp work for the current syscall - * @phase1_result: The return value from seccomp_phase1() - * - * This must be called from a context in which ptrace hooks can be used. - * - * Returns 0 if the syscall should be processed or -1 to skip the syscall. - */ -int seccomp_phase2(u32 phase1_result) -{ - struct pt_regs *regs = task_pt_regs(current); - u32 action = phase1_result & SECCOMP_RET_ACTION; - int data = phase1_result & SECCOMP_RET_DATA; - - BUG_ON(action != SECCOMP_RET_TRACE); - - audit_seccomp(syscall_get_nr(current, regs), 0, action); - - /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { - syscall_set_return_value(current, regs, - -ENOSYS, 0); - return -1; - } - - /* Allow the BPF to provide the event message */ - ptrace_event(PTRACE_EVENT_SECCOMP, data); - /* - * The delivery of a fatal signal during event - * notification may silently skip tracer notification. - * Terminating the task now avoids executing a system - * call that may not be intended. - */ - if (fatal_signal_pending(current)) - do_exit(SIGSYS); - if (syscall_get_nr(current, regs) < 0) - return -1; /* Explicit request to skip. */ - - return 0; -} #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ long prctl_get_seccomp(void) @@ -915,7 +880,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, fprog = filter->prog->orig_prog; if (!fprog) { - /* This must be a new non-cBPF filter, since we save every + /* This must be a new non-cBPF filter, since we save * every cBPF filter's orig_prog above when * CONFIG_CHECKPOINT_RESTORE is enabled. */ diff --git a/kernel/signal.c b/kernel/signal.c index aa9bf00749c1..af21afc00d08 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -224,7 +224,7 @@ static inline void print_dropped_signal(int sig) if (!__ratelimit(&ratelimit_state)) return; - printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", + pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", current->comm, current->pid, sig); } @@ -1089,10 +1089,10 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, static void print_fatal_signal(int signr) { struct pt_regs *regs = signal_pt_regs(); - printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr); + pr_info("potentially unexpected fatal signal %d.\n", signr); #if defined(__i386__) && !defined(__arch_um__) - printk(KERN_INFO "code at %08lx: ", regs->ip); + pr_info("code at %08lx: ", regs->ip); { int i; for (i = 0; i < 16; i++) { @@ -1100,10 +1100,10 @@ static void print_fatal_signal(int signr) if (get_user(insn, (unsigned char *)(regs->ip + i))) break; - printk(KERN_CONT "%02x ", insn); + pr_cont("%02x ", insn); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); #endif preempt_disable(); show_regs(regs); @@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) * @ts: upper bound on process time suspension */ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, - const struct timespec *ts) + const struct timespec *ts) { + ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX }; struct task_struct *tsk = current; - long timeout = MAX_SCHEDULE_TIMEOUT; sigset_t mask = *which; - int sig; + int sig, ret = 0; if (ts) { if (!timespec_valid(ts)) return -EINVAL; - timeout = timespec_to_jiffies(ts); - /* - * We can be close to the next tick, add another one - * to ensure we will wait at least the time asked for. - */ - if (ts->tv_sec || ts->tv_nsec) - timeout++; + timeout = timespec_to_ktime(*ts); + to = &timeout; } /* @@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); sig = dequeue_signal(tsk, &mask, info); - if (!sig && timeout) { + if (!sig && timeout.tv64) { /* * None ready, temporarily unblock those we're interested * while we are sleeping in so that we'll be awakened when @@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); - timeout = freezable_schedule_timeout_interruptible(timeout); - + __set_current_state(TASK_INTERRUPTIBLE); + ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, + HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); @@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, if (sig) return sig; - return timeout ? -EINTR : -EAGAIN; + return ret ? -EINTR : -EAGAIN; } /** @@ -3099,12 +3095,14 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s oss.ss_sp = (void __user *) current->sas_ss_sp; oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); + oss.ss_flags = sas_ss_flags(sp) | + (current->sas_ss_flags & SS_FLAG_BITS); if (uss) { void __user *ss_sp; size_t ss_size; - int ss_flags; + unsigned ss_flags; + int ss_mode; error = -EFAULT; if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) @@ -3119,18 +3117,13 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s if (on_sig_stack(sp)) goto out; + ss_mode = ss_flags & ~SS_FLAG_BITS; error = -EINVAL; - /* - * Note - this code used to test ss_flags incorrectly: - * old code may have been written using ss_flags==0 - * to mean ss_flags==SS_ONSTACK (as this was the only - * way that worked) - this fix preserves that older - * mechanism. - */ - if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) + if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && + ss_mode != 0) goto out; - if (ss_flags == SS_DISABLE) { + if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { @@ -3141,6 +3134,7 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s current->sas_ss_sp = (unsigned long) ss_sp; current->sas_ss_size = ss_size; + current->sas_ss_flags = ss_flags; } error = 0; @@ -3171,9 +3165,14 @@ int restore_altstack(const stack_t __user *uss) int __save_altstack(stack_t __user *uss, unsigned long sp) { struct task_struct *t = current; - return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | - __put_user(sas_ss_flags(sp), &uss->ss_flags) | + int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | + __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); + if (err) + return err; + if (t->sas_ss_flags & SS_AUTODISARM) + sas_ss_reset(t); + return 0; } #ifdef CONFIG_COMPAT diff --git a/kernel/smp.c b/kernel/smp.c index 74165443c240..3aa642d39c03 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -33,69 +33,54 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static void flush_smp_call_function_queue(bool warn_cpu_offline); -static int -hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) +int smpcfd_prepare_cpu(unsigned int cpu) { - long cpu = (long)hcpu; struct call_function_data *cfd = &per_cpu(cfd_data, cpu); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, - cpu_to_node(cpu))) - return notifier_from_errno(-ENOMEM); - cfd->csd = alloc_percpu(struct call_single_data); - if (!cfd->csd) { - free_cpumask_var(cfd->cpumask); - return notifier_from_errno(-ENOMEM); - } - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - /* Fall-through to the CPU_DEAD[_FROZEN] case. */ - - case CPU_DEAD: - case CPU_DEAD_FROZEN: + if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + cpu_to_node(cpu))) + return -ENOMEM; + cfd->csd = alloc_percpu(struct call_single_data); + if (!cfd->csd) { free_cpumask_var(cfd->cpumask); - free_percpu(cfd->csd); - break; + return -ENOMEM; + } - case CPU_DYING: - case CPU_DYING_FROZEN: - /* - * The IPIs for the smp-call-function callbacks queued by other - * CPUs might arrive late, either due to hardware latencies or - * because this CPU disabled interrupts (inside stop-machine) - * before the IPIs were sent. So flush out any pending callbacks - * explicitly (without waiting for the IPIs to arrive), to - * ensure that the outgoing CPU doesn't go offline with work - * still pending. - */ - flush_smp_call_function_queue(false); - break; -#endif - }; + return 0; +} + +int smpcfd_dead_cpu(unsigned int cpu) +{ + struct call_function_data *cfd = &per_cpu(cfd_data, cpu); - return NOTIFY_OK; + free_cpumask_var(cfd->cpumask); + free_percpu(cfd->csd); + return 0; } -static struct notifier_block hotplug_cfd_notifier = { - .notifier_call = hotplug_cfd, -}; +int smpcfd_dying_cpu(unsigned int cpu) +{ + /* + * The IPIs for the smp-call-function callbacks queued by other + * CPUs might arrive late, either due to hardware latencies or + * because this CPU disabled interrupts (inside stop-machine) + * before the IPIs were sent. So flush out any pending callbacks + * explicitly (without waiting for the IPIs to arrive), to + * ensure that the outgoing CPU doesn't go offline with work + * still pending. + */ + flush_smp_call_function_queue(false); + return 0; +} void __init call_function_init(void) { - void *cpu = (void *)(long)smp_processor_id(); int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(call_single_queue, i)); - hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); - register_cpu_notifier(&hotplug_cfd_notifier); + smpcfd_prepare_cpu(smp_processor_id()); } /* @@ -107,7 +92,7 @@ void __init call_function_init(void) */ static __always_inline void csd_lock_wait(struct call_single_data *csd) { - smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); + smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } static __always_inline void csd_lock(struct call_single_data *csd) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a467e6c28a3b..4a1ca5f6da7e 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -21,6 +21,7 @@ #include <linux/smpboot.h> #include <linux/atomic.h> #include <linux/lglock.h> +#include <linux/nmi.h> /* * Structure to determine completion condition and record errors. May @@ -209,6 +210,13 @@ static int multi_cpu_stop(void *data) break; } ack_state(msdata); + } else if (curstate > MULTI_STOP_PREPARE) { + /* + * At this stage all other CPUs we depend on must spin + * in the same loop. Any reason for hard-lockup should + * be detected and reported on their side. + */ + touch_nmi_watchdog(); } } while (curstate != MULTI_STOP_EXIT); diff --git a/kernel/sys.c b/kernel/sys.c index cf8ba545c7d3..89d5be418157 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2246,7 +2246,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) return -EINVAL; - down_write(&me->mm->mmap_sem); + if (down_write_killable(&me->mm->mmap_sem)) + return -EINTR; if (arg2) me->mm->def_flags |= VM_NOHUGEPAGE; else diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 725587f10667..a13bbdaab47d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -130,6 +130,9 @@ static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif +#ifdef CONFIG_PERF_EVENTS +static int six_hundred_forty_kb = 640 * 1024; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -811,6 +814,13 @@ static struct ctl_table kern_table[] = { .extra2 = &ten_thousand, }, { + .procname = "printk_devkmsg", + .data = devkmsg_log_str, + .maxlen = DEVKMSG_STR_MAX_SIZE, + .mode = 0644, + .proc_handler = devkmsg_sysctl_set_loglvl, + }, + { .procname = "dmesg_restrict", .data = &dmesg_restrict, .maxlen = sizeof(int), @@ -1144,6 +1154,24 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .procname = "perf_event_max_stack", + .data = &sysctl_perf_event_max_stack, + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = &zero, + .extra2 = &six_hundred_forty_kb, + }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = &zero, + .extra2 = &one_thousand, + }, #endif #ifdef CONFIG_KMEMCHECK { @@ -1184,6 +1212,17 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; @@ -1476,8 +1515,8 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_NUMA { .procname = "zone_reclaim_mode", - .data = &zone_reclaim_mode, - .maxlen = sizeof(zone_reclaim_mode), + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, @@ -1509,6 +1548,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, #endif #ifdef CONFIG_MMU { @@ -2094,6 +2140,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } +static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*negp) + return -EINVAL; + *valp = *lvalp; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long)val; + } + return 0; +} + static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, @@ -2213,8 +2274,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write, int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - NULL,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); +} + +/** + * proc_douintvec - read a vector of unsigned integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* @@ -2812,6 +2892,12 @@ int proc_dointvec(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2857,6 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, * exception granted :-) */ EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 10a1d7dc9313..6eb99c17dbd8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -13,6 +13,7 @@ #include <linux/ctype.h> #include <linux/netdevice.h> #include <linux/kernel.h> +#include <linux/uuid.h> #include <linux/slab.h> #include <linux/compat.h> @@ -1117,9 +1118,8 @@ static ssize_t bin_uuid(struct file *file, /* Only supports reads */ if (oldval && oldlen) { - char buf[40], *str = buf; - unsigned char uuid[16]; - int i; + char buf[UUID_STRING_LEN + 1]; + uuid_be uuid; result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) @@ -1127,24 +1127,15 @@ static ssize_t bin_uuid(struct file *file, buf[result] = '\0'; - /* Convert the uuid to from a string to binary */ - for (i = 0; i < 16; i++) { - result = -EIO; - if (!isxdigit(str[0]) || !isxdigit(str[1])) - goto out; - - uuid[i] = (hex_to_bin(str[0]) << 4) | - hex_to_bin(str[1]); - str += 2; - if (*str == '-') - str++; - } + result = -EIO; + if (uuid_be_to_bin(buf, &uuid)) + goto out; if (oldlen > 16) oldlen = 16; result = -EFAULT; - if (copy_to_user(oldval, uuid, oldlen)) + if (copy_to_user(oldval, &uuid, oldlen)) goto out; copied = oldlen; diff --git a/kernel/task_work.c b/kernel/task_work.c index 53fa971d000d..d513051fcca2 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -29,7 +29,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) struct callback_head *head; do { - head = ACCESS_ONCE(task->task_works); + head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; @@ -57,6 +57,9 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; + + if (likely(!task->task_works)) + return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the @@ -64,8 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = ACCESS_ONCE(*pprev))) { - smp_read_barrier_depends(); + while ((work = lockless_dereference(*pprev))) { if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) @@ -95,7 +97,7 @@ void task_work_run(void) * work_exited unless the list is empty. */ do { - work = ACCESS_ONCE(task->task_works); + work = READ_ONCE(task->task_works); head = !work && (task->flags & PF_EXITING) ? &work_exited : NULL; } while (cmpxchg(&task->task_works, work, head) != work); @@ -108,7 +110,6 @@ void task_work_run(void) * fail, but it can play with *work and other entries. */ raw_spin_unlock_wait(&task->pi_lock); - smp_mb(); do { next = work->next; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 21f82c29c914..b3f05ee20d18 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -357,10 +357,6 @@ static int parse(struct nlattr *na, struct cpumask *mask) return ret; } -#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define TASKSTATS_NEEDS_PADDING 1 -#endif - static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) { struct nlattr *na, *ret; @@ -370,29 +366,6 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; - /* - * The taskstats structure is internally aligned on 8 byte - * boundaries but the layout of the aggregrate reply, with - * two NLA headers and the pid (each 4 bytes), actually - * force the entire structure to be unaligned. This causes - * the kernel to issue unaligned access warnings on some - * architectures like ia64. Unfortunately, some software out there - * doesn't properly unroll the NLA packet and assumes that the start - * of the taskstats structure will always be 20 bytes from the start - * of the netlink payload. Aligning the start of the taskstats - * structure breaks this software, which we don't want. So, for now - * the alignment only happens on architectures that require it - * and those users will have to update to fixed versions of those - * packages. Space is reserved in the packet only when needed. - * This ifdef should be removed in several years e.g. 2012 once - * we can be confident that fixed versions are installed on most - * systems. We add the padding before the aggregate since the - * aggregate is already a defined type. - */ -#ifdef TASKSTATS_NEEDS_PADDING - if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) - goto err; -#endif na = nla_nest_start(skb, aggr); if (!na) goto err; @@ -401,7 +374,8 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) nla_nest_cancel(skb, na); goto err; } - ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); + ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS, + sizeof(struct taskstats), TASKSTATS_TYPE_NULL); if (!ret) { nla_nest_cancel(skb, na); goto err; @@ -500,10 +474,9 @@ static size_t taskstats_packet_size(void) size_t size; size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); -#ifdef TASKSTATS_NEEDS_PADDING - size += nla_total_size(0); /* Padding for alignment */ -#endif + nla_total_size_64bit(sizeof(struct taskstats)) + + nla_total_size(0); + return size; } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e840ed867a5d..c3aad685bbc0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -30,7 +30,6 @@ * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base * @timerqueue: Timerqueue head managing the list of events - * @timer: hrtimer used to schedule events while running * @gettime: Function to read the time correlating to the base * @base_clockid: clockid for the base */ diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a9b76a40319e..2c5bc77c0bb0 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -645,7 +645,7 @@ void tick_cleanup_dead_cpu(int cpu) #endif #ifdef CONFIG_SYSFS -struct bus_type clockevents_subsys = { +static struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 56ece145a814..6a5a310a1a53 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -669,10 +669,12 @@ static void clocksource_enqueue(struct clocksource *cs) struct list_head *entry = &clocksource_list; struct clocksource *tmp; - list_for_each_entry(tmp, &clocksource_list, list) + list_for_each_entry(tmp, &clocksource_list, list) { /* Keep track of the place, where to insert */ - if (tmp->rating >= cs->rating) - entry = &tmp->list; + if (tmp->rating < cs->rating) + break; + entry = &tmp->list; + } list_add(&cs->list, entry); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa0b983290cf..9ba7c820fc23 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -177,7 +177,7 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) @@ -334,7 +334,7 @@ static void *hrtimer_debug_hint(void *addr) * fixup_init is called when: * - an active object is initialized */ -static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; @@ -342,30 +342,25 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ -static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - WARN_ON_ONCE(1); - return 0; - case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -373,7 +368,7 @@ static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; @@ -381,9 +376,9 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -430,6 +425,7 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } +EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } @@ -1594,7 +1590,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, /* * Functions related to boot-time initialization: */ -static void init_hrtimers_cpu(int cpu) +int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; @@ -1606,6 +1602,7 @@ static void init_hrtimers_cpu(int cpu) cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); + return 0; } #ifdef CONFIG_HOTPLUG_CPU @@ -1640,7 +1637,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } -static void migrate_hrtimers(int scpu) +int hrtimers_dead_cpu(unsigned int scpu) { struct hrtimer_cpu_base *old_base, *new_base; int i; @@ -1669,45 +1666,14 @@ static void migrate_hrtimers(int scpu) /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); local_irq_enable(); + return 0; } #endif /* CONFIG_HOTPLUG_CPU */ -static int hrtimer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - int scpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - init_hrtimers_cpu(scpu); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_hrtimers(scpu); - break; -#endif - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block hrtimers_nb = { - .notifier_call = hrtimer_cpu_notify, -}; - void __init hrtimers_init(void) { - hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&hrtimers_nb); + hrtimers_prepare_cpu(smp_processor_id()); } /** diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1cafba860b08..39008d78927a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -777,6 +777,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) timer->it.cpu.expires = 0; sample_to_timespec(timer->it_clock, timer->it.cpu.expires, &itp->it_value); + return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); unlock_task_sighand(p, &flags); diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index e622ba365a13..b0928ab3270f 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -43,13 +43,13 @@ static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) int allowed_error_ns = usecs * 5; for (i = 0; i < iters; ++i) { - struct timespec ts1, ts2; + s64 kt1, kt2; int time_passed; - ktime_get_ts(&ts1); + kt1 = ktime_get_ns(); udelay(usecs); - ktime_get_ts(&ts2); - time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); + kt2 = ktime_get_ns(); + time_passed = kt2 - kt1; if (i == 0 || time_passed < min) min = time_passed; @@ -87,11 +87,11 @@ static int udelay_test_show(struct seq_file *s, void *v) if (usecs > 0 && iters > 0) { return udelay_test_single(s, usecs, iters); } else if (usecs == 0) { - struct timespec ts; + struct timespec64 ts; - ktime_get_ts(&ts); - seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", - loops_per_jiffy, ts.tv_sec, ts.tv_nsec); + ktime_get_ts64(&ts); + seq_printf(s, "udelay() test (lpj=%ld kt=%lld.%09ld)\n", + loops_per_jiffy, (s64)ts.tv_sec, ts.tv_nsec); seq_puts(s, "usage:\n"); seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 53d7184da0be..690b797f522e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -75,6 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) } static struct clock_event_device ce_broadcast_hrtimer = { + .name = "bc_hrtimer", .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 966a5a6fdd0a..f738251000fe 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { } DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); +void timer_clear_idle(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 58e3310c9b21..2ec7c00228f3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -31,7 +31,7 @@ #include <trace/events/timer.h> /* - * Per cpu nohz control structure + * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); @@ -61,7 +61,7 @@ static void tick_do_update_jiffies64(ktime_t now) if (delta.tv64 < tick_period.tv64) return; - /* Reevalute with jiffies_lock held */ + /* Reevaluate with jiffies_lock held */ write_seqlock(&jiffies_lock); delta = ktime_sub(now, last_jiffies_update); @@ -116,8 +116,8 @@ static void tick_sched_do_timer(ktime_t now) #ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about - * concurrency: This happens only when the cpu in charge went - * into a long sleep. If two cpus happen to assign themself to + * concurrency: This happens only when the CPU in charge went + * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. */ @@ -262,7 +262,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep, { int prev; - prev = atomic_fetch_or(dep, BIT(bit)); + prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } @@ -292,7 +292,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) ts = per_cpu_ptr(&tick_cpu_sched, cpu); - prev = atomic_fetch_or(&ts->tick_dep_mask, BIT(bit)); + prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ @@ -349,7 +349,7 @@ void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bi /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: - * perf events, posix cpu timers, ... + * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { @@ -509,8 +509,8 @@ int tick_nohz_tick_stopped(void) * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy - * value. We do this unconditionally on any cpu, as we don't know whether the - * cpu, which has the update task assigned is in a long sleep. + * value. We do this unconditionally on any CPU, as we don't know whether the + * CPU, which has the update task assigned is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { @@ -526,7 +526,7 @@ static void tick_nohz_update_jiffies(ktime_t now) } /* - * Updates the per cpu time idle statistics counters + * Updates the per-CPU time idle statistics counters */ static void update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) @@ -566,12 +566,12 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) } /** - * get_cpu_idle_time_us - get the total idle time of a cpu + * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * - * Return the cummulative idle time (since boot) for a given + * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, @@ -607,12 +607,12 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** - * get_cpu_iowait_time_us - get the total iowait time of a cpu + * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * - * Return the cummulative iowait time (since boot) for a given + * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, @@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { tick.tv64 = 0; + + /* + * Tell the timer code that the base is not idle, i.e. undo + * the effect of get_next_timer_interrupt(): + */ + timer_clear_idle(); /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. @@ -726,14 +732,14 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } /* - * If this cpu is the one which updates jiffies, then give up - * the assignment and let it be taken by the cpu which runs - * the tick timer next, which might be this cpu as well. If we + * If this CPU is the one which updates jiffies, then give up + * the assignment and let it be taken by the CPU which runs + * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it - * was the one which had the do_timer() duty last. If this cpu + * was the one which had the do_timer() duty last. If this CPU * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferement value. + * sleep time to the timekeeping max_deferment value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); @@ -776,6 +782,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (!ts->tick_stopped) { nohz_balance_enter_idle(cpu); calc_load_enter_idle(); + cpu_load_update_nohz_start(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -802,11 +809,17 @@ out: return tick; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(active); + cpu_load_update_nohz_stop(); + + /* + * Clear the timer idle flag, so we avoid IPIs on remote queueing and + * the clock forward checks in the enqueue path: + */ + timer_clear_idle(); calc_load_exit_idle(); touch_softlockup_watchdog_sched(); @@ -833,16 +846,16 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (can_stop_full_tick(ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) - tick_nohz_restart_sched_tick(ts, ktime_get(), 1); + tick_nohz_restart_sched_tick(ts, ktime_get()); #endif } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* - * If this cpu is offline and it is the one which updates + * If this CPU is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by - * the cpu which runs the tick timer next. If we don't drop + * the CPU which runs the tick timer next. If we don't drop * this here the jiffies might be stale and do_timer() never * invoked. */ @@ -932,11 +945,11 @@ void tick_nohz_idle_enter(void) WARN_ON_ONCE(irqs_disabled()); /* - * Update the idle state in the scheduler domain hierarchy - * when tick_nohz_stop_sched_tick() is called from the idle loop. - * State will be updated to busy during the first busy tick after - * exiting idle. - */ + * Update the idle state in the scheduler domain hierarchy + * when tick_nohz_stop_sched_tick() is called from the idle loop. + * State will be updated to busy during the first busy tick after + * exiting idle. + */ set_cpu_sd_state_idle(); local_irq_disable(); @@ -1024,7 +1037,7 @@ void tick_nohz_idle_exit(void) tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now, 0); + tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_ticks(ts); } @@ -1091,35 +1104,6 @@ static void tick_nohz_switch_to_nohz(void) tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } -/* - * When NOHZ is enabled and the tick is stopped, we need to kick the - * tick timer from irq_enter() so that the jiffies update is kept - * alive during long running softirqs. That's ugly as hell, but - * correctness is key even if we need to fix the offending softirq in - * the first place. - * - * Note, this is different to tick_nohz_restart. We just kick the - * timer and do not touch the other magic bits which need to be done - * when idle is left. - */ -static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) -{ -#if 0 - /* Switch back to 2.6.27 behaviour */ - ktime_t delta; - - /* - * Do not touch the tick device, when the next expiry is either - * already reached or less/equal than the tick period. - */ - delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); - if (delta.tv64 <= tick_period.tv64) - return; - - tick_nohz_restart(ts, now); -#endif -} - static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); @@ -1130,10 +1114,8 @@ static inline void tick_nohz_irq_enter(void) now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) { + if (ts->tick_stopped) tick_nohz_update_jiffies(now); - tick_nohz_kick_tick(ts, now); - } } #else @@ -1210,7 +1192,7 @@ void tick_setup_sched_timer(void) hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - /* Get the next period (per cpu) */ + /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert jiffies_lock contention. */ diff --git a/kernel/time/time.c b/kernel/time/time.c index be115b020d27..667b9335f5d6 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -160,15 +160,15 @@ static inline void warp_clock(void) * various programs will get confused when the clock gets warped. */ -int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) +int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; - if (tv && !timespec_valid(tv)) + if (tv && !timespec64_valid(tv)) return -EINVAL; - error = security_settime(tv, tz); + error = security_settime64(tv, tz); if (error) return error; @@ -186,7 +186,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) } } if (tv) - return do_settimeofday(tv); + return do_settimeofday64(tv); return 0; } @@ -769,3 +769,24 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } + +/* + * Add two timespec64 values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0). + * And, each timespec64 is in normalized form. + */ +struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs) +{ + struct timespec64 res; + + set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { + res.tv_sec = TIME64_MAX; + res.tv_nsec = 0; + } + + return res; +} diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 86628e755f38..7142580ad94f 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c @@ -67,20 +67,21 @@ static const unsigned short __mon_yday[2][13] = { #define SECS_PER_DAY (SECS_PER_HOUR * 24) /** - * time_to_tm - converts the calendar time to local broken-down time + * time64_to_tm - converts the calendar time to local broken-down time * * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). * @offset offset seconds adding to totalsecs. * @result pointer to struct tm variable to receive broken-down time */ -void time_to_tm(time_t totalsecs, int offset, struct tm *result) +void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { long days, rem, y; + int remainder; const unsigned short *ip; - days = totalsecs / SECS_PER_DAY; - rem = totalsecs % SECS_PER_DAY; + days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); + rem = remainder; rem += offset; while (rem < 0) { rem += SECS_PER_DAY; @@ -124,4 +125,4 @@ void time_to_tm(time_t totalsecs, int offset, struct tm *result) result->tm_mon = y; result->tm_mday = days + 1; } -EXPORT_SYMBOL(time_to_tm); +EXPORT_SYMBOL(time64_to_tm); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25cd3d4f..e07fb093f819 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); + now = ktime_to_ns(tkr->base); + + now += clocksource_delta(tkr->read(tkr->clock), + tkr->cycle_last, tkr->mask); } while (read_seqcount_retry(&tkf->seq, seq)); return now; @@ -480,10 +483,12 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) * users are removed, this can be killed. */ remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); - tk->tkr_mono.xtime_nsec -= remainder; - tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; - tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; + if (remainder != 0) { + tk->tkr_mono.xtime_nsec -= remainder; + tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; + } } #else #define old_vsyscall_fixup(tk) @@ -2186,6 +2191,7 @@ struct timespec64 get_monotonic_coarse64(void) return now; } +EXPORT_SYMBOL(get_monotonic_coarse64); /* * Must hold jiffies_lock diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd65236712..107310a6f36f 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -23,7 +23,9 @@ #include "timekeeping_internal.h" -static unsigned int sleep_time_bin[32] = {0}; +#define NUM_BINS 32 + +static unsigned int sleep_time_bin[NUM_BINS] = {0}; static int tk_debug_show_sleep_time(struct seq_file *s, void *data) { @@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init); void tk_debug_account_sleep_time(struct timespec64 *t) { - sleep_time_bin[fls(t->tv_sec)]++; + /* Cap bin index so we don't overflow the array */ + int bin = min(fls(t->tv_sec), NUM_BINS-1); + + sleep_time_bin[bin]++; } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 73164c3aa56b..32bf6f75a8fe 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -59,43 +59,153 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); /* - * per-CPU timer vector definitions: + * The timer wheel has LVL_DEPTH array levels. Each level provides an array of + * LVL_SIZE buckets. Each level is driven by its own clock and therefor each + * level has a different granularity. + * + * The level granularity is: LVL_CLK_DIV ^ lvl + * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) + * + * The array level of a newly armed timer depends on the relative expiry + * time. The farther the expiry time is away the higher the array level and + * therefor the granularity becomes. + * + * Contrary to the original timer wheel implementation, which aims for 'exact' + * expiry of the timers, this implementation removes the need for recascading + * the timers into the lower array levels. The previous 'classic' timer wheel + * implementation of the kernel already violated the 'exact' expiry by adding + * slack to the expiry time to provide batched expiration. The granularity + * levels provide implicit batching. + * + * This is an optimization of the original timer wheel implementation for the + * majority of the timer wheel use cases: timeouts. The vast majority of + * timeout timers (networking, disk I/O ...) are canceled before expiry. If + * the timeout expires it indicates that normal operation is disturbed, so it + * does not matter much whether the timeout comes with a slight delay. + * + * The only exception to this are networking timers with a small expiry + * time. They rely on the granularity. Those fit into the first wheel level, + * which has HZ granularity. + * + * We don't have cascading anymore. timers with a expiry time above the + * capacity of the last wheel level are force expired at the maximum timeout + * value of the last wheel level. From data sampling we know that the maximum + * value observed is 5 days (network connection tracking), so this should not + * be an issue. + * + * The currently chosen array constants values are a good compromise between + * array size and granularity. + * + * This results in the following granularity and range levels: + * + * HZ 1000 steps + * Level Offset Granularity Range + * 0 0 1 ms 0 ms - 63 ms + * 1 64 8 ms 64 ms - 511 ms + * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) + * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) + * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) + * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) + * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) + * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) + * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) + * + * HZ 300 + * Level Offset Granularity Range + * 0 0 3 ms 0 ms - 210 ms + * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) + * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) + * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) + * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) + * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) + * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) + * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) + * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) + * + * HZ 250 + * Level Offset Granularity Range + * 0 0 4 ms 0 ms - 255 ms + * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) + * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) + * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) + * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) + * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) + * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) + * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) + * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) + * + * HZ 100 + * Level Offset Granularity Range + * 0 0 10 ms 0 ms - 630 ms + * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) + * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) + * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) + * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) + * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) + * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) + * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) */ -#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) -#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) -#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) - -struct tvec { - struct hlist_head vec[TVN_SIZE]; -}; -struct tvec_root { - struct hlist_head vec[TVR_SIZE]; -}; +/* Clock divisor for the next level */ +#define LVL_CLK_SHIFT 3 +#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) +#define LVL_CLK_MASK (LVL_CLK_DIV - 1) +#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) +#define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) -struct tvec_base { - spinlock_t lock; - struct timer_list *running_timer; - unsigned long timer_jiffies; - unsigned long next_timer; - unsigned long active_timers; - unsigned long all_timers; - int cpu; - bool migration_enabled; - bool nohz_active; - struct tvec_root tv1; - struct tvec tv2; - struct tvec tv3; - struct tvec tv4; - struct tvec tv5; -} ____cacheline_aligned; +/* + * The time start value for each level to select the bucket at enqueue + * time. + */ +#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) + +/* Size of each clock level */ +#define LVL_BITS 6 +#define LVL_SIZE (1UL << LVL_BITS) +#define LVL_MASK (LVL_SIZE - 1) +#define LVL_OFFS(n) ((n) * LVL_SIZE) + +/* Level depth */ +#if HZ > 100 +# define LVL_DEPTH 9 +# else +# define LVL_DEPTH 8 +#endif + +/* The cutoff (max. capacity of the wheel) */ +#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) +#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) +/* + * The resulting wheel size. If NOHZ is configured we allocate two + * wheels so we have a separate storage for the deferrable timers. + */ +#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) + +#ifdef CONFIG_NO_HZ_COMMON +# define NR_BASES 2 +# define BASE_STD 0 +# define BASE_DEF 1 +#else +# define NR_BASES 1 +# define BASE_STD 0 +# define BASE_DEF 0 +#endif + +struct timer_base { + spinlock_t lock; + struct timer_list *running_timer; + unsigned long clk; + unsigned long next_expiry; + unsigned int cpu; + bool migration_enabled; + bool nohz_active; + bool is_idle; + DECLARE_BITMAP(pending_map, WHEEL_SIZE); + struct hlist_head vectors[WHEEL_SIZE]; +} ____cacheline_aligned; -static DEFINE_PER_CPU(struct tvec_base, tvec_bases); +static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) unsigned int sysctl_timer_migration = 1; @@ -106,15 +216,17 @@ void timers_update_migration(bool update_nohz) unsigned int cpu; /* Avoid the loop, if nothing to update */ - if (this_cpu_read(tvec_bases.migration_enabled) == on) + if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on) return; for_each_possible_cpu(cpu) { - per_cpu(tvec_bases.migration_enabled, cpu) = on; + per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; + per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on; per_cpu(hrtimer_bases.migration_enabled, cpu) = on; if (!update_nohz) continue; - per_cpu(tvec_bases.nohz_active, cpu) = true; + per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; + per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true; per_cpu(hrtimer_bases.nohz_active, cpu) = true; } } @@ -133,20 +245,6 @@ int timer_migration_handler(struct ctl_table *table, int write, mutex_unlock(&mutex); return ret; } - -static inline struct tvec_base *get_target_base(struct tvec_base *base, - int pinned) -{ - if (pinned || !base->migration_enabled) - return this_cpu_ptr(&tvec_bases); - return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); -} -#else -static inline struct tvec_base *get_target_base(struct tvec_base *base, - int pinned) -{ - return this_cpu_ptr(&tvec_bases); -} #endif static unsigned long round_jiffies_common(unsigned long j, int cpu, @@ -351,101 +449,126 @@ unsigned long round_jiffies_up_relative(unsigned long j) } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); -/** - * set_timer_slack - set the allowed slack for a timer - * @timer: the timer to be modified - * @slack_hz: the amount of time (in jiffies) allowed for rounding - * - * Set the amount of time, in jiffies, that a certain timer has - * in terms of slack. By setting this value, the timer subsystem - * will schedule the actual timer somewhere between - * the time mod_timer() asks for, and that time plus the slack. - * - * By setting the slack to -1, a percentage of the delay is used - * instead. - */ -void set_timer_slack(struct timer_list *timer, int slack_hz) + +static inline unsigned int timer_get_idx(struct timer_list *timer) { - timer->slack = slack_hz; + return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; } -EXPORT_SYMBOL_GPL(set_timer_slack); -static void -__internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) { - unsigned long expires = timer->expires; - unsigned long idx = expires - base->timer_jiffies; - struct hlist_head *vec; + timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | + idx << TIMER_ARRAYSHIFT; +} - if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - vec = base->tv1.vec + i; - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - vec = base->tv2.vec + i; - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = base->tv3.vec + i; - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = base->tv4.vec + i; - } else if ((signed long) idx < 0) { - /* - * Can happen if you add a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); +/* + * Helper function to calculate the array index for a given expiry + * time. + */ +static inline unsigned calc_index(unsigned expires, unsigned lvl) +{ + expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + return LVL_OFFS(lvl) + (expires & LVL_MASK); +} + +static int calc_wheel_index(unsigned long expires, unsigned long clk) +{ + unsigned long delta = expires - clk; + unsigned int idx; + + if (delta < LVL_START(1)) { + idx = calc_index(expires, 0); + } else if (delta < LVL_START(2)) { + idx = calc_index(expires, 1); + } else if (delta < LVL_START(3)) { + idx = calc_index(expires, 2); + } else if (delta < LVL_START(4)) { + idx = calc_index(expires, 3); + } else if (delta < LVL_START(5)) { + idx = calc_index(expires, 4); + } else if (delta < LVL_START(6)) { + idx = calc_index(expires, 5); + } else if (delta < LVL_START(7)) { + idx = calc_index(expires, 6); + } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { + idx = calc_index(expires, 7); + } else if ((long) delta < 0) { + idx = clk & LVL_MASK; } else { - int i; - /* If the timeout is larger than MAX_TVAL (on 64-bit - * architectures or with CONFIG_BASE_SMALL=1) then we - * use the maximum timeout. + /* + * Force expire obscene large timeouts to expire at the + * capacity limit of the wheel. */ - if (idx > MAX_TVAL) { - idx = MAX_TVAL; - expires = idx + base->timer_jiffies; - } - i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = base->tv5.vec + i; + if (expires >= WHEEL_TIMEOUT_CUTOFF) + expires = WHEEL_TIMEOUT_MAX; + + idx = calc_index(expires, LVL_DEPTH - 1); } + return idx; +} - hlist_add_head(&timer->entry, vec); +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap and store the index in the timer flags. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, + unsigned int idx) +{ + hlist_add_head(&timer->entry, base->vectors + idx); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); } -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static void +__internal_add_timer(struct timer_base *base, struct timer_list *timer) { - /* Advance base->jiffies, if the base is empty */ - if (!base->all_timers++) - base->timer_jiffies = jiffies; + unsigned int idx; + + idx = calc_wheel_index(timer->expires, base->clk); + enqueue_timer(base, timer, idx); +} + +static void +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) +{ + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + return; - __internal_add_timer(base, timer); /* - * Update base->active_timers and base->next_timer + * TODO: This wants some optimizing similar to the code below, but we + * will do that when we switch from push to pull for deferrable timers. */ - if (!(timer->flags & TIMER_DEFERRABLE)) { - if (!base->active_timers++ || - time_before(timer->expires, base->next_timer)) - base->next_timer = timer->expires; + if (timer->flags & TIMER_DEFERRABLE) { + if (tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); + return; } /* - * Check whether the other CPU is in dynticks mode and needs - * to be triggered to reevaluate the timer wheel. - * We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to stop its tick can not - * evaluate the timer wheel. - * - * Spare the IPI for deferrable timers on idle targets though. - * The next busy ticks will take care of it. Except full dynticks - * require special care against races with idle_cpu(), lets deal - * with that later. + * We might have to IPI the remote CPU if the base is idle and the + * timer is not deferrable. If the other CPU is on the way to idle + * then it can't set base->is_idle as we hold the base lock: */ - if (base->nohz_active) { - if (!(timer->flags & TIMER_DEFERRABLE) || - tick_nohz_full_cpu(base->cpu)) - wake_up_nohz_cpu(base->cpu); - } + if (!base->is_idle) + return; + + /* Check whether this is the new first expiring timer: */ + if (time_after_eq(timer->expires, base->next_expiry)) + return; + + /* + * Set the next expiry time and kick the CPU so it can reevaluate the + * wheel: + */ + base->next_expiry = timer->expires; + wake_up_nohz_cpu(base->cpu); +} + +static void +internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ + __internal_add_timer(base, timer); + trigger_dyntick_cpu(base, timer); } #ifdef CONFIG_TIMER_STATS @@ -489,11 +612,19 @@ static void *timer_debug_hint(void *addr) return ((struct timer_list *) addr)->function; } +static bool timer_is_static_object(void *addr) +{ + struct timer_list *timer = addr; + + return (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC); +} + /* * fixup_init is called when: * - an active object is initialized */ -static int timer_fixup_init(void *addr, enum debug_obj_state state) +static bool timer_fixup_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -501,9 +632,9 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_init(timer, &timer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -516,36 +647,22 @@ static void stub_timer(unsigned long data) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ -static int timer_fixup_activate(void *addr, enum debug_obj_state state) +static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (timer->entry.pprev == NULL && - timer->entry.next == TIMER_ENTRY_STATIC) { - debug_object_init(timer, &timer_debug_descr); - debug_object_activate(timer, &timer_debug_descr); - return 0; - } else { - setup_timer(timer, stub_timer, 0); - return 1; - } - return 0; + setup_timer(timer, stub_timer, 0); + return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -553,7 +670,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int timer_fixup_free(void *addr, enum debug_obj_state state) +static bool timer_fixup_free(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -561,9 +678,9 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_free(timer, &timer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -571,32 +688,23 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) * fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ -static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) +static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.next == TIMER_ENTRY_STATIC) { - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - debug_object_init(timer, &timer_debug_descr); - return 0; - } else { - setup_timer(timer, stub_timer, 0); - return 1; - } + setup_timer(timer, stub_timer, 0); + return true; default: - return 0; + return false; } } static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, + .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, @@ -681,7 +789,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, { timer->entry.pprev = NULL; timer->flags = flags | raw_smp_processor_id(); - timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; timer->start_pid = -1; @@ -721,54 +828,125 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending) entry->next = LIST_POISON2; } -static inline void -detach_expired_timer(struct timer_list *timer, struct tvec_base *base) -{ - detach_timer(timer, true); - if (!(timer->flags & TIMER_DEFERRABLE)) - base->active_timers--; - base->all_timers--; -} - -static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, +static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { + unsigned idx = timer_get_idx(timer); + if (!timer_pending(timer)) return 0; + if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) + __clear_bit(idx, base->pending_map); + detach_timer(timer, clear_pending); - if (!(timer->flags & TIMER_DEFERRABLE)) { - base->active_timers--; - if (timer->expires == base->next_timer) - base->next_timer = base->timer_jiffies; - } - /* If this was the last timer, advance base->jiffies */ - if (!--base->all_timers) - base->timer_jiffies = jiffies; return 1; } +static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) +{ + struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); + + /* + * If the timer is deferrable and nohz is active then we need to use + * the deferrable base. + */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && + (tflags & TIMER_DEFERRABLE)) + base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); + return base; +} + +static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * If the timer is deferrable and nohz is active then we need to use + * the deferrable base. + */ + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && + (tflags & TIMER_DEFERRABLE)) + base = this_cpu_ptr(&timer_bases[BASE_DEF]); + return base; +} + +static inline struct timer_base *get_timer_base(u32 tflags) +{ + return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); +} + +#ifdef CONFIG_NO_HZ_COMMON +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) +{ +#ifdef CONFIG_SMP + if ((tflags & TIMER_PINNED) || !base->migration_enabled) + return get_timer_this_cpu_base(tflags); + return get_timer_cpu_base(tflags, get_nohz_timer_target()); +#else + return get_timer_this_cpu_base(tflags); +#endif +} + +static inline void forward_timer_base(struct timer_base *base) +{ + /* + * We only forward the base when it's idle and we have a delta between + * base clock and jiffies. + */ + if (!base->is_idle || (long) (jiffies - base->clk) < 2) + return; + + /* + * If the next expiry value is > jiffies, then we fast forward to + * jiffies otherwise we forward to the next expiry value. + */ + if (time_after(base->next_expiry, jiffies)) + base->clk = jiffies; + else + base->clk = base->next_expiry; +} +#else +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) +{ + return get_timer_this_cpu_base(tflags); +} + +static inline void forward_timer_base(struct timer_base *base) { } +#endif + +static inline struct timer_base * +get_target_base(struct timer_base *base, unsigned tflags) +{ + struct timer_base *target = __get_target_base(base, tflags); + + forward_timer_base(target); + return target; +} + /* - * We are using hashed locking: holding per_cpu(tvec_bases).lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. + * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means + * that all timers which are tied to this base are locked, and the base itself + * is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could - * be found on ->tvX lists. + * be found in the base->vectors array. * - * When the timer's base is locked and removed from the list, the - * TIMER_MIGRATING flag is set, FIXME + * When a timer is migrating then the TIMER_MIGRATING flag is set and we need + * to wait until the migration is done. */ -static struct tvec_base *lock_timer_base(struct timer_list *timer, - unsigned long *flags) +static struct timer_base *lock_timer_base(struct timer_list *timer, + unsigned long *flags) __acquires(timer->base->lock) { for (;;) { + struct timer_base *base; u32 tf = timer->flags; - struct tvec_base *base; if (!(tf & TIMER_MIGRATING)) { - base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); + base = get_timer_base(tf); spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; @@ -779,13 +957,41 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { - struct tvec_base *base, *new_base; - unsigned long flags; + struct timer_base *base, *new_base; + unsigned int idx = UINT_MAX; + unsigned long clk = 0, flags; int ret = 0; + /* + * This is a common optimization triggered by the networking code - if + * the timer is re-modified to have the same timeout or ends up in the + * same array bucket then just return: + */ + if (timer_pending(timer)) { + if (timer->expires == expires) + return 1; + /* + * Take the current timer_jiffies of base, but without holding + * the lock! + */ + base = get_timer_base(timer->flags); + clk = base->clk; + + idx = calc_wheel_index(expires, clk); + + /* + * Retrieve and compare the array index of the pending + * timer. If it matches set the expiry to the new value so a + * subsequent call will exit in the expires check above. + */ + if (idx == timer_get_idx(timer)) { + timer->expires = expires; + return 1; + } + } + timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -797,15 +1003,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - new_base = get_target_base(base, pinned); + new_base = get_target_base(base, timer->flags); if (base != new_base) { /* - * We are trying to schedule the timer on the local CPU. + * We are trying to schedule the timer on the new base. * However we can't change timer's base while it is running, * otherwise del_timer_sync() can't detect that the timer's - * handler yet has not finished. This also guarantees that - * the timer is serialized wrt itself. + * handler yet has not finished. This also guarantees that the + * timer is serialized wrt itself. */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ @@ -820,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; - internal_add_timer(base, timer); + /* + * If 'idx' was calculated above and the base time did not advance + * between calculating 'idx' and taking the lock, only enqueue_timer() + * and trigger_dyntick_cpu() is required. Otherwise we need to + * (re)calculate the wheel index via internal_add_timer(). + */ + if (idx != UINT_MAX && clk == base->clk) { + enqueue_timer(base, timer, idx); + trigger_dyntick_cpu(base, timer); + } else { + internal_add_timer(base, timer); + } out_unlock: spin_unlock_irqrestore(&base->lock, flags); @@ -840,49 +1057,10 @@ out_unlock: */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); + return __mod_timer(timer, expires, true); } EXPORT_SYMBOL(mod_timer_pending); -/* - * Decide where to put the timer while taking the slack into account - * - * Algorithm: - * 1) calculate the maximum (absolute) time - * 2) calculate the highest bit where the expires and new max are different - * 3) use this bit to make a mask - * 4) use the bitmask to round down the maximum time, so that all last - * bits are zeros - */ -static inline -unsigned long apply_slack(struct timer_list *timer, unsigned long expires) -{ - unsigned long expires_limit, mask; - int bit; - - if (timer->slack >= 0) { - expires_limit = expires + timer->slack; - } else { - long delta = expires - jiffies; - - if (delta < 256) - return expires; - - expires_limit = expires + delta / 256; - } - mask = expires ^ expires_limit; - if (mask == 0) - return expires; - - bit = __fls(mask); - - mask = (1UL << bit) - 1; - - expires_limit = expires_limit & ~(mask); - - return expires_limit; -} - /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -905,49 +1083,11 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) */ int mod_timer(struct timer_list *timer, unsigned long expires) { - expires = apply_slack(timer, expires); - - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: - */ - if (timer_pending(timer) && timer->expires == expires) - return 1; - - return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); + return __mod_timer(timer, expires, false); } EXPORT_SYMBOL(mod_timer); /** - * mod_timer_pinned - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer_pinned() is a way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * and to ensure that the timer is scheduled on the current CPU. - * - * Note that this does not prevent the timer from being migrated - * when the current CPU goes offline. If this is a problem for - * you, use CPU-hotplug notifiers to handle it correctly, for - * example, cancelling the timer when the corresponding CPU goes - * offline. - * - * mod_timer_pinned(timer, expires) is equivalent to: - * - * del_timer(timer); timer->expires = expires; add_timer(timer); - */ -int mod_timer_pinned(struct timer_list *timer, unsigned long expires) -{ - if (timer->expires == expires && timer_pending(timer)) - return 1; - - return __mod_timer(timer, expires, false, TIMER_PINNED); -} -EXPORT_SYMBOL(mod_timer_pinned); - -/** * add_timer - start a timer * @timer: the timer to be added * @@ -977,13 +1117,14 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); - struct tvec_base *base; + struct timer_base *new_base, *base; unsigned long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); + new_base = get_timer_cpu_base(timer->flags, cpu); + /* * If @timer was on a different CPU, it should be migrated with the * old base locked to prevent other operations proceeding with the @@ -1019,7 +1160,7 @@ EXPORT_SYMBOL_GPL(add_timer_on); */ int del_timer(struct timer_list *timer) { - struct tvec_base *base; + struct timer_base *base; unsigned long flags; int ret = 0; @@ -1045,7 +1186,7 @@ EXPORT_SYMBOL(del_timer); */ int try_to_del_timer_sync(struct timer_list *timer) { - struct tvec_base *base; + struct timer_base *base; unsigned long flags; int ret = -1; @@ -1129,27 +1270,6 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static int cascade(struct tvec_base *base, struct tvec *tv, int index) -{ - /* cascade all the timers from tv up one level */ - struct timer_list *timer; - struct hlist_node *tmp; - struct hlist_head tv_list; - - hlist_move_list(tv->vec + index, &tv_list); - - /* - * We are removing _all_ timers from the list, so we - * don't have to detach them individually. - */ - hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { - /* No accounting, while moving them */ - __internal_add_timer(base, timer); - } - - return index; -} - static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), unsigned long data) { @@ -1193,147 +1313,141 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), } } -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) - -/** - * __run_timers - run all expired timers (if any) on this CPU. - * @base: the timer vector to be processed. - * - * This function cascades all vectors and executes all expired timer - * vectors. - */ -static inline void __run_timers(struct tvec_base *base) +static void expire_timers(struct timer_base *base, struct hlist_head *head) { - struct timer_list *timer; + while (!hlist_empty(head)) { + struct timer_list *timer; + void (*fn)(unsigned long); + unsigned long data; - spin_lock_irq(&base->lock); + timer = hlist_entry(head->first, struct timer_list, entry); + timer_stats_account_timer(timer); - while (time_after_eq(jiffies, base->timer_jiffies)) { - struct hlist_head work_list; - struct hlist_head *head = &work_list; - int index; + base->running_timer = timer; + detach_timer(timer, true); - if (!base->all_timers) { - base->timer_jiffies = jiffies; - break; + fn = timer->function; + data = timer->data; + + if (timer->flags & TIMER_IRQSAFE) { + spin_unlock(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock(&base->lock); + } else { + spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock_irq(&base->lock); } + } +} - index = base->timer_jiffies & TVR_MASK; +static int __collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + unsigned long clk = base->clk; + struct hlist_head *vec; + int i, levels = 0; + unsigned int idx; - /* - * Cascade timers: - */ - if (!index && - (!cascade(base, &base->tv2, INDEX(0))) && - (!cascade(base, &base->tv3, INDEX(1))) && - !cascade(base, &base->tv4, INDEX(2))) - cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - hlist_move_list(base->tv1.vec + index, head); - while (!hlist_empty(head)) { - void (*fn)(unsigned long); - unsigned long data; - bool irqsafe; - - timer = hlist_entry(head->first, struct timer_list, entry); - fn = timer->function; - data = timer->data; - irqsafe = timer->flags & TIMER_IRQSAFE; - - timer_stats_account_timer(timer); - - base->running_timer = timer; - detach_expired_timer(timer, base); - - if (irqsafe) { - spin_unlock(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock(&base->lock); - } else { - spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); - spin_lock_irq(&base->lock); - } + for (i = 0; i < LVL_DEPTH; i++) { + idx = (clk & LVL_MASK) + i * LVL_SIZE; + + if (__test_and_clear_bit(idx, base->pending_map)) { + vec = base->vectors + idx; + hlist_move_list(vec, heads++); + levels++; } + /* Is it time to look at the next level? */ + if (clk & LVL_CLK_MASK) + break; + /* Shift clock for the next level granularity */ + clk >>= LVL_CLK_SHIFT; } - base->running_timer = NULL; - spin_unlock_irq(&base->lock); + return levels; } #ifdef CONFIG_NO_HZ_COMMON /* - * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a CPU is idle. - * This function needs to be called with interrupts disabled. + * Find the next pending bucket of a level. Search from level start (@offset) + * + @clk upwards and if nothing there, search from start of the level + * (@offset) up to @offset + clk. */ -static unsigned long __next_timer_interrupt(struct tvec_base *base) -{ - unsigned long timer_jiffies = base->timer_jiffies; - unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; - int index, slot, array, found = 0; - struct timer_list *nte; - struct tvec *varray[4]; - - /* Look for timer events in tv1. */ - index = slot = timer_jiffies & TVR_MASK; - do { - hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (nte->flags & TIMER_DEFERRABLE) - continue; - - found = 1; - expires = nte->expires; - /* Look at the cascade bucket(s)? */ - if (!index || slot < index) - goto cascade; - return expires; +static int next_pending_bucket(struct timer_base *base, unsigned offset, + unsigned clk) +{ + unsigned pos, start = offset + clk; + unsigned end = offset + LVL_SIZE; + + pos = find_next_bit(base->pending_map, end, start); + if (pos < end) + return pos - start; + + pos = find_next_bit(base->pending_map, start, offset); + return pos < start ? pos + LVL_SIZE - start : -1; +} + +/* + * Search the first expiring timer in the various clock levels. Caller must + * hold base->lock. + */ +static unsigned long __next_timer_interrupt(struct timer_base *base) +{ + unsigned long clk, next, adj; + unsigned lvl, offset = 0; + + next = base->clk + NEXT_TIMER_MAX_DELTA; + clk = base->clk; + for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { + int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + + if (pos >= 0) { + unsigned long tmp = clk + (unsigned long) pos; + + tmp <<= LVL_SHIFT(lvl); + if (time_before(tmp, next)) + next = tmp; } - slot = (slot + 1) & TVR_MASK; - } while (slot != index); - -cascade: - /* Calculate the next cascade event */ - if (index) - timer_jiffies += TVR_SIZE - index; - timer_jiffies >>= TVR_BITS; - - /* Check tv2-tv5. */ - varray[0] = &base->tv2; - varray[1] = &base->tv3; - varray[2] = &base->tv4; - varray[3] = &base->tv5; - - for (array = 0; array < 4; array++) { - struct tvec *varp = varray[array]; - - index = slot = timer_jiffies & TVN_MASK; - do { - hlist_for_each_entry(nte, varp->vec + slot, entry) { - if (nte->flags & TIMER_DEFERRABLE) - continue; - - found = 1; - if (time_before(nte->expires, expires)) - expires = nte->expires; - } - /* - * Do we still search for the first timer or are - * we looking up the cascade buckets ? - */ - if (found) { - /* Look at the cascade bucket(s)? */ - if (!index || slot < index) - break; - return expires; - } - slot = (slot + 1) & TVN_MASK; - } while (slot != index); - - if (index) - timer_jiffies += TVN_SIZE - index; - timer_jiffies >>= TVN_BITS; + /* + * Clock for the next level. If the current level clock lower + * bits are zero, we look at the next level as is. If not we + * need to advance it by one because that's going to be the + * next expiring bucket in that level. base->clk is the next + * expiring jiffie. So in case of: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 0 + * + * we have to look at all levels @index 0. With + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 0 2 + * + * LVL0 has the next expiring bucket @index 2. The upper + * levels have the next expiring bucket @index 1. + * + * In case that the propagation wraps the next level the same + * rules apply: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 + * 0 0 0 0 F 2 + * + * So after looking at LVL0 we get: + * + * LVL5 LVL4 LVL3 LVL2 LVL1 + * 0 0 0 1 0 + * + * So no propagation from LVL1 to LVL2 because that happened + * with the add already, but then we need to propagate further + * from LVL2 to LVL3. + * + * So the simple check whether the lower bits of the current + * level are 0 or not is sufficient for all cases. + */ + adj = clk & LVL_CLK_MASK ? 1 : 0; + clk >>= LVL_CLK_SHIFT; + clk += adj; } - return expires; + return next; } /* @@ -1379,9 +1493,10 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { - struct tvec_base *base = this_cpu_ptr(&tvec_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); u64 expires = KTIME_MAX; unsigned long nextevt; + bool is_max_delta; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1391,19 +1506,82 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) return expires; spin_lock(&base->lock); - if (base->active_timers) { - if (time_before_eq(base->next_timer, base->timer_jiffies)) - base->next_timer = __next_timer_interrupt(base); - nextevt = base->next_timer; - if (time_before_eq(nextevt, basej)) - expires = basem; - else + nextevt = __next_timer_interrupt(base); + is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); + base->next_expiry = nextevt; + /* + * We have a fresh next event. Check whether we can forward the base: + */ + if (time_after(nextevt, jiffies)) + base->clk = jiffies; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; + + if (time_before_eq(nextevt, basej)) { + expires = basem; + base->is_idle = false; + } else { + if (!is_max_delta) expires = basem + (nextevt - basej) * TICK_NSEC; + /* + * If we expect to sleep more than a tick, mark the base idle: + */ + if ((expires - basem) > TICK_NSEC) + base->is_idle = true; } spin_unlock(&base->lock); return cmp_next_hrtimer_event(basem, expires); } + +/** + * timer_clear_idle - Clear the idle state of the timer base + * + * Called with interrupts disabled + */ +void timer_clear_idle(void) +{ + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + + /* + * We do this unlocked. The worst outcome is a remote enqueue sending + * a pointless IPI, but taking the lock would just make the window for + * sending the IPI a few instructions smaller for the cost of taking + * the lock in the exit from idle path. + */ + base->is_idle = false; +} + +static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + /* + * NOHZ optimization. After a long idle sleep we need to forward the + * base to current jiffies. Avoid a loop by searching the bitfield for + * the next expiring timer. + */ + if ((long)(jiffies - base->clk) > 2) { + unsigned long next = __next_timer_interrupt(base); + + /* + * If the next timer is ahead of time forward to current + * jiffies, otherwise forward to the next expiry time: + */ + if (time_after(next, jiffies)) { + /* The call site will increment clock! */ + base->clk = jiffies - 1; + return 0; + } + base->clk = next; + } + return __collect_expired_timers(base, heads); +} +#else +static inline int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) +{ + return __collect_expired_timers(base, heads); +} #endif /* @@ -1426,15 +1604,42 @@ void update_process_times(int user_tick) run_posix_cpu_timers(p); } +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + */ +static inline void __run_timers(struct timer_base *base) +{ + struct hlist_head heads[LVL_DEPTH]; + int levels; + + if (!time_after_eq(jiffies, base->clk)) + return; + + spin_lock_irq(&base->lock); + + while (time_after_eq(jiffies, base->clk)) { + + levels = collect_expired_timers(base, heads); + base->clk++; + + while (levels--) + expire_timers(base, heads + levels); + } + base->running_timer = NULL; + spin_unlock_irq(&base->lock); +} + /* * This function runs timers and the timer-tq in bottom half context. */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = this_cpu_ptr(&tvec_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); + __run_timers(base); + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) + __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); } /* @@ -1442,7 +1647,18 @@ static void run_timer_softirq(struct softirq_action *h) */ void run_local_timers(void) { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + hrtimer_run_queues(); + /* Raise the softirq only if required. */ + if (time_before(jiffies, base->clk)) { + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + return; + /* CPU is awake, so check the deferrable base. */ + base++; + if (time_before(jiffies, base->clk)) + return; + } raise_softirq(TIMER_SOFTIRQ); } @@ -1527,7 +1743,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); + __mod_timer(&timer, expire, false); schedule(); del_singleshot_timer_sync(&timer); @@ -1578,87 +1794,62 @@ signed long __sched schedule_timeout_idle(signed long timeout) EXPORT_SYMBOL(schedule_timeout_idle); #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) +static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { struct timer_list *timer; int cpu = new_base->cpu; while (!hlist_empty(head)) { timer = hlist_entry(head->first, struct timer_list, entry); - /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } -static void migrate_timers(int cpu) +int timers_dead_cpu(unsigned int cpu) { - struct tvec_base *old_base; - struct tvec_base *new_base; - int i; + struct timer_base *old_base; + struct timer_base *new_base; + int b, i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu_ptr(&tvec_bases, cpu); - new_base = get_cpu_ptr(&tvec_bases); - /* - * The caller is globally serialized and nobody else - * takes two locks at once, deadlock is not possible. - */ - spin_lock_irq(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - - BUG_ON(old_base->running_timer); - - for (i = 0; i < TVR_SIZE; i++) - migrate_timer_list(new_base, old_base->tv1.vec + i); - for (i = 0; i < TVN_SIZE; i++) { - migrate_timer_list(new_base, old_base->tv2.vec + i); - migrate_timer_list(new_base, old_base->tv3.vec + i); - migrate_timer_list(new_base, old_base->tv4.vec + i); - migrate_timer_list(new_base, old_base->tv5.vec + i); - } - old_base->active_timers = 0; - old_base->all_timers = 0; + for (b = 0; b < NR_BASES; b++) { + old_base = per_cpu_ptr(&timer_bases[b], cpu); + new_base = get_cpu_ptr(&timer_bases[b]); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); - put_cpu_ptr(&tvec_bases); -} + BUG_ON(old_base->running_timer); -static int timer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_timers((long)hcpu); - break; - default: - break; - } + for (i = 0; i < WHEEL_SIZE; i++) + migrate_timer_list(new_base, old_base->vectors + i); - return NOTIFY_OK; + spin_unlock(&old_base->lock); + spin_unlock_irq(&new_base->lock); + put_cpu_ptr(&timer_bases); + } + return 0; } -static inline void timer_register_cpu_notifier(void) -{ - cpu_notifier(timer_cpu_notify, 0); -} -#else -static inline void timer_register_cpu_notifier(void) { } #endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(int cpu) { - struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); - - base->cpu = cpu; - spin_lock_init(&base->lock); + struct timer_base *base; + int i; - base->timer_jiffies = jiffies; - base->next_timer = base->timer_jiffies; + for (i = 0; i < NR_BASES; i++) { + base = per_cpu_ptr(&timer_bases[i], cpu); + base->cpu = cpu; + spin_lock_init(&base->lock); + base->clk = jiffies; + } } static void __init init_timer_cpus(void) @@ -1673,7 +1864,6 @@ void __init init_timers(void) { init_timer_cpus(); init_timer_stats(); - timer_register_cpu_notifier(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } @@ -1717,9 +1907,15 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) } /** - * usleep_range - Drop in replacement for udelay where wakeup is flexible + * usleep_range - Sleep for an approximate time * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep + * + * In non-atomic context where the exact wakeup time is flexible, use + * usleep_range() instead of udelay(). The sleep improves responsiveness + * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces + * power usage by allowing hrtimers to take advantage of an already- + * scheduled interrupt instead of scheduling a new one just for this sleep. */ void __sched usleep_range(unsigned long min, unsigned long max) { diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1adecb4b87c8..087204c733eb 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -279,7 +279,7 @@ static void print_name_offset(struct seq_file *m, unsigned long addr) static int tstats_show(struct seq_file *m, void *v) { - struct timespec period; + struct timespec64 period; struct entry *entry; unsigned long ms; long events = 0; @@ -295,11 +295,11 @@ static int tstats_show(struct seq_file *m, void *v) time = ktime_sub(time_stop, time_start); - period = ktime_to_timespec(time); + period = ktime_to_timespec64(time); ms = period.tv_nsec / 1000000; seq_puts(m, "Timer Stats Version: v0.3\n"); - seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); + seq_printf(m, "Sample period: %ld.%03ld s\n", (long)period.tv_sec, ms); if (atomic_read(&overflow_count)) seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); diff --git a/kernel/torture.c b/kernel/torture.c index 44aa462d033f..75961b3decfe 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -82,6 +82,104 @@ static int min_online = -1; static int max_online; /* + * Attempt to take a CPU offline. Return false if the CPU is already + * offline or if it is not subject to CPU-hotplug operations. The + * caller can detect other failures by looking at the statistics. + */ +bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, + unsigned long *sum_offl, int *min_offl, int *max_offl) +{ + unsigned long delta; + int ret; + unsigned long starttime; + + if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) + return false; + + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + (*n_offl_attempts)++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlined %d\n", + torture_type, cpu); + (*n_offl_successes)++; + delta = jiffies - starttime; + sum_offl += delta; + if (*min_offl < 0) { + *min_offl = delta; + *max_offl = delta; + } + if (*min_offl > delta) + *min_offl = delta; + if (*max_offl < delta) + *max_offl = delta; + } + + return true; +} +EXPORT_SYMBOL_GPL(torture_offline); + +/* + * Attempt to bring a CPU online. Return false if the CPU is already + * online or if it is not subject to CPU-hotplug operations. The + * caller can detect other failures by looking at the statistics. + */ +bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_onl, int *min_onl, int *max_onl) +{ + unsigned long delta; + int ret; + unsigned long starttime; + + if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) + return false; + + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + (*n_onl_attempts)++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlined %d\n", + torture_type, cpu); + (*n_onl_successes)++; + delta = jiffies - starttime; + *sum_onl += delta; + if (*min_onl < 0) { + *min_onl = delta; + *max_onl = delta; + } + if (*min_onl > delta) + *min_onl = delta; + if (*max_onl < delta) + *max_onl = delta; + } + + return true; +} +EXPORT_SYMBOL_GPL(torture_online); + +/* * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ @@ -89,16 +187,19 @@ static int torture_onoff(void *arg) { int cpu; - unsigned long delta; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); - int ret; - unsigned long starttime; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); + + if (maxcpu == 0) { + VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); + goto stop; + } + if (onoff_holdoff > 0) { VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); schedule_timeout_interruptible(onoff_holdoff); @@ -106,69 +207,16 @@ torture_onoff(void *arg) } while (!torture_must_stop()) { cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } + if (!torture_offline(cpu, + &n_offline_attempts, &n_offline_successes, + &sum_offline, &min_offline, &max_offline)) + torture_online(cpu, + &n_online_attempts, &n_online_successes, + &sum_online, &min_online, &max_online); schedule_timeout_interruptible(onoff_interval); } + +stop: torture_kthread_stopping("torture_onoff"); return 0; } @@ -451,6 +499,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); + ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } @@ -602,8 +651,9 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { - pr_alert("torture_init_begin: refusing %s init: %s running", + pr_alert("torture_init_begin: Refusing %s init: %s running.\n", ttype, torture_type); + pr_alert("torture_init_begin: One torture test at a time!\n"); mutex_unlock(&fullstop_mutex); return false; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e45db6b0d878..f4b86e8ca1e7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -528,6 +528,33 @@ config MMIOTRACE See Documentation/trace/mmiotrace.txt. If you are not helping to develop drivers, say N. +config TRACING_MAP + bool + depends on ARCH_HAVE_NMI_SAFE_CMPXCHG + help + tracing_map is a special-purpose lock-free map for tracing, + separated out as a stand-alone facility in order to allow it + to be shared between multiple tracers. It isn't meant to be + generally used outside of that context, and is normally + selected by tracers that use it. + +config HIST_TRIGGERS + bool "Histogram triggers" + depends on ARCH_HAVE_NMI_SAFE_CMPXCHG + select TRACING_MAP + select TRACING + default n + help + Hist triggers allow one or more arbitrary trace event fields + to be aggregated into hash tables and dumped to stdout by + reading a debugfs/tracefs file. They're useful for + gathering quick and dirty (though precise) summaries of + event activity as an initial guide for further investigation + using more advanced tools. + + See Documentation/trace/events.txt. + If in doubt, say N. + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 9b1044e936a6..d0a1617b52b4 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,4 +1,8 @@ +# We are fully aware of the dangers of __builtin_return_address() +FRAME_CFLAGS := $(call cc-disable-warning,frame-address) +KBUILD_CFLAGS += $(FRAME_CFLAGS) + # Do not instrument the tracer itself: ifdef CONFIG_FUNCTION_TRACER @@ -31,6 +35,7 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o @@ -53,6 +58,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f94e7a21f52d..dbafc5df03f3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk) static void trace_note_time(struct blk_trace *bt) { - struct timespec now; + struct timespec64 now; unsigned long flags; u32 words[2]; - getnstimeofday(&now); - words[0] = now.tv_sec; + /* need to check user space to see if this breaks in y2038 or y2106 */ + ktime_get_real_ts64(&now); + words[0] = (u32)now.tv_sec; words[1] = now.tv_nsec; local_irq_save(flags); @@ -189,6 +190,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; #define BLK_TC_RAHEAD BLK_TC_AHEAD +#define BLK_TC_PREFLUSH BLK_TC_FLUSH /* The ilog2() calls fall out because they're constant */ #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ @@ -199,7 +201,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int rw, u32 what, int error, int pdu_len, void *pdu_data) + int op, int op_flags, u32 what, int error, int pdu_len, + void *pdu_data) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -214,13 +217,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; - what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, SYNC); - what |= MASK_TC_BIT(rw, RAHEAD); - what |= MASK_TC_BIT(rw, META); - what |= MASK_TC_BIT(rw, DISCARD); - what |= MASK_TC_BIT(rw, FLUSH); - what |= MASK_TC_BIT(rw, FUA); + what |= ddir_act[op_is_write(op) ? WRITE : READ]; + what |= MASK_TC_BIT(op_flags, SYNC); + what |= MASK_TC_BIT(op_flags, RAHEAD); + what |= MASK_TC_BIT(op_flags, META); + what |= MASK_TC_BIT(op_flags, PREFLUSH); + what |= MASK_TC_BIT(op_flags, FUA); + if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) + what |= BLK_TC_ACT(BLK_TC_DISCARD); + if (op == REQ_OP_FLUSH) + what |= BLK_TC_ACT(BLK_TC_FLUSH); pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -708,11 +714,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, + __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, + __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq), rq->cmd_flags, what, rq->errors, 0, NULL); } } @@ -770,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, what, error, 0, NULL); + bio_op(bio), bio->bi_opf, what, error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, @@ -818,7 +824,8 @@ static void blk_add_trace_getrq(void *ignore, struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, + NULL); } } @@ -833,7 +840,7 @@ static void blk_add_trace_sleeprq(void *ignore, struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, + __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, 0, 0, NULL); } } @@ -843,7 +850,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -860,7 +867,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); } } @@ -874,8 +881,9 @@ static void blk_add_trace_split(void *ignore, __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, - bio->bi_error, sizeof(rpdu), &rpdu); + bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, + BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), + &rpdu); } } @@ -907,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio->bi_rw, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, sizeof(r), &r); } @@ -940,7 +948,7 @@ static void blk_add_trace_rq_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, sizeof(r), &r); } @@ -965,10 +973,10 @@ void blk_add_driver_data(struct request_queue *q, return; if (rq->cmd_type == REQ_TYPE_BLOCK_PC) - __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, + __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, rq->errors, len, data); else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1349,6 +1357,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, if (t->action == BLK_TN_MESSAGE) { log_action(iter, long_act ? "message" : "m"); blk_log_msg(s, iter->ent); + return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) @@ -1551,6 +1560,7 @@ static const struct { { BLK_TC_COMPLETE, "complete" }, { BLK_TC_FS, "fs" }, { BLK_TC_PC, "pc" }, + { BLK_TC_NOTIFY, "notify" }, { BLK_TC_AHEAD, "ahead" }, { BLK_TC_META, "meta" }, { BLK_TC_DISCARD, "discard" }, @@ -1767,21 +1777,34 @@ void blk_dump_cmd(char *buf, struct request *rq) } } -void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) { int i = 0; - if (rw & REQ_FLUSH) + if (rw & REQ_PREFLUSH) rwbs[i++] = 'F'; - if (rw & WRITE) + switch (op) { + case REQ_OP_WRITE: + case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; - else if (rw & REQ_DISCARD) + break; + case REQ_OP_DISCARD: + rwbs[i++] = 'D'; + break; + case REQ_OP_SECURE_ERASE: rwbs[i++] = 'D'; - else if (bytes) + rwbs[i++] = 'E'; + break; + case REQ_OP_FLUSH: + rwbs[i++] = 'F'; + break; + case REQ_OP_READ: rwbs[i++] = 'R'; - else + break; + default: rwbs[i++] = 'N'; + } if (rw & REQ_FUA) rwbs[i++] = 'F'; @@ -1791,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) rwbs[i++] = 'S'; if (rw & REQ_META) rwbs[i++] = 'M'; - if (rw & REQ_SECURE) - rwbs[i++] = 'E'; rwbs[i] = '\0'; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3e4ffb3ace5f..b20438fdb029 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -62,21 +62,68 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { void *dst = (void *) (long) r1; - int size = (int) r2; + int ret, size = (int) r2; void *unsafe_ptr = (void *) (long) r3; - return probe_kernel_read(dst, unsafe_ptr, size); + ret = probe_kernel_read(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; } static const struct bpf_func_proto bpf_probe_read_proto = { .func = bpf_probe_read, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, + .arg1_type = ARG_PTR_TO_RAW_STACK, .arg2_type = ARG_CONST_STACK_SIZE, .arg3_type = ARG_ANYTHING, }; +static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *unsafe_ptr = (void *) (long) r1; + void *src = (void *) (long) r2; + int size = (int) r3; + + /* + * Ensure we're in user context which is safe for the helper to + * run. This helper has no business in a kthread. + * + * access_ok() should prevent writing to non-user memory, but in + * some situations (nommu, temporary switch, etc) access_ok() does + * not provide enough validation, hence the check on KERNEL_DS. + */ + + if (unlikely(in_interrupt() || + current->flags & (PF_KTHREAD | PF_EXITING))) + return -EPERM; + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + return -EPERM; + if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) + return -EPERM; + + return probe_kernel_write(unsafe_ptr, src, size); +} + +static const struct bpf_func_proto bpf_probe_write_user_proto = { + .func = bpf_probe_write_user, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto *bpf_get_probe_write_proto(void) +{ + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", + current->comm, task_pid_nr(current)); + + return &bpf_probe_write_user_proto; +} + /* * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed @@ -184,25 +231,33 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) { struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); + u64 index = flags & BPF_F_INDEX_MASK; + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + if (index == BPF_F_CURRENT_CPU) + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = (struct file *)array->ptrs[index]; - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (!ee) return -ENOENT; - event = file->private_data; + event = ee->event; + if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && + event->attr.type != PERF_TYPE_RAW)) + return -EINVAL; /* make sure event is local and doesn't have pmu::count */ - if (event->oncpu != smp_processor_id() || - event->pmu->count) + if (unlikely(event->oncpu != cpu || event->pmu->count)) return -EINVAL; /* @@ -221,42 +276,58 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +static __always_inline u64 +__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, + u64 flags, struct perf_raw_record *raw) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_array *array = container_of(map, struct bpf_array, map); - void *data = (void *) (long) r4; + unsigned int cpu = smp_processor_id(); + u64 index = flags & BPF_F_INDEX_MASK; struct perf_sample_data sample_data; + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; - struct perf_raw_record raw = { - .size = size, - .data = data, - }; + if (index == BPF_F_CURRENT_CPU) + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = (struct file *)array->ptrs[index]; - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (!ee) return -ENOENT; - event = file->private_data; - + event = ee->event; if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; - if (unlikely(event->oncpu != smp_processor_id())) + if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; perf_sample_data_init(&sample_data, 0, 0); - sample_data.raw = &raw; + sample_data.raw = raw; perf_event_output(event, &sample_data, regs); return 0; } +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +{ + struct pt_regs *regs = (struct pt_regs *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + void *data = (void *)(long) r4; + struct perf_raw_record raw = { + .frag = { + .size = size, + .data = data, + }, + }; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return __bpf_perf_event_output(regs, map, flags, &raw); +} + static const struct bpf_func_proto bpf_perf_event_output_proto = { .func = bpf_perf_event_output, .gpl_only = true, @@ -268,7 +339,44 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg5_type = ARG_CONST_STACK_SIZE, }; -static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); + +u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + struct perf_raw_frag frag = { + .copy = ctx_copy, + .size = ctx_size, + .data = ctx, + }; + struct perf_raw_record raw = { + .frag = { + { + .next = ctx_size ? &frag : NULL, + }, + .size = meta_size, + .data = meta, + }, + }; + + perf_fetch_caller_regs(regs); + + return __bpf_perf_event_output(regs, map, flags, &raw); +} + +static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return (long) current; +} + +static const struct bpf_func_proto bpf_get_current_task_proto = { + .func = bpf_get_current_task, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + +static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -285,6 +393,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: @@ -295,30 +405,35 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_smp_processor_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; + case BPF_FUNC_probe_write_user: + return bpf_get_probe_write_proto(); + default: + return NULL; + } +} + +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; default: - return NULL; + return tracing_func_proto(func_id); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ -static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) { - /* check bounds */ if (off < 0 || off >= sizeof(struct pt_regs)) return false; - - /* only read is allowed */ if (type != BPF_READ) return false; - - /* disallow misaligned access */ if (off % size != 0) return false; - return true; } @@ -332,9 +447,83 @@ static struct bpf_prog_type_list kprobe_tl = { .type = BPF_PROG_TYPE_KPROBE, }; +static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +{ + /* + * r1 points to perf tracepoint buffer where first 8 bytes are hidden + * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it + * from there and call the same bpf_perf_event_output() helper + */ + u64 ctx = *(long *)(uintptr_t)r1; + + return bpf_perf_event_output(ctx, r2, index, r4, size); +} + +static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { + .func = bpf_perf_event_output_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + +static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + u64 ctx = *(long *)(uintptr_t)r1; + + return bpf_get_stackid(ctx, r2, r3, r4, r5); +} + +static const struct bpf_func_proto bpf_get_stackid_proto_tp = { + .func = bpf_get_stackid_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_tp; + default: + return tracing_func_proto(func_id); + } +} + +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +static const struct bpf_verifier_ops tracepoint_prog_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = tp_prog_is_valid_access, +}; + +static struct bpf_prog_type_list tracepoint_tl = { + .ops = &tracepoint_prog_ops, + .type = BPF_PROG_TYPE_TRACEPOINT, +}; + static int __init register_kprobe_prog_ops(void) { bpf_register_prog_type(&kprobe_tl); + bpf_register_prog_type(&tracepoint_tl); return 0; } late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b1870fbd2b67..84752c8e28b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -89,16 +89,16 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* What to set function_trace_op to */ static struct ftrace_ops *set_function_trace_op; -/* List for set_ftrace_pid's pids. */ -LIST_HEAD(ftrace_pids); -struct ftrace_pid { - struct list_head list; - struct pid *pid; -}; - -static bool ftrace_pids_enabled(void) +static bool ftrace_pids_enabled(struct ftrace_ops *ops) { - return !list_empty(&ftrace_pids); + struct trace_array *tr; + + if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private) + return false; + + tr = ops->private; + + return tr->function_pids != NULL; } static void ftrace_update_trampoline(struct ftrace_ops *ops); @@ -179,7 +179,9 @@ int ftrace_nr_registered_ops(void) static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { - if (!test_tsk_trace_trace(current)) + struct trace_array *tr = op->private; + + if (tr && this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid)) return; op->saved_func(ip, parent_ip, op, regs); @@ -417,7 +419,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) /* Always save the function, and reset at unregistering */ ops->saved_func = ops->func; - if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) + if (ftrace_pids_enabled(ops)) ops->func = ftrace_pid_func; ftrace_update_trampoline(ops); @@ -450,7 +452,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) static void ftrace_update_pid_func(void) { - bool enabled = ftrace_pids_enabled(); struct ftrace_ops *op; /* Only do something if we are tracing something */ @@ -459,8 +460,8 @@ static void ftrace_update_pid_func(void) do_for_each_ftrace_op(op, ftrace_ops_list) { if (op->flags & FTRACE_OPS_FL_PID) { - op->func = enabled ? ftrace_pid_func : - op->saved_func; + op->func = ftrace_pids_enabled(op) ? + ftrace_pid_func : op->saved_func; ftrace_update_trampoline(op); } } while_for_each_ftrace_op(op); @@ -1530,7 +1531,19 @@ static int ftrace_cmp_recs(const void *a, const void *b) return 0; } -static unsigned long ftrace_location_range(unsigned long start, unsigned long end) +/** + * ftrace_location_range - return the first address of a traced location + * if it touches the given ip range + * @start: start of range to search. + * @end: end of range to search (inclusive). @end points to the last byte + * to check. + * + * Returns rec->ip if the related ftrace location is a least partly within + * the given address range. That is, the first address of the instruction + * that is either a NOP or call to the function tracer. It checks the ftrace + * internal tables to determine if the address belongs or not. + */ +unsigned long ftrace_location_range(unsigned long start, unsigned long end) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -3444,11 +3457,23 @@ struct ftrace_glob { int type; }; +/* + * If symbols in an architecture don't correspond exactly to the user-visible + * name of what they represent, it is possible to define this function to + * perform the necessary adjustments. +*/ +char * __weak arch_ftrace_match_adjust(char *str, const char *search) +{ + return str; +} + static int ftrace_match(char *str, struct ftrace_glob *g) { int matched = 0; int slen; + str = arch_ftrace_match_adjust(str, g->search); + switch (g->type) { case MATCH_FULL: if (strcmp(str, g->search) == 0) @@ -5300,179 +5325,99 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) return ops->func; } -static void clear_ftrace_swapper(void) +static void +ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, + struct task_struct *prev, struct task_struct *next) { - struct task_struct *p; - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - clear_tsk_trace_trace(p); - } - put_online_cpus(); -} + struct trace_array *tr = data; + struct trace_pid_list *pid_list; -static void set_ftrace_swapper(void) -{ - struct task_struct *p; - int cpu; + pid_list = rcu_dereference_sched(tr->function_pids); - get_online_cpus(); - for_each_online_cpu(cpu) { - p = idle_task(cpu); - set_tsk_trace_trace(p); - } - put_online_cpus(); + this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + trace_ignore_this_task(pid_list, next)); } -static void clear_ftrace_pid(struct pid *pid) +static void clear_ftrace_pids(struct trace_array *tr) { - struct task_struct *p; + struct trace_pid_list *pid_list; + int cpu; - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - clear_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); + pid_list = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + if (!pid_list) + return; - put_pid(pid); -} + unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); -static void set_ftrace_pid(struct pid *pid) -{ - struct task_struct *p; + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->trace_buffer.data, cpu)->ftrace_ignore_pid = false; - rcu_read_lock(); - do_each_pid_task(pid, PIDTYPE_PID, p) { - set_tsk_trace_trace(p); - } while_each_pid_task(pid, PIDTYPE_PID, p); - rcu_read_unlock(); -} + rcu_assign_pointer(tr->function_pids, NULL); -static void clear_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - clear_ftrace_swapper(); - else - clear_ftrace_pid(pid); -} + /* Wait till all users are no longer using pid filtering */ + synchronize_sched(); -static void set_ftrace_pid_task(struct pid *pid) -{ - if (pid == ftrace_swapper_pid) - set_ftrace_swapper(); - else - set_ftrace_pid(pid); + trace_free_pid_list(pid_list); } -static int ftrace_pid_add(int p) +static void ftrace_pid_reset(struct trace_array *tr) { - struct pid *pid; - struct ftrace_pid *fpid; - int ret = -EINVAL; - mutex_lock(&ftrace_lock); - - if (!p) - pid = ftrace_swapper_pid; - else - pid = find_get_pid(p); - - if (!pid) - goto out; - - ret = 0; - - list_for_each_entry(fpid, &ftrace_pids, list) - if (fpid->pid == pid) - goto out_put; - - ret = -ENOMEM; - - fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); - if (!fpid) - goto out_put; - - list_add(&fpid->list, &ftrace_pids); - fpid->pid = pid; - - set_ftrace_pid_task(pid); + clear_ftrace_pids(tr); ftrace_update_pid_func(); - ftrace_startup_all(0); mutex_unlock(&ftrace_lock); - return 0; - -out_put: - if (pid != ftrace_swapper_pid) - put_pid(pid); - -out: - mutex_unlock(&ftrace_lock); - return ret; } -static void ftrace_pid_reset(void) -{ - struct ftrace_pid *fpid, *safe; - - mutex_lock(&ftrace_lock); - list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { - struct pid *pid = fpid->pid; - - clear_ftrace_pid_task(pid); - - list_del(&fpid->list); - kfree(fpid); - } - - ftrace_update_pid_func(); - ftrace_startup_all(0); - - mutex_unlock(&ftrace_lock); -} +/* Greater than any max PID */ +#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1) static void *fpid_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) { + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + mutex_lock(&ftrace_lock); + rcu_read_lock_sched(); - if (!ftrace_pids_enabled() && (!*pos)) - return (void *) 1; + pid_list = rcu_dereference_sched(tr->function_pids); - return seq_list_start(&ftrace_pids, *pos); + if (!pid_list) + return !(*pos) ? FTRACE_NO_PIDS : NULL; + + return trace_pid_start(pid_list, pos); } static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) { - if (v == (void *)1) + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids); + + if (v == FTRACE_NO_PIDS) return NULL; - return seq_list_next(v, &ftrace_pids, pos); + return trace_pid_next(pid_list, v, pos); } static void fpid_stop(struct seq_file *m, void *p) + __releases(RCU) { + rcu_read_unlock_sched(); mutex_unlock(&ftrace_lock); } static int fpid_show(struct seq_file *m, void *v) { - const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); - - if (v == (void *)1) { + if (v == FTRACE_NO_PIDS) { seq_puts(m, "no pid\n"); return 0; } - if (fpid->pid == ftrace_swapper_pid) - seq_puts(m, "swapper tasks\n"); - else - seq_printf(m, "%u\n", pid_vnr(fpid->pid)); - - return 0; + return trace_pid_show(m, v); } static const struct seq_operations ftrace_pid_sops = { @@ -5485,58 +5430,103 @@ static const struct seq_operations ftrace_pid_sops = { static int ftrace_pid_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + struct seq_file *m; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_pid_reset(); + ftrace_pid_reset(tr); - if (file->f_mode & FMODE_READ) - ret = seq_open(file, &ftrace_pid_sops); + ret = seq_open(file, &ftrace_pid_sops); + if (ret < 0) { + trace_array_put(tr); + } else { + m = file->private_data; + /* copy tr over to seq ops */ + m->private = tr; + } return ret; } +static void ignore_task_cpu(void *data) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + + /* + * This function is called by on_each_cpu() while the + * event_mutex is held. + */ + pid_list = rcu_dereference_protected(tr->function_pids, + mutex_is_locked(&ftrace_lock)); + + this_cpu_write(tr->trace_buffer.data->ftrace_ignore_pid, + trace_ignore_this_task(pid_list, current)); +} + static ssize_t ftrace_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[64], *tmp; - long val; - int ret; + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *pid_list; + ssize_t ret; - if (cnt >= sizeof(buf)) - return -EINVAL; + if (!cnt) + return 0; + + mutex_lock(&ftrace_lock); - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; + filtered_pids = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); - buf[cnt] = 0; + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) + goto out; + + rcu_assign_pointer(tr->function_pids, pid_list); + + if (filtered_pids) { + synchronize_sched(); + trace_free_pid_list(filtered_pids); + } else if (pid_list) { + /* Register a probe to set whether to ignore the tracing of a task */ + register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + } /* - * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" - * to clean the filter quietly. + * Ignoring of pids is done at task switch. But we have to + * check for those tasks that are currently running. + * Always do this in case a pid was appended or removed. */ - tmp = strstrip(buf); - if (strlen(tmp) == 0) - return 1; + on_each_cpu(ignore_task_cpu, tr, 1); - ret = kstrtol(tmp, 10, &val); - if (ret < 0) - return ret; + ftrace_update_pid_func(); + ftrace_startup_all(0); + out: + mutex_unlock(&ftrace_lock); - ret = ftrace_pid_add(val); + if (ret > 0) + *ppos += ret; - return ret ? ret : cnt; + return ret; } static int ftrace_pid_release(struct inode *inode, struct file *file) { - if (file->f_mode & FMODE_READ) - seq_release(inode, file); + struct trace_array *tr = inode->i_private; - return 0; + trace_array_put(tr); + + return seq_release(inode, file); } static const struct file_operations ftrace_pid_fops = { @@ -5547,24 +5537,21 @@ static const struct file_operations ftrace_pid_fops = { .release = ftrace_pid_release, }; -static __init int ftrace_init_tracefs(void) +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - struct dentry *d_tracer; + trace_create_file("set_ftrace_pid", 0644, d_tracer, + tr, &ftrace_pid_fops); +} - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) - return 0; +void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer) +{ + /* Only the top level directory has the dyn_tracefs and profile */ + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); ftrace_init_dyn_tracefs(d_tracer); - - trace_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - ftrace_profile_tracefs(d_tracer); - - return 0; } -fs_initcall(ftrace_init_tracefs); /** * ftrace_kill - kill ftrace @@ -5713,7 +5700,6 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) { int i; int ret = 0; - unsigned long flags; int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; struct task_struct *g, *t; @@ -5729,7 +5715,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } } - read_lock_irqsave(&tasklist_lock, flags); + read_lock(&tasklist_lock); do_each_thread(g, t) { if (start == end) { ret = -EAGAIN; @@ -5747,7 +5733,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } while_each_thread(g, t); unlock: - read_unlock_irqrestore(&tasklist_lock, flags); + read_unlock(&tasklist_lock); free: for (i = start; i < end; i++) kfree(ret_stack_list[i]); diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 81b87451c0ea..0c7dee221dca 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,5 +15,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency); EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 95181e36891a..9c143739b8d7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -437,7 +437,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; - unsigned int nr_pages; + unsigned long nr_pages; unsigned int current_context; struct list_head *pages; struct buffer_page *head_page; /* read from head */ @@ -458,7 +458,7 @@ struct ring_buffer_per_cpu { u64 write_stamp; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ - int nr_pages_to_update; + long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; @@ -1128,10 +1128,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) return 0; } -static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) +static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) { - int i; struct buffer_page *bpage, *tmp; + long i; for (i = 0; i < nr_pages; i++) { struct page *page; @@ -1168,7 +1168,7 @@ free_pages: } static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, - unsigned nr_pages) + unsigned long nr_pages) { LIST_HEAD(pages); @@ -1193,7 +1193,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, } static struct ring_buffer_per_cpu * -rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) +rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; @@ -1293,8 +1293,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { struct ring_buffer *buffer; + long nr_pages; int bsize; - int cpu, nr_pages; + int cpu; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1420,12 +1421,12 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage) } static int -rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { struct list_head *tail_page, *to_remove, *next_page; struct buffer_page *to_remove_page, *tmp_iter_page; struct buffer_page *last_page, *first_page; - unsigned int nr_removed; + unsigned long nr_removed; unsigned long head_bit; int page_entries; @@ -1642,7 +1643,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned nr_pages; + unsigned long nr_pages; int cpu, err = 0; /* @@ -1656,14 +1657,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return size; - size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - size *= BUF_PAGE_SIZE; + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); /* we need a minimum of two pages */ - if (size < BUF_PAGE_SIZE * 2) - size = BUF_PAGE_SIZE * 2; + if (nr_pages < 2) + nr_pages = 2; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size = nr_pages * BUF_PAGE_SIZE; /* * Don't succeed if resizing is disabled, as a reader might be @@ -4640,8 +4640,9 @@ static int rb_cpu_notify(struct notifier_block *self, struct ring_buffer *buffer = container_of(self, struct ring_buffer, cpu_notify); long cpu = (long)hcpu; - int cpu_i, nr_pages_same; - unsigned int nr_pages; + long nr_pages_same; + int cpu_i; + unsigned long nr_pages; switch (action) { case CPU_UP_PREPARE: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a2f0b9f33e9b..7bc56762ca35 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -25,7 +25,7 @@ #include <linux/hardirq.h> #include <linux/linkage.h> #include <linux/uaccess.h> -#include <linux/kprobes.h> +#include <linux/vmalloc.h> #include <linux/ftrace.h> #include <linux/module.h> #include <linux/percpu.h> @@ -253,6 +253,9 @@ unsigned long long ns2usecs(cycle_t nsec) #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) +/* trace_flags that are default zero for instances */ +#define ZEROED_TRACE_FLAGS \ + TRACE_ITER_EVENT_FORK /* * The global_trace is the descriptor that holds the tracing @@ -303,33 +306,270 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } -int filter_check_discard(struct trace_event_file *file, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event) +int call_filter_check_discard(struct trace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) { - if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && - !filter_match_preds(file->filter, rec)) { - ring_buffer_discard_commit(buffer, event); + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && + !filter_match_preds(call->filter, rec)) { + __trace_event_discard_commit(buffer, event); return 1; } return 0; } -EXPORT_SYMBOL_GPL(filter_check_discard); -int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event) +void trace_free_pid_list(struct trace_pid_list *pid_list) { - if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && - !filter_match_preds(call->filter, rec)) { - ring_buffer_discard_commit(buffer, event); - return 1; + vfree(pid_list->pids); + kfree(pid_list); +} + +/** + * trace_find_filtered_pid - check if a pid exists in a filtered_pid list + * @filtered_pids: The list of pids to check + * @search_pid: The PID to find in @filtered_pids + * + * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis. + */ +bool +trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) +{ + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. + */ + if (search_pid >= filtered_pids->pid_max) + return false; + + return test_bit(search_pid, filtered_pids->pids); +} + +/** + * trace_ignore_this_task - should a task be ignored for tracing + * @filtered_pids: The list of pids to check + * @task: The task that should be ignored if not filtered + * + * Checks if @task should be traced or not from @filtered_pids. + * Returns true if @task should *NOT* be traced. + * Returns false if @task should be traced. + */ +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +{ + /* + * Return false, because if filtered_pids does not exist, + * all pids are good to trace. + */ + if (!filtered_pids) + return false; + + return !trace_find_filtered_pid(filtered_pids, task->pid); +} + +/** + * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * @pid_list: The list to modify + * @self: The current task for fork or NULL for exit + * @task: The task to add or remove + * + * If adding a task, if @self is defined, the task is only added if @self + * is also included in @pid_list. This happens on fork and tasks should + * only be added when the parent is listed. If @self is NULL, then the + * @task pid will be removed from the list, which would happen on exit + * of a task. + */ +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!trace_find_filtered_pid(pid_list, self->pid)) + return; } + /* Sorry, but we don't support pid_max changing after setting */ + if (task->pid >= pid_list->pid_max) + return; + + /* "self" is set for forks, and NULL for exits */ + if (self) + set_bit(task->pid, pid_list->pids); + else + clear_bit(task->pid, pid_list->pids); +} + +/** + * trace_pid_next - Used for seq_file to get to the next pid of a pid_list + * @pid_list: The pid list to show + * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) + * @pos: The position of the file + * + * This is used by the seq_file "next" operation to iterate the pids + * listed in a trace_pid_list structure. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) +{ + unsigned long pid = (unsigned long)v; + + (*pos)++; + + /* pid already is +1 of the actual prevous bit */ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + /* Return pid + 1 to allow zero to be represented */ + if (pid < pid_list->pid_max) + return (void *)(pid + 1); + + return NULL; +} + +/** + * trace_pid_start - Used for seq_file to start reading pid lists + * @pid_list: The pid list to show + * @pos: The position of the file + * + * This is used by seq_file "start" operation to start the iteration + * of listing pids. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) +{ + unsigned long pid; + loff_t l = 0; + + pid = find_first_bit(pid_list->pids, pid_list->pid_max); + if (pid >= pid_list->pid_max) + return NULL; + + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; + pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) + ; + return (void *)pid; +} + +/** + * trace_pid_show - show the current pid in seq_file processing + * @m: The seq_file structure to write into + * @v: A void pointer of the pid (+1) value to display + * + * Can be directly used by seq_file operations to display the current + * pid value. + */ +int trace_pid_show(struct seq_file *m, void *v) +{ + unsigned long pid = (unsigned long)v - 1; + + seq_printf(m, "%lu\n", pid); return 0; } -EXPORT_SYMBOL_GPL(call_filter_check_discard); + +/* 128 should be much more than enough */ +#define PID_BUF_SIZE 127 + +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt) +{ + struct trace_pid_list *pid_list; + struct trace_parser parser; + unsigned long val; + int nr_pids = 0; + ssize_t read = 0; + ssize_t ret = 0; + loff_t pos; + pid_t pid; + + if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) + return -ENOMEM; + + /* + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. If the operation fails, then the current list is + * not modified. + */ + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + return -ENOMEM; + + pid_list->pid_max = READ_ONCE(pid_max); + + /* Only truncating will shrink pid_max */ + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) + pid_list->pid_max = filtered_pids->pid_max; + + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); + if (!pid_list->pids) { + kfree(pid_list); + return -ENOMEM; + } + + if (filtered_pids) { + /* copy the current bits to the new max */ + for_each_set_bit(pid, filtered_pids->pids, + filtered_pids->pid_max) { + set_bit(pid, pid_list->pids); + nr_pids++; + } + } + + while (cnt > 0) { + + pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &pos); + if (ret < 0 || !trace_parser_loaded(&parser)) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + parser.buffer[parser.idx] = 0; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + if (val >= pid_list->pid_max) + break; + + pid = (pid_t)val; + + set_bit(pid, pid_list->pids); + nr_pids++; + + trace_parser_clear(&parser); + ret = 0; + } + trace_parser_put(&parser); + + if (ret < 0) { + trace_free_pid_list(pid_list); + return ret; + } + + if (!nr_pids) { + /* Cleared the list of pids */ + trace_free_pid_list(pid_list); + read = ret; + pid_list = NULL; + } + + *new_pid_list = pid_list; + + return read; +} static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { @@ -1672,6 +1912,16 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned long flags, int pc) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; +} + struct ring_buffer_event * trace_buffer_lock_reserve(struct ring_buffer *buffer, int type, @@ -1681,34 +1931,137 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, struct ring_buffer_event *event; event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) { - struct trace_entry *ent = ring_buffer_event_data(event); + if (event != NULL) + trace_event_setup(event, type, flags, pc); + + return event; +} - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; +DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +DEFINE_PER_CPU(int, trace_buffered_event_cnt); +static int trace_buffered_event_ref; + +/** + * trace_buffered_event_enable - enable buffering events + * + * When events are being filtered, it is quicker to use a temporary + * buffer to write the event data into if there's a likely chance + * that it will not be committed. The discard of the ring buffer + * is not as fast as committing, and is much slower than copying + * a commit. + * + * When an event is to be filtered, allocate per cpu buffers to + * write the event data into, and if the event is filtered and discarded + * it is simply dropped, otherwise, the entire data is to be committed + * in one shot. + */ +void trace_buffered_event_enable(void) +{ + struct ring_buffer_event *event; + struct page *page; + int cpu; + + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); + + if (trace_buffered_event_ref++) + return; + + for_each_tracing_cpu(cpu) { + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); + if (!page) + goto failed; + + event = page_address(page); + memset(event, 0, sizeof(*event)); + + per_cpu(trace_buffered_event, cpu) = event; + + preempt_disable(); + if (cpu == smp_processor_id() && + this_cpu_read(trace_buffered_event) != + per_cpu(trace_buffered_event, cpu)) + WARN_ON_ONCE(1); + preempt_enable(); } - return event; + return; + failed: + trace_buffered_event_disable(); } -void -__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +static void enable_trace_buffered_event(void *data) { - __this_cpu_write(trace_cmdline_save, true); - ring_buffer_unlock_commit(buffer, event); + /* Probably not needed, but do it anyway */ + smp_rmb(); + this_cpu_dec(trace_buffered_event_cnt); } -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) +static void disable_trace_buffered_event(void *data) { - __buffer_unlock_commit(buffer, event); + this_cpu_inc(trace_buffered_event_cnt); +} - ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); - ftrace_trace_userstack(buffer, flags, pc); +/** + * trace_buffered_event_disable - disable buffering events + * + * When a filter is removed, it is faster to not use the buffered + * events, and to commit directly into the ring buffer. Free up + * the temp buffers when there are no more users. This requires + * special synchronization with current events. + */ +void trace_buffered_event_disable(void) +{ + int cpu; + + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); + + if (WARN_ON_ONCE(!trace_buffered_event_ref)) + return; + + if (--trace_buffered_event_ref) + return; + + preempt_disable(); + /* For each CPU, set the buffer as used. */ + smp_call_function_many(tracing_buffer_mask, + disable_trace_buffered_event, NULL, 1); + preempt_enable(); + + /* Wait for all current users to finish */ + synchronize_sched(); + + for_each_tracing_cpu(cpu) { + free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); + per_cpu(trace_buffered_event, cpu) = NULL; + } + /* + * Make sure trace_buffered_event is NULL before clearing + * trace_buffered_event_cnt. + */ + smp_wmb(); + + preempt_disable(); + /* Do the work on each cpu */ + smp_call_function_many(tracing_buffer_mask, + enable_trace_buffered_event, NULL, 1); + preempt_enable(); +} + +void +__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_cmdline_save, true); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + } else + ring_buffer_unlock_commit(buffer, event); } -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); static struct ring_buffer *temp_buffer; @@ -1719,8 +2072,23 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, unsigned long flags, int pc) { struct ring_buffer_event *entry; + int val; *current_rb = trace_file->tr->trace_buffer.buffer; + + if ((trace_file->flags & + (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && + (entry = this_cpu_read(trace_buffered_event))) { + /* Try to use the per cpu buffer first */ + val = this_cpu_inc_return(trace_buffered_event_cnt); + if (val == 1) { + trace_event_setup(entry, type, flags, pc); + entry->array[0] = len; + return entry; + } + this_cpu_dec(trace_buffered_event_cnt); + } + entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); /* @@ -1738,17 +2106,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, - int type, unsigned long len, - unsigned long flags, int pc) -{ - *current_rb = global_trace.trace_buffer.buffer; - return trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); - void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -1757,17 +2114,19 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); + /* + * If regs is not set, then skip the following callers: + * trace_buffer_unlock_commit_regs + * event_trigger_unlock_commit + * trace_event_buffer_commit + * trace_event_raw_event_sched_switch + * Note, we can still get here via blktrace, wakeup tracer + * and mmiotrace, but that's ok if they lose a function or + * two. They are that meaningful. + */ + ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); - -void trace_current_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - ring_buffer_discard_commit(buffer, event); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); void trace_function(struct trace_array *tr, @@ -1816,6 +2175,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; /* + * Add two, for this function and the call to save_stack_trace() + * If regs is set, then these functions will not be in the way. + */ + if (!regs) + trace.skip += 2; + + /* * Since events can happen in NMIs there's no safe way to * use the per cpu ftrace_stacks. We reserve it and if an interrupt * or NMI comes in, it will just have to use the default @@ -1986,83 +2352,41 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) /* created for use with alloc_percpu */ struct trace_buffer_struct { - char buffer[TRACE_BUF_SIZE]; + int nesting; + char buffer[4][TRACE_BUF_SIZE]; }; static struct trace_buffer_struct *trace_percpu_buffer; -static struct trace_buffer_struct *trace_percpu_sirq_buffer; -static struct trace_buffer_struct *trace_percpu_irq_buffer; -static struct trace_buffer_struct *trace_percpu_nmi_buffer; /* - * The buffer used is dependent on the context. There is a per cpu - * buffer for normal context, softirq contex, hard irq context and - * for NMI context. Thise allows for lockless recording. - * - * Note, if the buffers failed to be allocated, then this returns NULL + * Thise allows for lockless recording. If we're nested too deeply, then + * this returns NULL. */ static char *get_trace_buf(void) { - struct trace_buffer_struct *percpu_buffer; - - /* - * If we have allocated per cpu buffers, then we do not - * need to do any locking. - */ - if (in_nmi()) - percpu_buffer = trace_percpu_nmi_buffer; - else if (in_irq()) - percpu_buffer = trace_percpu_irq_buffer; - else if (in_softirq()) - percpu_buffer = trace_percpu_sirq_buffer; - else - percpu_buffer = trace_percpu_buffer; + struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - if (!percpu_buffer) + if (!buffer || buffer->nesting >= 4) return NULL; - return this_cpu_ptr(&percpu_buffer->buffer[0]); + return &buffer->buffer[buffer->nesting++][0]; +} + +static void put_trace_buf(void) +{ + this_cpu_dec(trace_percpu_buffer->nesting); } static int alloc_percpu_trace_buffer(void) { struct trace_buffer_struct *buffers; - struct trace_buffer_struct *sirq_buffers; - struct trace_buffer_struct *irq_buffers; - struct trace_buffer_struct *nmi_buffers; buffers = alloc_percpu(struct trace_buffer_struct); - if (!buffers) - goto err_warn; - - sirq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!sirq_buffers) - goto err_sirq; - - irq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!irq_buffers) - goto err_irq; - - nmi_buffers = alloc_percpu(struct trace_buffer_struct); - if (!nmi_buffers) - goto err_nmi; + if (WARN(!buffers, "Could not allocate percpu trace_printk buffer")) + return -ENOMEM; trace_percpu_buffer = buffers; - trace_percpu_sirq_buffer = sirq_buffers; - trace_percpu_irq_buffer = irq_buffers; - trace_percpu_nmi_buffer = nmi_buffers; - return 0; - - err_nmi: - free_percpu(irq_buffers); - err_irq: - free_percpu(sirq_buffers); - err_sirq: - free_percpu(buffers); - err_warn: - WARN(1, "Could not allocate percpu trace_printk buffer"); - return -ENOMEM; } static int buffers_allocated; @@ -2153,7 +2477,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); @@ -2179,6 +2503,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) } out: + put_trace_buf(); + +out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); @@ -2210,7 +2537,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); @@ -2229,7 +2556,11 @@ __trace_array_vprintk(struct ring_buffer *buffer, __buffer_unlock_commit(buffer, event); ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); } - out: + +out: + put_trace_buf(); + +out_nobuffer: preempt_enable_notrace(); unpause_graph_tracing(); @@ -3571,6 +3902,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + if (mask == TRACE_ITER_EVENT_FORK) + trace_event_follow_fork(tr, enabled); + if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE @@ -3658,7 +3992,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, if (cnt >= sizeof(buf)) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -3804,12 +4138,19 @@ static const char readme_msg[] = "\t trigger: traceon, traceoff\n" "\t enable_event:<system>:<event>\n" "\t disable_event:<system>:<event>\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t enable_hist:<system>:<event>\n" + "\t disable_hist:<system>:<event>\n" +#endif #ifdef CONFIG_STACKTRACE "\t\t stacktrace\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" #endif +#ifdef CONFIG_HIST_TRIGGERS + "\t\t hist (see below)\n" +#endif "\t example: echo traceoff > events/block/block_unplug/trigger\n" "\t echo traceoff:3 > events/block/block_unplug/trigger\n" "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" @@ -3825,6 +4166,56 @@ static const char readme_msg[] = "\t To remove a trigger with a count:\n" "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" "\t Filters can be ignored when removing a trigger.\n" +#ifdef CONFIG_HIST_TRIGGERS + " hist trigger\t- If set, event hits are aggregated into a hash table\n" + "\t Format: hist:keys=<field1[,field2,...]>\n" + "\t [:values=<field1[,field2,...]>]\n" + "\t [:sort=<field1[,field2,...]>]\n" + "\t [:size=#entries]\n" + "\t [:pause][:continue][:clear]\n" + "\t [:name=histname1]\n" + "\t [if <filter>]\n\n" + "\t When a matching event is hit, an entry is added to a hash\n" + "\t table using the key(s) and value(s) named, and the value of a\n" + "\t sum called 'hitcount' is incremented. Keys and values\n" + "\t correspond to fields in the event's format description. Keys\n" + "\t can be any field, or the special string 'stacktrace'.\n" + "\t Compound keys consisting of up to two fields can be specified\n" + "\t by the 'keys' keyword. Values must correspond to numeric\n" + "\t fields. Sort keys consisting of up to two fields can be\n" + "\t specified using the 'sort' keyword. The sort direction can\n" + "\t be modified by appending '.descending' or '.ascending' to a\n" + "\t sort field. The 'size' parameter can be used to specify more\n" + "\t or fewer than the default 2048 entries for the hashtable size.\n" + "\t If a hist trigger is given a name using the 'name' parameter,\n" + "\t its histogram data will be shared with other triggers of the\n" + "\t same name, and trigger hits will update this common data.\n\n" + "\t Reading the 'hist' file for the event will dump the hash\n" + "\t table in its entirety to stdout. If there are multiple hist\n" + "\t triggers attached to an event, there will be a table for each\n" + "\t trigger in the output. The table displayed for a named\n" + "\t trigger will be the same as any other instance having the\n" + "\t same name. The default format used to display a given field\n" + "\t can be modified by appending any of the following modifiers\n" + "\t to the field name, as applicable:\n\n" + "\t .hex display a number as a hex value\n" + "\t .sym display an address as a symbol\n" + "\t .sym-offset display an address as a symbol and offset\n" + "\t .execname display a common_pid as a program name\n" + "\t .syscall display a syscall id as a syscall name\n\n" + "\t .log2 display log2 value rather than raw number\n\n" + "\t The 'pause' parameter can be used to pause an existing hist\n" + "\t trigger or to start a hist trigger but not log any events\n" + "\t until told to do so. 'continue' can be used to start or\n" + "\t restart a paused hist trigger.\n\n" + "\t The 'clear' parameter will clear the contents of a running\n" + "\t hist trigger and leave its current paused/active state\n" + "\t unchanged.\n\n" + "\t The enable_hist and disable_hist triggers can be used to\n" + "\t have one event conditionally start and stop another event's\n" + "\t already-attached hist trigger. The syntax is analagous to\n" + "\t the enable_event and disable_event triggers.\n" +#endif ; static ssize_t @@ -4474,7 +4865,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (cnt > MAX_TRACER_SIZE) cnt = MAX_TRACER_SIZE; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -4733,19 +5124,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, struct trace_iterator *iter = filp->private_data; ssize_t sret; - /* return any leftover data */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (sret != -EBUSY) - return sret; - - trace_seq_init(&iter->seq); - /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself * is protected. */ mutex_lock(&iter->mutex); + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + goto out; + + trace_seq_init(&iter->seq); + if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) @@ -5264,7 +5656,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, if (cnt >= sizeof(buf)) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -5772,9 +6164,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -EBUSY; #endif - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - if (*ppos & (PAGE_SIZE - 1)) return -EINVAL; @@ -5784,6 +6173,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + again: trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); @@ -5841,19 +6233,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { if (ret) - return ret; + goto out; + ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) - return -EAGAIN; + goto out; ret = wait_on_pipe(iter, true); if (ret) - return ret; + goto out; goto again; } ret = splice_to_pipe(pipe, &spd); +out: splice_shrink_spd(&spd); return ret; @@ -6650,7 +7044,7 @@ static int instance_mkdir(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; - tr->trace_flags = global_trace.trace_flags; + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -6724,6 +7118,12 @@ static int instance_rmdir(const char *name) list_del(&tr->list); + /* Disable all the flags that were enabled coming in */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { + if ((1 << i) & ZEROED_TRACE_FLAGS) + set_tracer_flag(tr, 1 << i, 0); + } + tracing_set_nop(tr); event_trace_del_tracer(tr); ftrace_destroy_function_files(tr); @@ -6814,6 +7214,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); + ftrace_init_tracefs(tr, d_tracer); } static struct vfsmount *trace_automount(void *ingore) @@ -6967,6 +7368,7 @@ static __init int tracer_init_tracefs(void) return 0; init_tracer_tracefs(&global_trace, d_tracer); + ftrace_init_tracefs_toplevel(&global_trace, d_tracer); trace_create_file("tracing_thresh", 0644, d_tracer, &global_trace, &tracing_thresh_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3fff4adfd431..f783df416726 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -80,6 +80,12 @@ enum trace_type { FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print, \ + filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) __packed + #include "trace_entries.h" /* @@ -156,6 +162,9 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; bool ignore_pid; +#ifdef CONFIG_FUNCTION_TRACER + bool ftrace_ignore_pid; +#endif }; struct tracer; @@ -177,9 +186,8 @@ struct trace_options { }; struct trace_pid_list { - unsigned int nr_pids; - int order; - pid_t *pids; + int pid_max; + unsigned long *pids; }; /* @@ -248,6 +256,7 @@ struct trace_array { int ref; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; + struct trace_pid_list __rcu *function_pids; /* function tracing enabled */ int function_enabled; #endif @@ -629,6 +638,25 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; +/* PID filtering */ + +extern int pid_max; + +bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, + pid_t search_pid); +bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct task_struct *task); +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task); +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); +int trace_pid_show(struct seq_file *m, void *v); +void trace_free_pid_list(struct trace_pid_list *pid_list); +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt); + #ifdef CONFIG_TRACER_MAX_TRACE void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, @@ -656,6 +684,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern cycle_t ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); +extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; @@ -821,12 +850,9 @@ extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER extern bool ftrace_filter_param __initdata; -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr) { - if (list_empty(&ftrace_pids)) - return 1; - - return test_tsk_trace_trace(task); + return !this_cpu_read(tr->trace_buffer.data->ftrace_ignore_pid); } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, @@ -836,8 +862,11 @@ void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); int using_ftrace_ops_list_func(void); +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); +void ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer); #else -static inline int ftrace_trace_task(struct task_struct *task) +static inline int ftrace_trace_task(struct trace_array *tr) { return 1; } @@ -852,6 +881,8 @@ static inline void ftrace_destroy_function_files(struct trace_array *tr) { } static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ @@ -967,6 +998,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ + C(EVENT_FORK, "event-fork"), \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ @@ -1064,6 +1096,137 @@ struct trace_subsystem_dir { int nr_events; }; +extern int call_filter_check_discard(struct trace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); + +void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs); + +static inline void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL); +} + +DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +DECLARE_PER_CPU(int, trace_buffered_event_cnt); +void trace_buffered_event_disable(void); +void trace_buffered_event_enable(void); + +static inline void +__trace_event_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (this_cpu_read(trace_buffered_event) == event) { + /* Simply release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + return; + } + ring_buffer_discard_commit(buffer, event); +} + +/* + * Helper function for event_trigger_unlock_commit{_regs}(). + * If there are event triggers attached to this event that requires + * filtering against its fields, then they wil be called as the + * entry already holds the field information of the current event. + * + * It also checks if the event should be discarded or not. + * It is to be discarded if the event is soft disabled and the + * event was only recorded to process triggers, or if the event + * filter is active and this event did not match the filters. + * + * Returns true if the event is discarded, false otherwise. + */ +static inline bool +__event_trigger_test_discard(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, + enum event_trigger_type *tt) +{ + unsigned long eflags = file->flags; + + if (eflags & EVENT_FILE_FL_TRIGGER_COND) + *tt = event_triggers_call(file, entry); + + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || + (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && + !filter_match_preds(file->filter, entry))) { + __trace_event_discard_commit(buffer, event); + return true; + } + + return false; +} + +/** + * event_trigger_unlock_commit - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + */ +static inline void +event_trigger_unlock_commit(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); + + if (tt) + event_triggers_post_call(file, tt, entry); +} + +/** + * event_trigger_unlock_commit_regs - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + * + * Same as event_trigger_unlock_commit() but calls + * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). + */ +static inline void +event_trigger_unlock_commit_regs(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc, + struct pt_regs *regs) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit_regs(file->tr, buffer, event, + irq_flags, pc, regs); + + if (tt) + event_triggers_post_call(file, tt, entry); +} + #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) @@ -1161,6 +1324,15 @@ extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; +extern const struct file_operations event_hist_fops; + +#ifdef CONFIG_HIST_TRIGGERS +extern int register_trigger_hist_cmd(void); +extern int register_trigger_hist_enable_disable_cmds(void); +#else +static inline int register_trigger_hist_cmd(void) { return 0; } +static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } +#endif extern int register_trigger_cmds(void); extern void clear_event_triggers(struct trace_array *tr); @@ -1174,9 +1346,41 @@ struct event_trigger_data { char *filter_str; void *private_data; bool paused; + bool paused_tmp; struct list_head list; + char *name; + struct list_head named_list; + struct event_trigger_data *named_data; +}; + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" +#define ENABLE_HIST_STR "enable_hist" +#define DISABLE_HIST_STR "disable_hist" + +struct enable_trigger_data { + struct trace_event_file *file; + bool enable; + bool hist; }; +extern int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern void event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern int event_enable_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); +extern int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file); +extern void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); extern int event_trigger_init(struct event_trigger_ops *ops, struct event_trigger_data *data); @@ -1189,7 +1393,18 @@ extern void unregister_trigger(char *glob, struct event_trigger_ops *ops, extern int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, struct trace_event_file *file); +extern struct event_trigger_data *find_named_trigger(const char *name); +extern bool is_named_trigger(struct event_trigger_data *test); +extern int save_named_trigger(const char *name, + struct event_trigger_data *data); +extern void del_named_trigger(struct event_trigger_data *data); +extern void pause_named_trigger(struct event_trigger_data *data); +extern void unpause_named_trigger(struct event_trigger_data *data); +extern void set_named_trigger_data(struct event_trigger_data *data, + struct event_trigger_data *named_data); extern int register_event_command(struct event_command *cmd); +extern int unregister_event_command(struct event_command *cmd); +extern int register_trigger_hist_enable_disable_cmds(void); /** * struct event_trigger_ops - callbacks for trace event triggers @@ -1416,6 +1631,11 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ filter) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) + #include "trace_entries.h" #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee7b94a4810a..5c30efcda5e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -72,7 +72,7 @@ FTRACE_ENTRY_REG(function, ftrace_entry, ); /* Function call entry */ -FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, +FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, TRACE_GRAPH_ENT, @@ -88,7 +88,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, ); /* Function return entry */ -FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 00df25fd86ef..562fa69df5d3 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -47,6 +47,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (!is_sampling_event(p_event)) + return 0; + /* * We don't allow user space callchains for function trace * event, due to issues with page faults while tracing page @@ -260,42 +263,43 @@ void perf_trace_del(struct perf_event *p_event, int flags) tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } -void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs **regs, int *rctxp) +void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) { - struct trace_entry *entry; - unsigned long flags; char *raw_data; - int pc; + int rctx; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) + "perf buffer not large enough")) return NULL; - pc = preempt_count(); - - *rctxp = perf_swevent_get_recursion_context(); - if (*rctxp < 0) + *rctxp = rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) return NULL; if (regs) - *regs = this_cpu_ptr(&__perf_regs[*rctxp]); - raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); + *regs = this_cpu_ptr(&__perf_regs[rctx]); + raw_data = this_cpu_ptr(perf_trace_buf[rctx]); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); + return raw_data; +} +EXPORT_SYMBOL_GPL(perf_trace_buf_alloc); +NOKPROBE_SYMBOL(perf_trace_buf_alloc); + +void perf_trace_buf_update(void *record, u16 type) +{ + struct trace_entry *entry = record; + int pc = preempt_count(); + unsigned long flags; - entry = (struct trace_entry *)raw_data; local_save_flags(flags); tracing_generic_entry_update(entry, flags, pc); entry->type = type; - - return raw_data; } -EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); -NOKPROBE_SYMBOL(perf_trace_buf_prepare); +NOKPROBE_SYMBOL(perf_trace_buf_update); #ifdef CONFIG_FUNCTION_TRACER static void @@ -316,15 +320,16 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + memset(®s, 0, sizeof(regs)); perf_fetch_caller_regs(®s); - entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); if (!entry) return; entry->ip = ip; entry->parent_ip = parent_ip; - perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, 1, ®s, head, NULL); #undef ENTRY_SIZE diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6f965864cc02..03c0a48c3ac4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,7 +15,6 @@ #include <linux/kthread.h> #include <linux/tracefs.h> #include <linux/uaccess.h> -#include <linux/bsearch.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/sort.h> @@ -204,6 +203,24 @@ static void trace_destroy_fields(struct trace_event_call *call) } } +/* + * run-time version of trace_event_get_offsets_<call>() that returns the last + * accessible offset of trace fields excluding __dynamic_array bytes + */ +int trace_event_get_offsets(struct trace_event_call *call) +{ + struct ftrace_event_field *tail; + struct list_head *head; + + head = trace_get_fields(call); + /* + * head->next points to the last field with the largest offset, + * since it was added last by trace_define_field() + */ + tail = list_first_entry(head, struct ftrace_event_field, link); + return tail->offset + tail->size; +} + int trace_event_raw_init(struct trace_event_call *call) { int id; @@ -244,6 +261,14 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); + /* + * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables + * preemption (adding one to the preempt_count). Since we are + * interested in the preempt_count at the time the tracepoint was + * hit, we need to subtract one to offset the increment. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + fbuffer->pc--; fbuffer->trace_file = trace_file; fbuffer->event = @@ -363,6 +388,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; + unsigned long file_flags = file->flags; int ret = 0; int disable; @@ -445,6 +471,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; } + /* Enable or disable use of trace_buffered_event */ + if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) != + (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) { + if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) + trace_buffered_event_enable(); + else + trace_buffered_event_disable(); + } + return ret; } @@ -471,40 +506,41 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } -static int cmp_pid(const void *key, const void *elt) +static void +event_filter_pid_sched_process_exit(void *data, struct task_struct *task) { - const pid_t *search_pid = key; - const pid_t *pid = elt; + struct trace_pid_list *pid_list; + struct trace_array *tr = data; - if (*search_pid == *pid) - return 0; - if (*search_pid < *pid) - return -1; - return 1; + pid_list = rcu_dereference_sched(tr->filtered_pids); + trace_filter_add_remove_task(pid_list, NULL, task); } -static bool -check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task) +static void +event_filter_pid_sched_process_fork(void *data, + struct task_struct *self, + struct task_struct *task) { - pid_t search_pid; - pid_t *pid; - - /* - * Return false, because if filtered_pids does not exist, - * all pids are good to trace. - */ - if (!filtered_pids) - return false; - - search_pid = task->pid; + struct trace_pid_list *pid_list; + struct trace_array *tr = data; - pid = bsearch(&search_pid, filtered_pids->pids, - filtered_pids->nr_pids, sizeof(pid_t), - cmp_pid); - if (!pid) - return true; + pid_list = rcu_dereference_sched(tr->filtered_pids); + trace_filter_add_remove_task(pid_list, self, task); +} - return false; +void trace_event_follow_fork(struct trace_array *tr, bool enable) +{ + if (enable) { + register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, + tr, INT_MIN); + register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit, + tr, INT_MAX); + } else { + unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, + tr); + unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit, + tr); + } } static void @@ -517,8 +553,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, prev) && - check_ignore_pid(pid_list, next)); + trace_ignore_this_task(pid_list, prev) && + trace_ignore_this_task(pid_list, next)); } static void @@ -531,7 +567,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, next)); + trace_ignore_this_task(pid_list, next)); } static void @@ -547,7 +583,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, task)); + trace_ignore_this_task(pid_list, task)); } static void @@ -564,7 +600,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) /* Set tracing if current is enabled */ this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, current)); + trace_ignore_this_task(pid_list, current)); } static void __ftrace_clear_event_pids(struct trace_array *tr) @@ -602,8 +638,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) /* Wait till all users are no longer using pid filtering */ synchronize_sched(); - free_pages((unsigned long)pid_list->pids, pid_list->order); - kfree(pid_list); + trace_free_pid_list(pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr) @@ -946,6 +981,15 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); + + return trace_pid_next(pid_list, v, pos); +} + static void *p_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { @@ -963,10 +1007,10 @@ static void *p_start(struct seq_file *m, loff_t *pos) pid_list = rcu_dereference_sched(tr->filtered_pids); - if (!pid_list || *pos >= pid_list->nr_pids) + if (!pid_list) return NULL; - return (void *)&pid_list->pids[*pos]; + return trace_pid_start(pid_list, pos); } static void p_stop(struct seq_file *m, void *p) @@ -976,28 +1020,6 @@ static void p_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static void * -p_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct trace_array *tr = m->private; - struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); - - (*pos)++; - - if (*pos >= pid_list->nr_pids) - return NULL; - - return (void *)&pid_list->pids[*pos]; -} - -static int p_show(struct seq_file *m, void *v) -{ - pid_t *pid = v; - - seq_printf(m, "%d\n", *pid); - return 0; -} - static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -1543,11 +1565,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } -static int max_pids(struct trace_pid_list *pid_list) -{ - return (PAGE_SIZE << pid_list->order) / sizeof(pid_t); -} - static void ignore_task_cpu(void *data) { struct trace_array *tr = data; @@ -1561,7 +1578,7 @@ static void ignore_task_cpu(void *data) mutex_is_locked(&event_mutex)); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, current)); + trace_ignore_this_task(pid_list, current)); } static ssize_t @@ -1571,15 +1588,9 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; struct trace_pid_list *filtered_pids = NULL; - struct trace_pid_list *pid_list = NULL; + struct trace_pid_list *pid_list; struct trace_event_file *file; - struct trace_parser parser; - unsigned long val; - loff_t this_pos; - ssize_t read = 0; - ssize_t ret = 0; - pid_t pid; - int i; + ssize_t ret; if (!cnt) return 0; @@ -1588,116 +1599,14 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) - return -ENOMEM; - mutex_lock(&event_mutex); - /* - * Load as many pids into the array before doing a - * swap from the tr->filtered_pids to the new list. - */ - while (cnt > 0) { - - this_pos = 0; - - ret = trace_get_user(&parser, ubuf, cnt, &this_pos); - if (ret < 0 || !trace_parser_loaded(&parser)) - break; - - read += ret; - ubuf += ret; - cnt -= ret; - - parser.buffer[parser.idx] = 0; - - ret = -EINVAL; - if (kstrtoul(parser.buffer, 0, &val)) - break; - if (val > INT_MAX) - break; - - pid = (pid_t)val; - - ret = -ENOMEM; - if (!pid_list) { - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) - break; - - filtered_pids = rcu_dereference_protected(tr->filtered_pids, - lockdep_is_held(&event_mutex)); - if (filtered_pids) - pid_list->order = filtered_pids->order; - else - pid_list->order = 0; - - pid_list->pids = (void *)__get_free_pages(GFP_KERNEL, - pid_list->order); - if (!pid_list->pids) - break; - - if (filtered_pids) { - pid_list->nr_pids = filtered_pids->nr_pids; - memcpy(pid_list->pids, filtered_pids->pids, - pid_list->nr_pids * sizeof(pid_t)); - } else - pid_list->nr_pids = 0; - } - - if (pid_list->nr_pids >= max_pids(pid_list)) { - pid_t *pid_page; - - pid_page = (void *)__get_free_pages(GFP_KERNEL, - pid_list->order + 1); - if (!pid_page) - break; - memcpy(pid_page, pid_list->pids, - pid_list->nr_pids * sizeof(pid_t)); - free_pages((unsigned long)pid_list->pids, pid_list->order); - - pid_list->order++; - pid_list->pids = pid_page; - } - - pid_list->pids[pid_list->nr_pids++] = pid; - trace_parser_clear(&parser); - ret = 0; - } - trace_parser_put(&parser); - - if (ret < 0) { - if (pid_list) - free_pages((unsigned long)pid_list->pids, pid_list->order); - kfree(pid_list); - mutex_unlock(&event_mutex); - return ret; - } - - if (!pid_list) { - mutex_unlock(&event_mutex); - return ret; - } - sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL); - - /* Remove duplicates */ - for (i = 1; i < pid_list->nr_pids; i++) { - int start = i; - - while (i < pid_list->nr_pids && - pid_list->pids[i - 1] == pid_list->pids[i]) - i++; + filtered_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); - if (start != i) { - if (i < pid_list->nr_pids) { - memmove(&pid_list->pids[start], &pid_list->pids[i], - (pid_list->nr_pids - i) * sizeof(pid_t)); - pid_list->nr_pids -= i - start; - i = start; - } else - pid_list->nr_pids = start; - } - } + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) + goto out; rcu_assign_pointer(tr->filtered_pids, pid_list); @@ -1707,10 +1616,8 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { synchronize_sched(); - - free_pages((unsigned long)filtered_pids->pids, filtered_pids->order); - kfree(filtered_pids); - } else { + trace_free_pid_list(filtered_pids); + } else if (pid_list) { /* * Register a probe that is called before all other probes * to set ignore_pid if next or prev do not match. @@ -1745,10 +1652,11 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, */ on_each_cpu(ignore_task_cpu, tr, 1); + out: mutex_unlock(&event_mutex); - ret = read; - *ppos += read; + if (ret > 0) + *ppos += ret; return ret; } @@ -1775,7 +1683,7 @@ static const struct seq_operations show_set_event_seq_ops = { static const struct seq_operations show_set_pid_seq_ops = { .start = p_start, .next = p_next, - .show = p_show, + .show = trace_pid_show, .stop = p_stop, }; @@ -2103,6 +2011,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) trace_create_file("trigger", 0644, file->dir, file, &event_trigger_fops); +#ifdef CONFIG_HIST_TRIGGERS + trace_create_file("hist", 0444, file->dir, file, + &event_hist_fops); +#endif trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); @@ -3350,7 +3262,7 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); -static struct trace_array *event_tr; +static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, @@ -3374,17 +3286,17 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, local_save_flags(flags); - event = trace_current_buffer_lock_reserve(&buffer, - TRACE_FN, sizeof(*entry), - flags, pc); + event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, + TRACE_FN, sizeof(*entry), + flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; - trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc); - + event_trigger_unlock_commit(&event_trace_file, buffer, event, + entry, flags, pc); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); preempt_enable_notrace(); @@ -3399,9 +3311,11 @@ static struct ftrace_ops trace_ops __initdata = static __init void event_trace_self_test_with_function(void) { int ret; - event_tr = top_trace_array(); - if (WARN_ON(!event_tr)) + + event_trace_file.tr = top_trace_array(); + if (WARN_ON(!event_trace_file.tr)) return; + ret = register_ftrace_function(&trace_ops); if (WARN_ON(ret < 0)) { pr_info("Failed to enable function tracer for event tests\n"); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b3f5051cd4e9..9daa9b3bc6d9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -689,10 +689,7 @@ static void append_filter_err(struct filter_parse_state *ps, static inline struct event_filter *event_filter(struct trace_event_file *file) { - if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - return file->event_call->filter; - else - return file->filter; + return file->filter; } /* caller must hold event_mutex */ @@ -826,12 +823,12 @@ static void __free_preds(struct event_filter *filter) static void filter_disable(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; + unsigned long old_flags = file->flags; - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags &= ~TRACE_EVENT_FL_FILTERED; - else - file->flags &= ~EVENT_FILE_FL_FILTERED; + file->flags &= ~EVENT_FILE_FL_FILTERED; + + if (old_flags != file->flags) + trace_buffered_event_disable(); } static void __free_filter(struct event_filter *filter) @@ -883,13 +880,8 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) static inline void __remove_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - filter_disable(file); - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - remove_filter_string(call->filter); - else - remove_filter_string(file->filter); + remove_filter_string(file->filter); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, @@ -906,15 +898,8 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, static inline void __free_subsystem_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { - __free_filter(call->filter); - call->filter = NULL; - } else { - __free_filter(file->filter); - file->filter = NULL; - } + __free_filter(file->filter); + file->filter = NULL; } static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, @@ -1718,69 +1703,43 @@ fail: static inline void event_set_filtered_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; + unsigned long old_flags = file->flags; - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_FILTERED; - else - file->flags |= EVENT_FILE_FL_FILTERED; + file->flags |= EVENT_FILE_FL_FILTERED; + + if (old_flags != file->flags) + trace_buffered_event_enable(); } static inline void event_set_filter(struct trace_event_file *file, struct event_filter *filter) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - rcu_assign_pointer(call->filter, filter); - else - rcu_assign_pointer(file->filter, filter); + rcu_assign_pointer(file->filter, filter); } static inline void event_clear_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - RCU_INIT_POINTER(call->filter, NULL); - else - RCU_INIT_POINTER(file->filter, NULL); + RCU_INIT_POINTER(file->filter, NULL); } static inline void event_set_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags |= EVENT_FILE_FL_NO_SET_FILTER; + file->flags |= EVENT_FILE_FL_NO_SET_FILTER; } static inline void event_clear_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; + file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; } static inline bool event_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - if (file->flags & EVENT_FILE_FL_NO_SET_FILTER) return true; - if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && - (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) - return true; - return false; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c new file mode 100644 index 000000000000..f3a960ed75a1 --- /dev/null +++ b/kernel/trace/trace_events_hist.c @@ -0,0 +1,1755 @@ +/* + * trace_events_hist - trace event hist triggers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> + +#include "tracing_map.h" +#include "trace.h" + +struct hist_field; + +typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); + +struct hist_field { + struct ftrace_event_field *field; + unsigned long flags; + hist_field_fn_t fn; + unsigned int size; + unsigned int offset; +}; + +static u64 hist_field_none(struct hist_field *field, void *event) +{ + return 0; +} + +static u64 hist_field_counter(struct hist_field *field, void *event) +{ + return 1; +} + +static u64 hist_field_string(struct hist_field *hist_field, void *event) +{ + char *addr = (char *)(event + hist_field->field->offset); + + return (u64)(unsigned long)addr; +} + +static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) +{ + u32 str_item = *(u32 *)(event + hist_field->field->offset); + int str_loc = str_item & 0xffff; + char *addr = (char *)(event + str_loc); + + return (u64)(unsigned long)addr; +} + +static u64 hist_field_pstring(struct hist_field *hist_field, void *event) +{ + char **addr = (char **)(event + hist_field->field->offset); + + return (u64)(unsigned long)*addr; +} + +static u64 hist_field_log2(struct hist_field *hist_field, void *event) +{ + u64 val = *(u64 *)(event + hist_field->field->offset); + + return (u64) ilog2(roundup_pow_of_two(val)); +} + +#define DEFINE_HIST_FIELD_FN(type) \ +static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ +{ \ + type *addr = (type *)(event + hist_field->field->offset); \ + \ + return (u64)(unsigned long)*addr; \ +} + +DEFINE_HIST_FIELD_FN(s64); +DEFINE_HIST_FIELD_FN(u64); +DEFINE_HIST_FIELD_FN(s32); +DEFINE_HIST_FIELD_FN(u32); +DEFINE_HIST_FIELD_FN(s16); +DEFINE_HIST_FIELD_FN(u16); +DEFINE_HIST_FIELD_FN(s8); +DEFINE_HIST_FIELD_FN(u8); + +#define for_each_hist_field(i, hist_data) \ + for ((i) = 0; (i) < (hist_data)->n_fields; (i)++) + +#define for_each_hist_val_field(i, hist_data) \ + for ((i) = 0; (i) < (hist_data)->n_vals; (i)++) + +#define for_each_hist_key_field(i, hist_data) \ + for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) + +#define HIST_STACKTRACE_DEPTH 16 +#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) +#define HIST_STACKTRACE_SKIP 5 + +#define HITCOUNT_IDX 0 +#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) + +enum hist_field_flags { + HIST_FIELD_FL_HITCOUNT = 1, + HIST_FIELD_FL_KEY = 2, + HIST_FIELD_FL_STRING = 4, + HIST_FIELD_FL_HEX = 8, + HIST_FIELD_FL_SYM = 16, + HIST_FIELD_FL_SYM_OFFSET = 32, + HIST_FIELD_FL_EXECNAME = 64, + HIST_FIELD_FL_SYSCALL = 128, + HIST_FIELD_FL_STACKTRACE = 256, + HIST_FIELD_FL_LOG2 = 512, +}; + +struct hist_trigger_attrs { + char *keys_str; + char *vals_str; + char *sort_key_str; + char *name; + bool pause; + bool cont; + bool clear; + unsigned int map_bits; +}; + +struct hist_trigger_data { + struct hist_field *fields[TRACING_MAP_FIELDS_MAX]; + unsigned int n_vals; + unsigned int n_keys; + unsigned int n_fields; + unsigned int key_size; + struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; + unsigned int n_sort_keys; + struct trace_event_file *event_file; + struct hist_trigger_attrs *attrs; + struct tracing_map *map; +}; + +static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) +{ + hist_field_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (field_is_signed) + fn = hist_field_s64; + else + fn = hist_field_u64; + break; + case 4: + if (field_is_signed) + fn = hist_field_s32; + else + fn = hist_field_u32; + break; + case 2: + if (field_is_signed) + fn = hist_field_s16; + else + fn = hist_field_u16; + break; + case 1: + if (field_is_signed) + fn = hist_field_s8; + else + fn = hist_field_u8; + break; + } + + return fn; +} + +static int parse_map_size(char *str) +{ + unsigned long size, map_bits; + int ret; + + strsep(&str, "="); + if (!str) { + ret = -EINVAL; + goto out; + } + + ret = kstrtoul(str, 0, &size); + if (ret) + goto out; + + map_bits = ilog2(roundup_pow_of_two(size)); + if (map_bits < TRACING_MAP_BITS_MIN || + map_bits > TRACING_MAP_BITS_MAX) + ret = -EINVAL; + else + ret = map_bits; + out: + return ret; +} + +static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) +{ + if (!attrs) + return; + + kfree(attrs->name); + kfree(attrs->sort_key_str); + kfree(attrs->keys_str); + kfree(attrs->vals_str); + kfree(attrs); +} + +static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) +{ + struct hist_trigger_attrs *attrs; + int ret = 0; + + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return ERR_PTR(-ENOMEM); + + while (trigger_str) { + char *str = strsep(&trigger_str, ":"); + + if ((strncmp(str, "key=", strlen("key=")) == 0) || + (strncmp(str, "keys=", strlen("keys=")) == 0)) + attrs->keys_str = kstrdup(str, GFP_KERNEL); + else if ((strncmp(str, "val=", strlen("val=")) == 0) || + (strncmp(str, "vals=", strlen("vals=")) == 0) || + (strncmp(str, "values=", strlen("values=")) == 0)) + attrs->vals_str = kstrdup(str, GFP_KERNEL); + else if (strncmp(str, "sort=", strlen("sort=")) == 0) + attrs->sort_key_str = kstrdup(str, GFP_KERNEL); + else if (strncmp(str, "name=", strlen("name=")) == 0) + attrs->name = kstrdup(str, GFP_KERNEL); + else if (strcmp(str, "pause") == 0) + attrs->pause = true; + else if ((strcmp(str, "cont") == 0) || + (strcmp(str, "continue") == 0)) + attrs->cont = true; + else if (strcmp(str, "clear") == 0) + attrs->clear = true; + else if (strncmp(str, "size=", strlen("size=")) == 0) { + int map_bits = parse_map_size(str); + + if (map_bits < 0) { + ret = map_bits; + goto free; + } + attrs->map_bits = map_bits; + } else { + ret = -EINVAL; + goto free; + } + } + + if (!attrs->keys_str) { + ret = -EINVAL; + goto free; + } + + return attrs; + free: + destroy_hist_trigger_attrs(attrs); + + return ERR_PTR(ret); +} + +static inline void save_comm(char *comm, struct task_struct *task) +{ + if (!task->pid) { + strcpy(comm, "<idle>"); + return; + } + + if (WARN_ON_ONCE(task->pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + + memcpy(comm, task->comm, TASK_COMM_LEN); +} + +static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt) +{ + kfree((char *)elt->private_data); +} + +static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt) +{ + struct hist_trigger_data *hist_data = elt->map->private_data; + struct hist_field *key_field; + unsigned int i; + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + unsigned int size = TASK_COMM_LEN + 1; + + elt->private_data = kzalloc(size, GFP_KERNEL); + if (!elt->private_data) + return -ENOMEM; + break; + } + } + + return 0; +} + +static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to, + struct tracing_map_elt *from) +{ + char *comm_from = from->private_data; + char *comm_to = to->private_data; + + if (comm_from) + memcpy(comm_to, comm_from, TASK_COMM_LEN + 1); +} + +static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt) +{ + char *comm = elt->private_data; + + if (comm) + save_comm(comm, current); +} + +static const struct tracing_map_ops hist_trigger_elt_comm_ops = { + .elt_alloc = hist_trigger_elt_comm_alloc, + .elt_copy = hist_trigger_elt_comm_copy, + .elt_free = hist_trigger_elt_comm_free, + .elt_init = hist_trigger_elt_comm_init, +}; + +static void destroy_hist_field(struct hist_field *hist_field) +{ + kfree(hist_field); +} + +static struct hist_field *create_hist_field(struct ftrace_event_field *field, + unsigned long flags) +{ + struct hist_field *hist_field; + + if (field && is_function_field(field)) + return NULL; + + hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + if (!hist_field) + return NULL; + + if (flags & HIST_FIELD_FL_HITCOUNT) { + hist_field->fn = hist_field_counter; + goto out; + } + + if (flags & HIST_FIELD_FL_STACKTRACE) { + hist_field->fn = hist_field_none; + goto out; + } + + if (flags & HIST_FIELD_FL_LOG2) { + hist_field->fn = hist_field_log2; + goto out; + } + + if (WARN_ON_ONCE(!field)) + goto out; + + if (is_string_field(field)) { + flags |= HIST_FIELD_FL_STRING; + + if (field->filter_type == FILTER_STATIC_STRING) + hist_field->fn = hist_field_string; + else if (field->filter_type == FILTER_DYN_STRING) + hist_field->fn = hist_field_dynstring; + else + hist_field->fn = hist_field_pstring; + } else { + hist_field->fn = select_value_fn(field->size, + field->is_signed); + if (!hist_field->fn) { + destroy_hist_field(hist_field); + return NULL; + } + } + out: + hist_field->field = field; + hist_field->flags = flags; + + return hist_field; +} + +static void destroy_hist_fields(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { + if (hist_data->fields[i]) { + destroy_hist_field(hist_data->fields[i]); + hist_data->fields[i] = NULL; + } + } +} + +static int create_hitcount_val(struct hist_trigger_data *hist_data) +{ + hist_data->fields[HITCOUNT_IDX] = + create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT); + if (!hist_data->fields[HITCOUNT_IDX]) + return -ENOMEM; + + hist_data->n_vals++; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + return -EINVAL; + + return 0; +} + +static int create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *field_str) +{ + struct ftrace_event_field *field = NULL; + unsigned long flags = 0; + char *field_name; + int ret = 0; + + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) + return -EINVAL; + + field_name = strsep(&field_str, "."); + if (field_str) { + if (strcmp(field_str, "hex") == 0) + flags |= HIST_FIELD_FL_HEX; + else { + ret = -EINVAL; + goto out; + } + } + + field = trace_find_event_field(file->event_call, field_name); + if (!field) { + ret = -EINVAL; + goto out; + } + + hist_data->fields[val_idx] = create_hist_field(field, flags); + if (!hist_data->fields[val_idx]) { + ret = -ENOMEM; + goto out; + } + + ++hist_data->n_vals; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + ret = -EINVAL; + out: + return ret; +} + +static int create_val_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + char *fields_str, *field_str; + unsigned int i, j; + int ret; + + ret = create_hitcount_val(hist_data); + if (ret) + goto out; + + fields_str = hist_data->attrs->vals_str; + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) + goto out; + + for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX && + j < TRACING_MAP_VALS_MAX; i++) { + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + if (strcmp(field_str, "hitcount") == 0) + continue; + ret = create_val_field(hist_data, j++, file, field_str); + if (ret) + goto out; + } + if (fields_str && (strcmp(fields_str, "hitcount") != 0)) + ret = -EINVAL; + out: + return ret; +} + +static int create_key_field(struct hist_trigger_data *hist_data, + unsigned int key_idx, + unsigned int key_offset, + struct trace_event_file *file, + char *field_str) +{ + struct ftrace_event_field *field = NULL; + unsigned long flags = 0; + unsigned int key_size; + int ret = 0; + + if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) + return -EINVAL; + + flags |= HIST_FIELD_FL_KEY; + + if (strcmp(field_str, "stacktrace") == 0) { + flags |= HIST_FIELD_FL_STACKTRACE; + key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH; + } else { + char *field_name = strsep(&field_str, "."); + + if (field_str) { + if (strcmp(field_str, "hex") == 0) + flags |= HIST_FIELD_FL_HEX; + else if (strcmp(field_str, "sym") == 0) + flags |= HIST_FIELD_FL_SYM; + else if (strcmp(field_str, "sym-offset") == 0) + flags |= HIST_FIELD_FL_SYM_OFFSET; + else if ((strcmp(field_str, "execname") == 0) && + (strcmp(field_name, "common_pid") == 0)) + flags |= HIST_FIELD_FL_EXECNAME; + else if (strcmp(field_str, "syscall") == 0) + flags |= HIST_FIELD_FL_SYSCALL; + else if (strcmp(field_str, "log2") == 0) + flags |= HIST_FIELD_FL_LOG2; + else { + ret = -EINVAL; + goto out; + } + } + + field = trace_find_event_field(file->event_call, field_name); + if (!field) { + ret = -EINVAL; + goto out; + } + + if (is_string_field(field)) + key_size = MAX_FILTER_STR_VAL; + else + key_size = field->size; + } + + hist_data->fields[key_idx] = create_hist_field(field, flags); + if (!hist_data->fields[key_idx]) { + ret = -ENOMEM; + goto out; + } + + key_size = ALIGN(key_size, sizeof(u64)); + hist_data->fields[key_idx]->size = key_size; + hist_data->fields[key_idx]->offset = key_offset; + hist_data->key_size += key_size; + if (hist_data->key_size > HIST_KEY_SIZE_MAX) { + ret = -EINVAL; + goto out; + } + + hist_data->n_keys++; + + if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX)) + return -EINVAL; + + ret = key_size; + out: + return ret; +} + +static int create_key_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + unsigned int i, key_offset = 0, n_vals = hist_data->n_vals; + char *fields_str, *field_str; + int ret = -EINVAL; + + fields_str = hist_data->attrs->keys_str; + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) + goto out; + + for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) { + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + ret = create_key_field(hist_data, i, key_offset, + file, field_str); + if (ret < 0) + goto out; + key_offset += ret; + } + if (fields_str) { + ret = -EINVAL; + goto out; + } + ret = 0; + out: + return ret; +} + +static int create_hist_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + int ret; + + ret = create_val_fields(hist_data, file); + if (ret) + goto out; + + ret = create_key_fields(hist_data, file); + if (ret) + goto out; + + hist_data->n_fields = hist_data->n_vals + hist_data->n_keys; + out: + return ret; +} + +static int is_descending(const char *str) +{ + if (!str) + return 0; + + if (strcmp(str, "descending") == 0) + return 1; + + if (strcmp(str, "ascending") == 0) + return 0; + + return -EINVAL; +} + +static int create_sort_keys(struct hist_trigger_data *hist_data) +{ + char *fields_str = hist_data->attrs->sort_key_str; + struct ftrace_event_field *field = NULL; + struct tracing_map_sort_key *sort_key; + int descending, ret = 0; + unsigned int i, j; + + hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */ + + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) { + ret = -EINVAL; + goto out; + } + + for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) { + char *field_str, *field_name; + + sort_key = &hist_data->sort_keys[i]; + + field_str = strsep(&fields_str, ","); + if (!field_str) { + if (i == 0) + ret = -EINVAL; + break; + } + + if ((i == TRACING_MAP_SORT_KEYS_MAX - 1) && fields_str) { + ret = -EINVAL; + break; + } + + field_name = strsep(&field_str, "."); + if (!field_name) { + ret = -EINVAL; + break; + } + + if (strcmp(field_name, "hitcount") == 0) { + descending = is_descending(field_str); + if (descending < 0) { + ret = descending; + break; + } + sort_key->descending = descending; + continue; + } + + for (j = 1; j < hist_data->n_fields; j++) { + field = hist_data->fields[j]->field; + if (field && (strcmp(field_name, field->name) == 0)) { + sort_key->field_idx = j; + descending = is_descending(field_str); + if (descending < 0) { + ret = descending; + goto out; + } + sort_key->descending = descending; + break; + } + } + if (j == hist_data->n_fields) { + ret = -EINVAL; + break; + } + } + hist_data->n_sort_keys = i; + out: + return ret; +} + +static void destroy_hist_data(struct hist_trigger_data *hist_data) +{ + destroy_hist_trigger_attrs(hist_data->attrs); + destroy_hist_fields(hist_data); + tracing_map_destroy(hist_data->map); + kfree(hist_data); +} + +static int create_tracing_map_fields(struct hist_trigger_data *hist_data) +{ + struct tracing_map *map = hist_data->map; + struct ftrace_event_field *field; + struct hist_field *hist_field; + int i, idx; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field->flags & HIST_FIELD_FL_KEY) { + tracing_map_cmp_fn_t cmp_fn; + + field = hist_field->field; + + if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) + cmp_fn = tracing_map_cmp_none; + else if (is_string_field(field)) + cmp_fn = tracing_map_cmp_string; + else + cmp_fn = tracing_map_cmp_num(field->size, + field->is_signed); + idx = tracing_map_add_key_field(map, + hist_field->offset, + cmp_fn); + + } else + idx = tracing_map_add_sum_field(map); + + if (idx < 0) + return idx; + } + + return 0; +} + +static bool need_tracing_map_ops(struct hist_trigger_data *hist_data) +{ + struct hist_field *key_field; + unsigned int i; + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (key_field->flags & HIST_FIELD_FL_EXECNAME) + return true; + } + + return false; +} + +static struct hist_trigger_data * +create_hist_data(unsigned int map_bits, + struct hist_trigger_attrs *attrs, + struct trace_event_file *file) +{ + const struct tracing_map_ops *map_ops = NULL; + struct hist_trigger_data *hist_data; + int ret = 0; + + hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL); + if (!hist_data) + return ERR_PTR(-ENOMEM); + + hist_data->attrs = attrs; + + ret = create_hist_fields(hist_data, file); + if (ret) + goto free; + + ret = create_sort_keys(hist_data); + if (ret) + goto free; + + if (need_tracing_map_ops(hist_data)) + map_ops = &hist_trigger_elt_comm_ops; + + hist_data->map = tracing_map_create(map_bits, hist_data->key_size, + map_ops, hist_data); + if (IS_ERR(hist_data->map)) { + ret = PTR_ERR(hist_data->map); + hist_data->map = NULL; + goto free; + } + + ret = create_tracing_map_fields(hist_data); + if (ret) + goto free; + + ret = tracing_map_init(hist_data->map); + if (ret) + goto free; + + hist_data->event_file = file; + out: + return hist_data; + free: + hist_data->attrs = NULL; + + destroy_hist_data(hist_data); + + hist_data = ERR_PTR(ret); + + goto out; +} + +static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + void *rec) +{ + struct hist_field *hist_field; + unsigned int i; + u64 hist_val; + + for_each_hist_val_field(i, hist_data) { + hist_field = hist_data->fields[i]; + hist_val = hist_field->fn(hist_field, rec); + tracing_map_update_sum(elt, i, hist_val); + } +} + +static inline void add_to_key(char *compound_key, void *key, + struct hist_field *key_field, void *rec) +{ + size_t size = key_field->size; + + if (key_field->flags & HIST_FIELD_FL_STRING) { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_PTR_STRING) + size = strlen(key); + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + + /* ensure NULL-termination */ + if (size > key_field->size - 1) + size = key_field->size - 1; + } + + memcpy(compound_key + key_field->offset, key, size); +} + +static void event_hist_trigger(struct event_trigger_data *data, void *rec) +{ + struct hist_trigger_data *hist_data = data->private_data; + bool use_compound_key = (hist_data->n_keys > 1); + unsigned long entries[HIST_STACKTRACE_DEPTH]; + char compound_key[HIST_KEY_SIZE_MAX]; + struct stack_trace stacktrace; + struct hist_field *key_field; + struct tracing_map_elt *elt; + u64 field_contents; + void *key = NULL; + unsigned int i; + + memset(compound_key, 0, hist_data->key_size); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + stacktrace.max_entries = HIST_STACKTRACE_DEPTH; + stacktrace.entries = entries; + stacktrace.nr_entries = 0; + stacktrace.skip = HIST_STACKTRACE_SKIP; + + memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE); + save_stack_trace(&stacktrace); + + key = entries; + } else { + field_contents = key_field->fn(key_field, rec); + if (key_field->flags & HIST_FIELD_FL_STRING) { + key = (void *)(unsigned long)field_contents; + use_compound_key = true; + } else + key = (void *)&field_contents; + } + + if (use_compound_key) + add_to_key(compound_key, key, key_field, rec); + } + + if (use_compound_key) + key = compound_key; + + elt = tracing_map_insert(hist_data->map, key); + if (elt) + hist_trigger_elt_update(hist_data, elt, rec); +} + +static void hist_trigger_stacktrace_print(struct seq_file *m, + unsigned long *stacktrace_entries, + unsigned int max_entries) +{ + char str[KSYM_SYMBOL_LEN]; + unsigned int spaces = 8; + unsigned int i; + + for (i = 0; i < max_entries; i++) { + if (stacktrace_entries[i] == ULONG_MAX) + return; + + seq_printf(m, "%*c", 1 + spaces, ' '); + sprint_symbol(str, stacktrace_entries[i]); + seq_printf(m, "%s\n", str); + } +} + +static void +hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, void *key, + struct tracing_map_elt *elt) +{ + struct hist_field *key_field; + char str[KSYM_SYMBOL_LEN]; + bool multiline = false; + unsigned int i; + u64 uval; + + seq_puts(m, "{ "); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ", "); + + if (key_field->flags & HIST_FIELD_FL_HEX) { + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %llx", + key_field->field->name, uval); + } else if (key_field->flags & HIST_FIELD_FL_SYM) { + uval = *(u64 *)(key + key_field->offset); + sprint_symbol_no_offset(str, uval); + seq_printf(m, "%s: [%llx] %-45s", + key_field->field->name, uval, str); + } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) { + uval = *(u64 *)(key + key_field->offset); + sprint_symbol(str, uval); + seq_printf(m, "%s: [%llx] %-55s", + key_field->field->name, uval, str); + } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + char *comm = elt->private_data; + + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %-16s[%10llu]", + key_field->field->name, comm, uval); + } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) { + const char *syscall_name; + + uval = *(u64 *)(key + key_field->offset); + syscall_name = get_syscall_name(uval); + if (!syscall_name) + syscall_name = "unknown_syscall"; + + seq_printf(m, "%s: %-30s[%3llu]", + key_field->field->name, syscall_name, uval); + } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + seq_puts(m, "stacktrace:\n"); + hist_trigger_stacktrace_print(m, + key + key_field->offset, + HIST_STACKTRACE_DEPTH); + multiline = true; + } else if (key_field->flags & HIST_FIELD_FL_LOG2) { + seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name, + *(u64 *)(key + key_field->offset)); + } else if (key_field->flags & HIST_FIELD_FL_STRING) { + seq_printf(m, "%s: %-50s", key_field->field->name, + (char *)(key + key_field->offset)); + } else { + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %10llu", key_field->field->name, + uval); + } + } + + if (!multiline) + seq_puts(m, " "); + + seq_puts(m, "}"); + + seq_printf(m, " hitcount: %10llu", + tracing_map_read_sum(elt, HITCOUNT_IDX)); + + for (i = 1; i < hist_data->n_vals; i++) { + if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { + seq_printf(m, " %s: %10llx", + hist_data->fields[i]->field->name, + tracing_map_read_sum(elt, i)); + } else { + seq_printf(m, " %s: %10llu", + hist_data->fields[i]->field->name, + tracing_map_read_sum(elt, i)); + } + } + + seq_puts(m, "\n"); +} + +static int print_entries(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + struct tracing_map_sort_entry **sort_entries = NULL; + struct tracing_map *map = hist_data->map; + int i, n_entries; + + n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, + hist_data->n_sort_keys, + &sort_entries); + if (n_entries < 0) + return n_entries; + + for (i = 0; i < n_entries; i++) + hist_trigger_entry_print(m, hist_data, + sort_entries[i]->key, + sort_entries[i]->elt); + + tracing_map_destroy_sort_entries(sort_entries, n_entries); + + return n_entries; +} + +static void hist_trigger_show(struct seq_file *m, + struct event_trigger_data *data, int n) +{ + struct hist_trigger_data *hist_data; + int n_entries, ret = 0; + + if (n > 0) + seq_puts(m, "\n\n"); + + seq_puts(m, "# event histogram\n#\n# trigger info: "); + data->ops->print(m, data->ops, data); + seq_puts(m, "#\n\n"); + + hist_data = data->private_data; + n_entries = print_entries(m, hist_data); + if (n_entries < 0) { + ret = n_entries; + n_entries = 0; + } + + seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", + (u64)atomic64_read(&hist_data->map->hits), + n_entries, (u64)atomic64_read(&hist_data->map->drops)); +} + +static int hist_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct trace_event_file *event_file; + int n = 0, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry_rcu(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) + hist_trigger_show(m, data, n++); + } + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +static int event_hist_open(struct inode *inode, struct file *file) +{ + return single_open(file, hist_show, file); +} + +const struct file_operations event_hist_fops = { + .open = event_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const char *get_hist_field_flags(struct hist_field *hist_field) +{ + const char *flags_str = NULL; + + if (hist_field->flags & HIST_FIELD_FL_HEX) + flags_str = "hex"; + else if (hist_field->flags & HIST_FIELD_FL_SYM) + flags_str = "sym"; + else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) + flags_str = "sym-offset"; + else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) + flags_str = "execname"; + else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) + flags_str = "syscall"; + else if (hist_field->flags & HIST_FIELD_FL_LOG2) + flags_str = "log2"; + + return flags_str; +} + +static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) +{ + seq_printf(m, "%s", hist_field->field->name); + if (hist_field->flags) { + const char *flags_str = get_hist_field_flags(hist_field); + + if (flags_str) + seq_printf(m, ".%s", flags_str); + } +} + +static int event_hist_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct hist_field *key_field; + unsigned int i; + + seq_puts(m, "hist:"); + + if (data->name) + seq_printf(m, "%s:", data->name); + + seq_puts(m, "keys="); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ","); + + if (key_field->flags & HIST_FIELD_FL_STACKTRACE) + seq_puts(m, "stacktrace"); + else + hist_field_print(m, key_field); + } + + seq_puts(m, ":vals="); + + for_each_hist_val_field(i, hist_data) { + if (i == HITCOUNT_IDX) + seq_puts(m, "hitcount"); + else { + seq_puts(m, ","); + hist_field_print(m, hist_data->fields[i]); + } + } + + seq_puts(m, ":sort="); + + for (i = 0; i < hist_data->n_sort_keys; i++) { + struct tracing_map_sort_key *sort_key; + + sort_key = &hist_data->sort_keys[i]; + + if (i > 0) + seq_puts(m, ","); + + if (sort_key->field_idx == HITCOUNT_IDX) + seq_puts(m, "hitcount"); + else { + unsigned int idx = sort_key->field_idx; + + if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX)) + return -EINVAL; + + hist_field_print(m, hist_data->fields[idx]); + } + + if (sort_key->descending) + seq_puts(m, ".descending"); + } + + seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); + + if (data->filter_str) + seq_printf(m, " if %s", data->filter_str); + + if (data->paused) + seq_puts(m, " [paused]"); + else + seq_puts(m, " [active]"); + + seq_putc(m, '\n'); + + return 0; +} + +static int event_hist_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (!data->ref && hist_data->attrs->name) + save_named_trigger(hist_data->attrs->name, data); + + data->ref++; + + return 0; +} + +static void event_hist_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + if (data->name) + del_named_trigger(data); + trigger_data_free(data); + destroy_hist_data(hist_data); + } +} + +static struct event_trigger_ops event_hist_trigger_ops = { + .func = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_init, + .free = event_hist_trigger_free, +}; + +static int event_hist_trigger_named_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + data->ref++; + + save_named_trigger(data->named_data->name, data); + + event_hist_trigger_init(ops, data->named_data); + + return 0; +} + +static void event_hist_trigger_named_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + event_hist_trigger_free(ops, data->named_data); + + data->ref--; + if (!data->ref) { + del_named_trigger(data); + trigger_data_free(data); + } +} + +static struct event_trigger_ops event_hist_trigger_named_ops = { + .func = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_named_init, + .free = event_hist_trigger_named_free, +}; + +static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, + char *param) +{ + return &event_hist_trigger_ops; +} + +static void hist_clear(struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (data->name) + pause_named_trigger(data); + + synchronize_sched(); + + tracing_map_clear(hist_data->map); + + if (data->name) + unpause_named_trigger(data); +} + +static bool compatible_field(struct ftrace_event_field *field, + struct ftrace_event_field *test_field) +{ + if (field == test_field) + return true; + if (field == NULL || test_field == NULL) + return false; + if (strcmp(field->name, test_field->name) != 0) + return false; + if (strcmp(field->type, test_field->type) != 0) + return false; + if (field->size != test_field->size) + return false; + if (field->is_signed != test_field->is_signed) + return false; + + return true; +} + +static bool hist_trigger_match(struct event_trigger_data *data, + struct event_trigger_data *data_test, + struct event_trigger_data *named_data, + bool ignore_filter) +{ + struct tracing_map_sort_key *sort_key, *sort_key_test; + struct hist_trigger_data *hist_data, *hist_data_test; + struct hist_field *key_field, *key_field_test; + unsigned int i; + + if (named_data && (named_data != data_test) && + (named_data != data_test->named_data)) + return false; + + if (!named_data && is_named_trigger(data_test)) + return false; + + hist_data = data->private_data; + hist_data_test = data_test->private_data; + + if (hist_data->n_vals != hist_data_test->n_vals || + hist_data->n_fields != hist_data_test->n_fields || + hist_data->n_sort_keys != hist_data_test->n_sort_keys) + return false; + + if (!ignore_filter) { + if ((data->filter_str && !data_test->filter_str) || + (!data->filter_str && data_test->filter_str)) + return false; + } + + for_each_hist_field(i, hist_data) { + key_field = hist_data->fields[i]; + key_field_test = hist_data_test->fields[i]; + + if (key_field->flags != key_field_test->flags) + return false; + if (!compatible_field(key_field->field, key_field_test->field)) + return false; + if (key_field->offset != key_field_test->offset) + return false; + } + + for (i = 0; i < hist_data->n_sort_keys; i++) { + sort_key = &hist_data->sort_keys[i]; + sort_key_test = &hist_data_test->sort_keys[i]; + + if (sort_key->field_idx != sort_key_test->field_idx || + sort_key->descending != sort_key_test->descending) + return false; + } + + if (!ignore_filter && data->filter_str && + (strcmp(data->filter_str, data_test->filter_str) != 0)) + return false; + + return true; +} + +static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + int ret = 0; + + if (hist_data->attrs->name) { + named_data = find_named_trigger(hist_data->attrs->name); + if (named_data) { + if (!hist_trigger_match(data, named_data, named_data, + true)) { + ret = -EINVAL; + goto out; + } + } + } + + if (hist_data->attrs->name && !named_data) + goto new; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + if (hist_data->attrs->pause) + test->paused = true; + else if (hist_data->attrs->cont) + test->paused = false; + else if (hist_data->attrs->clear) + hist_clear(test); + else + ret = -EEXIST; + goto out; + } + } + new: + if (hist_data->attrs->cont || hist_data->attrs->clear) { + ret = -ENOENT; + goto out; + } + + if (hist_data->attrs->pause) + data->paused = true; + + if (named_data) { + destroy_hist_data(data->private_data); + data->private_data = named_data->private_data; + set_named_trigger_data(data, named_data); + data->ops = &event_hist_trigger_named_ops; + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + update_cond_flag(file); + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + update_cond_flag(file); + ret--; + } + out: + return ret; +} + +static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + bool unregistered = false; + + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + unregistered = true; + list_del_rcu(&test->list); + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + break; + } + } + + if (unregistered && test->ops->free) + test->ops->free(test->ops, test); +} + +static void hist_unreg_all(struct trace_event_file *file) +{ + struct event_trigger_data *test, *n; + + list_for_each_entry_safe(test, n, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + list_del_rcu(&test->list); + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + if (test->ops->free) + test->ops->free(test->ops, test); + } + } +} + +static int event_hist_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) +{ + unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; + struct event_trigger_data *trigger_data; + struct hist_trigger_attrs *attrs; + struct event_trigger_ops *trigger_ops; + struct hist_trigger_data *hist_data; + char *trigger; + int ret = 0; + + if (!param) + return -EINVAL; + + /* separate the trigger from the filter (k:v [if filter]) */ + trigger = strsep(¶m, " \t"); + if (!trigger) + return -EINVAL; + + attrs = parse_hist_trigger_attrs(trigger); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + + if (attrs->map_bits) + hist_trigger_bits = attrs->map_bits; + + hist_data = create_hist_data(hist_trigger_bits, attrs, file); + if (IS_ERR(hist_data)) { + destroy_hist_trigger_attrs(attrs); + return PTR_ERR(hist_data); + } + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out_free; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + + INIT_LIST_HEAD(&trigger_data->list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + trigger_data->private_data = hist_data; + + /* if param is non-empty, it's supposed to be a filter */ + if (param && cmd_ops->set_filter) { + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + } + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + ret = 0; + goto out_free; + } + + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of triggers registered, + * but if it didn't register any it returns zero. Consider no + * triggers registered a failure too. + */ + if (!ret) { + if (!(attrs->pause || attrs->cont || attrs->clear)) + ret = -ENOENT; + goto out_free; + } else if (ret < 0) + goto out_free; + /* Just return zero, not the number of registered triggers */ + ret = 0; + out: + return ret; + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + + kfree(trigger_data); + + destroy_hist_data(hist_data); + goto out; +} + +static struct event_command trigger_hist_cmd = { + .name = "hist", + .trigger_type = ETT_EVENT_HIST, + .flags = EVENT_CMD_FL_NEEDS_REC, + .func = event_hist_trigger_func, + .reg = hist_register_trigger, + .unreg = hist_unregister_trigger, + .unreg_all = hist_unreg_all, + .get_trigger_ops = event_hist_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +__init int register_trigger_hist_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_cmd); + WARN_ON(ret < 0); + + return ret; +} + +static void +hist_enable_trigger(struct event_trigger_data *data, void *rec) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &enable_data->file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (enable_data->enable) + test->paused = false; + else + test->paused = true; + } + } +} + +static void +hist_enable_count_trigger(struct event_trigger_data *data, void *rec) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + hist_enable_trigger(data, rec); +} + +static struct event_trigger_ops hist_enable_trigger_ops = { + .func = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_enable_count_trigger_ops = { + .func = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_disable_trigger_ops = { + .func = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_disable_count_trigger_ops = { + .func = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops * +hist_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + + enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); + + if (enable) + ops = param ? &hist_enable_count_trigger_ops : + &hist_enable_trigger_ops; + else + ops = param ? &hist_disable_count_trigger_ops : + &hist_disable_trigger_ops; + + return ops; +} + +static void hist_enable_unreg_all(struct trace_event_file *file) +{ + struct event_trigger_data *test, *n; + + list_for_each_entry_safe(test, n, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) { + list_del_rcu(&test->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + if (test->ops->free) + test->ops->free(test->ops, test); + } + } +} + +static struct event_command trigger_hist_enable_cmd = { + .name = ENABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .unreg_all = hist_enable_unreg_all, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_hist_disable_cmd = { + .name = DISABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .unreg_all = hist_enable_unreg_all, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_hist_enable_disable_cmds(void) +{ + unregister_event_command(&trigger_hist_enable_cmd); + unregister_event_command(&trigger_hist_disable_cmd); +} + +__init int register_trigger_hist_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_hist_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_hist_enable_disable_cmds(); + + return ret; +} diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d67992f3bb0e..a975571cde24 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -347,7 +347,7 @@ __init int register_event_command(struct event_command *cmd) * Currently we only unregister event commands from __init, so mark * this __init too. */ -static __init int unregister_event_command(struct event_command *cmd) +__init int unregister_event_command(struct event_command *cmd) { struct event_command *p, *n; int ret = -ENODEV; @@ -641,6 +641,7 @@ event_trigger_callback(struct event_command *cmd_ops, trigger_data->ops = trigger_ops; trigger_data->cmd_ops = cmd_ops; INIT_LIST_HEAD(&trigger_data->list); + INIT_LIST_HEAD(&trigger_data->named_list); if (glob[0] == '!') { cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); @@ -764,6 +765,148 @@ int set_trigger_filter(char *filter_str, return ret; } +static LIST_HEAD(named_triggers); + +/** + * find_named_trigger - Find the common named trigger associated with @name + * @name: The name of the set of named triggers to find the common data for + * + * Named triggers are sets of triggers that share a common set of + * trigger data. The first named trigger registered with a given name + * owns the common trigger data that the others subsequently + * registered with the same name will reference. This function + * returns the common trigger data associated with that first + * registered instance. + * + * Return: the common trigger data for the given named trigger on + * success, NULL otherwise. + */ +struct event_trigger_data *find_named_trigger(const char *name) +{ + struct event_trigger_data *data; + + if (!name) + return NULL; + + list_for_each_entry(data, &named_triggers, named_list) { + if (data->named_data) + continue; + if (strcmp(data->name, name) == 0) + return data; + } + + return NULL; +} + +/** + * is_named_trigger - determine if a given trigger is a named trigger + * @test: The trigger data to test + * + * Return: true if 'test' is a named trigger, false otherwise. + */ +bool is_named_trigger(struct event_trigger_data *test) +{ + struct event_trigger_data *data; + + list_for_each_entry(data, &named_triggers, named_list) { + if (test == data) + return true; + } + + return false; +} + +/** + * save_named_trigger - save the trigger in the named trigger list + * @name: The name of the named trigger set + * @data: The trigger data to save + * + * Return: 0 if successful, negative error otherwise. + */ +int save_named_trigger(const char *name, struct event_trigger_data *data) +{ + data->name = kstrdup(name, GFP_KERNEL); + if (!data->name) + return -ENOMEM; + + list_add(&data->named_list, &named_triggers); + + return 0; +} + +/** + * del_named_trigger - delete a trigger from the named trigger list + * @data: The trigger data to delete + */ +void del_named_trigger(struct event_trigger_data *data) +{ + kfree(data->name); + data->name = NULL; + + list_del(&data->named_list); +} + +static void __pause_named_trigger(struct event_trigger_data *data, bool pause) +{ + struct event_trigger_data *test; + + list_for_each_entry(test, &named_triggers, named_list) { + if (strcmp(test->name, data->name) == 0) { + if (pause) { + test->paused_tmp = test->paused; + test->paused = true; + } else { + test->paused = test->paused_tmp; + } + } + } +} + +/** + * pause_named_trigger - Pause all named triggers with the same name + * @data: The trigger data of a named trigger to pause + * + * Pauses a named trigger along with all other triggers having the + * same name. Because named triggers share a common set of data, + * pausing only one is meaningless, so pausing one named trigger needs + * to pause all triggers with the same name. + */ +void pause_named_trigger(struct event_trigger_data *data) +{ + __pause_named_trigger(data, true); +} + +/** + * unpause_named_trigger - Un-pause all named triggers with the same name + * @data: The trigger data of a named trigger to unpause + * + * Un-pauses a named trigger along with all other triggers having the + * same name. Because named triggers share a common set of data, + * unpausing only one is meaningless, so unpausing one named trigger + * needs to unpause all triggers with the same name. + */ +void unpause_named_trigger(struct event_trigger_data *data) +{ + __pause_named_trigger(data, false); +} + +/** + * set_named_trigger_data - Associate common named trigger data + * @data: The trigger data of a named trigger to unpause + * + * Named triggers are sets of triggers that share a common set of + * trigger data. The first named trigger registered with a given name + * owns the common trigger data that the others subsequently + * registered with the same name will reference. This function + * associates the common trigger data from the first trigger with the + * given trigger. + */ +void set_named_trigger_data(struct event_trigger_data *data, + struct event_trigger_data *named_data) +{ + data->named_data = named_data; +} + static void traceon_trigger(struct event_trigger_data *data, void *rec) { @@ -1062,15 +1205,6 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) unregister_event_command(&trigger_traceoff_cmd); } -/* Avoid typos */ -#define ENABLE_EVENT_STR "enable_event" -#define DISABLE_EVENT_STR "disable_event" - -struct enable_trigger_data { - struct trace_event_file *file; - bool enable; -}; - static void event_enable_trigger(struct event_trigger_data *data, void *rec) { @@ -1100,14 +1234,16 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec) event_enable_trigger(data, rec); } -static int -event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; seq_printf(m, "%s:%s:%s", - enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + enable_data->hist ? + (enable_data->enable ? ENABLE_HIST_STR : DISABLE_HIST_STR) : + (enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR), enable_data->file->event_call->class->system, trace_event_name(enable_data->file->event_call)); @@ -1124,9 +1260,8 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, return 0; } -static void -event_enable_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +void event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1171,10 +1306,9 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { .free = event_enable_trigger_free, }; -static int -event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +int event_enable_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; @@ -1183,6 +1317,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, struct trace_array *tr = file->tr; const char *system; const char *event; + bool hist = false; char *trigger; char *number; bool enable; @@ -1207,8 +1342,15 @@ event_enable_trigger_func(struct event_command *cmd_ops, if (!event_enable_file) goto out; - enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#ifdef CONFIG_HIST_TRIGGERS + hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) || + (strcmp(cmd, DISABLE_HIST_STR) == 0)); + enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#endif trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); ret = -ENOMEM; @@ -1228,6 +1370,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, INIT_LIST_HEAD(&trigger_data->list); RCU_INIT_POINTER(trigger_data->filter, NULL); + enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; trigger_data->private_data = enable_data; @@ -1305,10 +1448,10 @@ event_enable_trigger_func(struct event_command *cmd_ops, goto out; } -static int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) { struct enable_trigger_data *enable_data = data->private_data; struct enable_trigger_data *test_enable_data; @@ -1318,6 +1461,8 @@ static int event_enable_register_trigger(char *glob, list_for_each_entry_rcu(test, &file->triggers, list) { test_enable_data = test->private_data; if (test_enable_data && + (test->cmd_ops->trigger_type == + data->cmd_ops->trigger_type) && (test_enable_data->file == enable_data->file)) { ret = -EEXIST; goto out; @@ -1343,10 +1488,10 @@ out: return ret; } -static void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct enable_trigger_data *test_enable_data = test->private_data; struct enable_trigger_data *enable_data; @@ -1356,6 +1501,8 @@ static void event_enable_unregister_trigger(char *glob, list_for_each_entry_rcu(data, &file->triggers, list) { enable_data = data->private_data; if (enable_data && + (data->cmd_ops->trigger_type == + test->cmd_ops->trigger_type) && (enable_data->file == test_enable_data->file)) { unregistered = true; list_del_rcu(&data->list); @@ -1375,8 +1522,12 @@ event_enable_get_trigger_ops(char *cmd, char *param) struct event_trigger_ops *ops; bool enable; +#ifdef CONFIG_HIST_TRIGGERS + enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; - +#endif if (enable) ops = param ? &event_enable_count_trigger_ops : &event_enable_trigger_ops; @@ -1447,6 +1598,8 @@ __init int register_trigger_cmds(void) register_trigger_snapshot_cmd(); register_trigger_stacktrace_cmd(); register_trigger_enable_disable_cmds(); + register_trigger_hist_enable_disable_cmds(); + register_trigger_hist_cmd(); return 0; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5a095c2e4b69..0efa00d80623 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -43,7 +43,7 @@ static int allocate_ftrace_ops(struct trace_array *tr) /* Currently only the non stack verision is supported */ ops->func = function_trace_call; - ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; + ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; tr->ops = ops; ops->private = tr; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3a0244ff7ea8..7363ccf79512 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -319,7 +319,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - if (!ftrace_trace_task(current)) + if (!ftrace_trace_task(tr)) return 0; /* trace it when it is-nested-in or is a function enabled. */ @@ -338,6 +338,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (ftrace_graph_notrace_addr(trace->func)) return 1; + /* + * Stop here if tracing_threshold is set. We only write function return + * events to the ring buffer. + */ + if (tracing_thresh) + return 1; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -355,14 +362,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } -static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) -{ - if (tracing_thresh) - return 1; - else - return trace_graph_entry(trace); -} - static void __trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long flags, int pc) @@ -457,7 +456,7 @@ static int graph_trace_init(struct trace_array *tr) set_graph_array(tr); if (tracing_thresh) ret = register_ftrace_graph(&trace_graph_thresh_return, - &trace_graph_thresh_entry); + &trace_graph_entry); else ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 919e0ddd8fcc..9aedb0b06683 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -587,6 +587,7 @@ static int create_trace_kprobe(int argc, char **argv) * $retval : fetch return value * $stack : fetch stack address * $stackN : fetch Nth of stack (N:0-) + * $comm : fetch current task comm * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG @@ -1149,14 +1150,15 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) return; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1184,14 +1186,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) return; entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 68f376ca6d3f..cd7480d0a201 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -68,19 +68,15 @@ static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", dev->bus->number, dev->devfn, dev->vendor, dev->device, dev->irq); - /* - * XXX: is pci_resource_to_user() appropriate, since we are - * supposed to interpret the __ioremap() phys_addr argument based on - * these printed values? - */ for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + start = dev->resource[i].start; trace_seq_printf(s, " %llx", (unsigned long long)(start | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); } for (i = 0; i < 7; i++) { - pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + start = dev->resource[i].start; + end = dev->resource[i].end; trace_seq_printf(s, " %llx", dev->resource[i].start < dev->resource[i].end ? (unsigned long long)(end - start) + 1 : 0); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index f96f0383f6c6..ad1d6164e946 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -36,6 +36,10 @@ struct trace_bprintk_fmt { static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { struct trace_bprintk_fmt *pos; + + if (!fmt) + return ERR_PTR(-EINVAL); + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { if (!strcmp(pos->fmt, fmt)) return pos; @@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) for (iter = start; iter < end; iter++) { struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); if (tb_fmt) { - *iter = tb_fmt->fmt; + if (!IS_ERR(tb_fmt)) + *iter = tb_fmt->fmt; continue; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1d372fa6fefb..74e80a582c28 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -218,6 +218,28 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) kfree(data); } +void FETCH_FUNC_NAME(comm, string)(struct pt_regs *regs, + void *data, void *dest) +{ + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + long ret; + + if (!maxlen) + return; + + ret = strlcpy(dst, current->comm, maxlen); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string)); + +void FETCH_FUNC_NAME(comm, string_size)(struct pt_regs *regs, + void *data, void *dest) +{ + *(u32 *)dest = strlen(current->comm) + 1; +} +NOKPROBE_SYMBOL(FETCH_FUNC_NAME(comm, string_size)); + static const struct fetch_type *find_fetch_type(const char *type, const struct fetch_type *ftbl) { @@ -348,6 +370,11 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } } else ret = -EINVAL; + } else if (strcmp(arg, "comm") == 0) { + if (strcmp(t->name, "string") != 0 && + strcmp(t->name, "string_size") != 0) + return -EINVAL; + f->fn = t->fetch[FETCH_MTD_comm]; } else ret = -EINVAL; @@ -522,6 +549,12 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, arg[t - parg->comm] = '\0'; t++; } + /* + * The default type of $comm should be "string", and it can't be + * dereferenced. + */ + if (!t && strcmp(arg, "$comm") == 0) + t = "string"; parg->type = find_fetch_type(t, ftbl); if (!parg->type) { pr_info("Unsupported type: %s\n", t); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index f6398db09114..45400ca5ded1 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -102,6 +102,7 @@ enum { FETCH_MTD_reg = 0, FETCH_MTD_stack, FETCH_MTD_retval, + FETCH_MTD_comm, FETCH_MTD_memory, FETCH_MTD_symbol, FETCH_MTD_deref, @@ -183,6 +184,14 @@ DECLARE_BASIC_FETCH_FUNCS(bitfield); #define fetch_bitfield_string NULL #define fetch_bitfield_string_size NULL +/* comm only makes sense as a string */ +#define fetch_comm_u8 NULL +#define fetch_comm_u16 NULL +#define fetch_comm_u32 NULL +#define fetch_comm_u64 NULL +DECLARE_FETCH_FUNC(comm, string); +DECLARE_FETCH_FUNC(comm, string_size); + /* * Define macro for basic types - we don't need to define s* types, because * we have to care only about bitwidth at recording time. @@ -213,6 +222,7 @@ DEFINE_FETCH_##method(u64) ASSIGN_FETCH_FUNC(reg, ftype), \ ASSIGN_FETCH_FUNC(stack, ftype), \ ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(comm, ftype), \ ASSIGN_FETCH_FUNC(memory, ftype), \ ASSIGN_FETCH_FUNC(symbol, ftype), \ ASSIGN_FETCH_FUNC(deref, ftype), \ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e78f364cc192..b2b6efc083a4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -587,15 +587,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->event.type, NULL, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(rec, size, rctx, + sys_data->enter_event->event.type, 1, regs, + head, NULL); } static int perf_sysenter_enable(struct trace_event_call *call) @@ -660,14 +661,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->event.type, NULL, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, + 1, regs, head, NULL); } static int perf_sysexit_enable(struct trace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7915142c89e4..c53485441c88 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1131,7 +1131,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (hlist_empty(head)) goto out; - entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) goto out; @@ -1152,7 +1152,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, memset(data + len, 0, size - esize - len); } - perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); out: preempt_enable(); } diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c new file mode 100644 index 000000000000..0a689bbb78ef --- /dev/null +++ b/kernel/trace/tracing_map.c @@ -0,0 +1,1062 @@ +/* + * tracing_map - lock-free map for tracing + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com> + * + * tracing_map implementation inspired by lock-free map algorithms + * originated by Dr. Cliff Click: + * + * http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable + * http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf + */ + +#include <linux/vmalloc.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/sort.h> + +#include "tracing_map.h" +#include "trace.h" + +/* + * NOTE: For a detailed description of the data structures used by + * these functions (such as tracing_map_elt) please see the overview + * of tracing_map data structures at the beginning of tracing_map.h. + */ + +/** + * tracing_map_update_sum - Add a value to a tracing_map_elt's sum field + * @elt: The tracing_map_elt + * @i: The index of the given sum associated with the tracing_map_elt + * @n: The value to add to the sum + * + * Add n to sum i associated with the specified tracing_map_elt + * instance. The index i is the index returned by the call to + * tracing_map_add_sum_field() when the tracing map was set up. + */ +void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n) +{ + atomic64_add(n, &elt->fields[i].sum); +} + +/** + * tracing_map_read_sum - Return the value of a tracing_map_elt's sum field + * @elt: The tracing_map_elt + * @i: The index of the given sum associated with the tracing_map_elt + * + * Retrieve the value of the sum i associated with the specified + * tracing_map_elt instance. The index i is the index returned by the + * call to tracing_map_add_sum_field() when the tracing map was set + * up. + * + * Return: The sum associated with field i for elt. + */ +u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i) +{ + return (u64)atomic64_read(&elt->fields[i].sum); +} + +int tracing_map_cmp_string(void *val_a, void *val_b) +{ + char *a = val_a; + char *b = val_b; + + return strcmp(a, b); +} + +int tracing_map_cmp_none(void *val_a, void *val_b) +{ + return 0; +} + +static int tracing_map_cmp_atomic64(void *val_a, void *val_b) +{ + u64 a = atomic64_read((atomic64_t *)val_a); + u64 b = atomic64_read((atomic64_t *)val_b); + + return (a > b) ? 1 : ((a < b) ? -1 : 0); +} + +#define DEFINE_TRACING_MAP_CMP_FN(type) \ +static int tracing_map_cmp_##type(void *val_a, void *val_b) \ +{ \ + type a = *(type *)val_a; \ + type b = *(type *)val_b; \ + \ + return (a > b) ? 1 : ((a < b) ? -1 : 0); \ +} + +DEFINE_TRACING_MAP_CMP_FN(s64); +DEFINE_TRACING_MAP_CMP_FN(u64); +DEFINE_TRACING_MAP_CMP_FN(s32); +DEFINE_TRACING_MAP_CMP_FN(u32); +DEFINE_TRACING_MAP_CMP_FN(s16); +DEFINE_TRACING_MAP_CMP_FN(u16); +DEFINE_TRACING_MAP_CMP_FN(s8); +DEFINE_TRACING_MAP_CMP_FN(u8); + +tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size, + int field_is_signed) +{ + tracing_map_cmp_fn_t fn = tracing_map_cmp_none; + + switch (field_size) { + case 8: + if (field_is_signed) + fn = tracing_map_cmp_s64; + else + fn = tracing_map_cmp_u64; + break; + case 4: + if (field_is_signed) + fn = tracing_map_cmp_s32; + else + fn = tracing_map_cmp_u32; + break; + case 2: + if (field_is_signed) + fn = tracing_map_cmp_s16; + else + fn = tracing_map_cmp_u16; + break; + case 1: + if (field_is_signed) + fn = tracing_map_cmp_s8; + else + fn = tracing_map_cmp_u8; + break; + } + + return fn; +} + +static int tracing_map_add_field(struct tracing_map *map, + tracing_map_cmp_fn_t cmp_fn) +{ + int ret = -EINVAL; + + if (map->n_fields < TRACING_MAP_FIELDS_MAX) { + ret = map->n_fields; + map->fields[map->n_fields++].cmp_fn = cmp_fn; + } + + return ret; +} + +/** + * tracing_map_add_sum_field - Add a field describing a tracing_map sum + * @map: The tracing_map + * + * Add a sum field to the key and return the index identifying it in + * the map and associated tracing_map_elts. This is the index used + * for instance to update a sum for a particular tracing_map_elt using + * tracing_map_update_sum() or reading it via tracing_map_read_sum(). + * + * Return: The index identifying the field in the map and associated + * tracing_map_elts, or -EINVAL on error. + */ +int tracing_map_add_sum_field(struct tracing_map *map) +{ + return tracing_map_add_field(map, tracing_map_cmp_atomic64); +} + +/** + * tracing_map_add_key_field - Add a field describing a tracing_map key + * @map: The tracing_map + * @offset: The offset within the key + * @cmp_fn: The comparison function that will be used to sort on the key + * + * Let the map know there is a key and that if it's used as a sort key + * to use cmp_fn. + * + * A key can be a subset of a compound key; for that purpose, the + * offset param is used to describe where within the the compound key + * the key referenced by this key field resides. + * + * Return: The index identifying the field in the map and associated + * tracing_map_elts, or -EINVAL on error. + */ +int tracing_map_add_key_field(struct tracing_map *map, + unsigned int offset, + tracing_map_cmp_fn_t cmp_fn) + +{ + int idx = tracing_map_add_field(map, cmp_fn); + + if (idx < 0) + return idx; + + map->fields[idx].offset = offset; + + map->key_idx[map->n_keys++] = idx; + + return idx; +} + +void tracing_map_array_clear(struct tracing_map_array *a) +{ + unsigned int i; + + if (!a->pages) + return; + + for (i = 0; i < a->n_pages; i++) + memset(a->pages[i], 0, PAGE_SIZE); +} + +void tracing_map_array_free(struct tracing_map_array *a) +{ + unsigned int i; + + if (!a) + return; + + if (!a->pages) { + kfree(a); + return; + } + + for (i = 0; i < a->n_pages; i++) { + if (!a->pages[i]) + break; + free_page((unsigned long)a->pages[i]); + } +} + +struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, + unsigned int entry_size) +{ + struct tracing_map_array *a; + unsigned int i; + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return NULL; + + a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1); + a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift); + a->n_pages = n_elts / a->entries_per_page; + if (!a->n_pages) + a->n_pages = 1; + a->entry_shift = fls(a->entries_per_page) - 1; + a->entry_mask = (1 << a->entry_shift) - 1; + + a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL); + if (!a->pages) + goto free; + + for (i = 0; i < a->n_pages; i++) { + a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!a->pages[i]) + goto free; + } + out: + return a; + free: + tracing_map_array_free(a); + a = NULL; + + goto out; +} + +static void tracing_map_elt_clear(struct tracing_map_elt *elt) +{ + unsigned i; + + for (i = 0; i < elt->map->n_fields; i++) + if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64) + atomic64_set(&elt->fields[i].sum, 0); + + if (elt->map->ops && elt->map->ops->elt_clear) + elt->map->ops->elt_clear(elt); +} + +static void tracing_map_elt_init_fields(struct tracing_map_elt *elt) +{ + unsigned int i; + + tracing_map_elt_clear(elt); + + for (i = 0; i < elt->map->n_fields; i++) { + elt->fields[i].cmp_fn = elt->map->fields[i].cmp_fn; + + if (elt->fields[i].cmp_fn != tracing_map_cmp_atomic64) + elt->fields[i].offset = elt->map->fields[i].offset; + } +} + +static void tracing_map_elt_free(struct tracing_map_elt *elt) +{ + if (!elt) + return; + + if (elt->map->ops && elt->map->ops->elt_free) + elt->map->ops->elt_free(elt); + kfree(elt->fields); + kfree(elt->key); + kfree(elt); +} + +static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) +{ + struct tracing_map_elt *elt; + int err = 0; + + elt = kzalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return ERR_PTR(-ENOMEM); + + elt->map = map; + + elt->key = kzalloc(map->key_size, GFP_KERNEL); + if (!elt->key) { + err = -ENOMEM; + goto free; + } + + elt->fields = kcalloc(map->n_fields, sizeof(*elt->fields), GFP_KERNEL); + if (!elt->fields) { + err = -ENOMEM; + goto free; + } + + tracing_map_elt_init_fields(elt); + + if (map->ops && map->ops->elt_alloc) { + err = map->ops->elt_alloc(elt); + if (err) + goto free; + } + return elt; + free: + tracing_map_elt_free(elt); + + return ERR_PTR(err); +} + +static struct tracing_map_elt *get_free_elt(struct tracing_map *map) +{ + struct tracing_map_elt *elt = NULL; + int idx; + + idx = atomic_inc_return(&map->next_elt); + if (idx < map->max_elts) { + elt = *(TRACING_MAP_ELT(map->elts, idx)); + if (map->ops && map->ops->elt_init) + map->ops->elt_init(elt); + } + + return elt; +} + +static void tracing_map_free_elts(struct tracing_map *map) +{ + unsigned int i; + + if (!map->elts) + return; + + for (i = 0; i < map->max_elts; i++) { + tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i))); + *(TRACING_MAP_ELT(map->elts, i)) = NULL; + } + + tracing_map_array_free(map->elts); + map->elts = NULL; +} + +static int tracing_map_alloc_elts(struct tracing_map *map) +{ + unsigned int i; + + map->elts = tracing_map_array_alloc(map->max_elts, + sizeof(struct tracing_map_elt *)); + if (!map->elts) + return -ENOMEM; + + for (i = 0; i < map->max_elts; i++) { + *(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map); + if (IS_ERR(*(TRACING_MAP_ELT(map->elts, i)))) { + *(TRACING_MAP_ELT(map->elts, i)) = NULL; + tracing_map_free_elts(map); + + return -ENOMEM; + } + } + + return 0; +} + +static inline bool keys_match(void *key, void *test_key, unsigned key_size) +{ + bool match = true; + + if (memcmp(key, test_key, key_size)) + match = false; + + return match; +} + +static inline struct tracing_map_elt * +__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) +{ + u32 idx, key_hash, test_key; + struct tracing_map_entry *entry; + + key_hash = jhash(key, map->key_size, 0); + if (key_hash == 0) + key_hash = 1; + idx = key_hash >> (32 - (map->map_bits + 1)); + + while (1) { + idx &= (map->map_size - 1); + entry = TRACING_MAP_ENTRY(map->map, idx); + test_key = entry->key; + + if (test_key && test_key == key_hash && entry->val && + keys_match(key, entry->val->key, map->key_size)) { + atomic64_inc(&map->hits); + return entry->val; + } + + if (!test_key) { + if (lookup_only) + break; + + if (!cmpxchg(&entry->key, 0, key_hash)) { + struct tracing_map_elt *elt; + + elt = get_free_elt(map); + if (!elt) { + atomic64_inc(&map->drops); + entry->key = 0; + break; + } + + memcpy(elt->key, key, map->key_size); + entry->val = elt; + atomic64_inc(&map->hits); + + return entry->val; + } + } + + idx++; + } + + return NULL; +} + +/** + * tracing_map_insert - Insert key and/or retrieve val from a tracing_map + * @map: The tracing_map to insert into + * @key: The key to insert + * + * Inserts a key into a tracing_map and creates and returns a new + * tracing_map_elt for it, or if the key has already been inserted by + * a previous call, returns the tracing_map_elt already associated + * with it. When the map was created, the number of elements to be + * allocated for the map was specified (internally maintained as + * 'max_elts' in struct tracing_map), and that number of + * tracing_map_elts was created by tracing_map_init(). This is the + * pre-allocated pool of tracing_map_elts that tracing_map_insert() + * will allocate from when adding new keys. Once that pool is + * exhausted, tracing_map_insert() is useless and will return NULL to + * signal that state. There are two user-visible tracing_map + * variables, 'hits' and 'drops', which are updated by this function. + * Every time an element is either successfully inserted or retrieved, + * the 'hits' value is incrememented. Every time an element insertion + * fails, the 'drops' value is incremented. + * + * This is a lock-free tracing map insertion function implementing a + * modified form of Cliff Click's basic insertion algorithm. It + * requires the table size be a power of two. To prevent any + * possibility of an infinite loop we always make the internal table + * size double the size of the requested table size (max_elts * 2). + * Likewise, we never reuse a slot or resize or delete elements - when + * we've reached max_elts entries, we simply return NULL once we've + * run out of entries. Readers can at any point in time traverse the + * tracing map and safely access the key/val pairs. + * + * Return: the tracing_map_elt pointer val associated with the key. + * If this was a newly inserted key, the val will be a newly allocated + * and associated tracing_map_elt pointer val. If the key wasn't + * found and the pool of tracing_map_elts has been exhausted, NULL is + * returned and no further insertions will succeed. + */ +struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key) +{ + return __tracing_map_insert(map, key, false); +} + +/** + * tracing_map_lookup - Retrieve val from a tracing_map + * @map: The tracing_map to perform the lookup on + * @key: The key to look up + * + * Looks up key in tracing_map and if found returns the matching + * tracing_map_elt. This is a lock-free lookup; see + * tracing_map_insert() for details on tracing_map and how it works. + * Every time an element is retrieved, the 'hits' value is + * incrememented. There is one user-visible tracing_map variable, + * 'hits', which is updated by this function. Every time an element + * is successfully retrieved, the 'hits' value is incrememented. The + * 'drops' value is never updated by this function. + * + * Return: the tracing_map_elt pointer val associated with the key. + * If the key wasn't found, NULL is returned. + */ +struct tracing_map_elt *tracing_map_lookup(struct tracing_map *map, void *key) +{ + return __tracing_map_insert(map, key, true); +} + +/** + * tracing_map_destroy - Destroy a tracing_map + * @map: The tracing_map to destroy + * + * Frees a tracing_map along with its associated array of + * tracing_map_elts. + * + * Callers should make sure there are no readers or writers actively + * reading or inserting into the map before calling this. + */ +void tracing_map_destroy(struct tracing_map *map) +{ + if (!map) + return; + + tracing_map_free_elts(map); + + tracing_map_array_free(map->map); + kfree(map); +} + +/** + * tracing_map_clear - Clear a tracing_map + * @map: The tracing_map to clear + * + * Resets the tracing map to a cleared or initial state. The + * tracing_map_elts are all cleared, and the array of struct + * tracing_map_entry is reset to an initialized state. + * + * Callers should make sure there are no writers actively inserting + * into the map before calling this. + */ +void tracing_map_clear(struct tracing_map *map) +{ + unsigned int i; + + atomic_set(&map->next_elt, -1); + atomic64_set(&map->hits, 0); + atomic64_set(&map->drops, 0); + + tracing_map_array_clear(map->map); + + for (i = 0; i < map->max_elts; i++) + tracing_map_elt_clear(*(TRACING_MAP_ELT(map->elts, i))); +} + +static void set_sort_key(struct tracing_map *map, + struct tracing_map_sort_key *sort_key) +{ + map->sort_key = *sort_key; +} + +/** + * tracing_map_create - Create a lock-free map and element pool + * @map_bits: The size of the map (2 ** map_bits) + * @key_size: The size of the key for the map in bytes + * @ops: Optional client-defined tracing_map_ops instance + * @private_data: Client data associated with the map + * + * Creates and sets up a map to contain 2 ** map_bits number of + * elements (internally maintained as 'max_elts' in struct + * tracing_map). Before using, map fields should be added to the map + * with tracing_map_add_sum_field() and tracing_map_add_key_field(). + * tracing_map_init() should then be called to allocate the array of + * tracing_map_elts, in order to avoid allocating anything in the map + * insertion path. The user-specified map size reflects the maximum + * number of elements that can be contained in the table requested by + * the user - internally we double that in order to keep the table + * sparse and keep collisions manageable. + * + * A tracing_map is a special-purpose map designed to aggregate or + * 'sum' one or more values associated with a specific object of type + * tracing_map_elt, which is attached by the map to a given key. + * + * tracing_map_create() sets up the map itself, and provides + * operations for inserting tracing_map_elts, but doesn't allocate the + * tracing_map_elts themselves, or provide a means for describing the + * keys or sums associated with the tracing_map_elts. All + * tracing_map_elts for a given map have the same set of sums and + * keys, which are defined by the client using the functions + * tracing_map_add_key_field() and tracing_map_add_sum_field(). Once + * the fields are defined, the pool of elements allocated for the map + * can be created, which occurs when the client code calls + * tracing_map_init(). + * + * When tracing_map_init() returns, tracing_map_elt elements can be + * inserted into the map using tracing_map_insert(). When called, + * tracing_map_insert() grabs a free tracing_map_elt from the pool, or + * finds an existing match in the map and in either case returns it. + * The client can then use tracing_map_update_sum() and + * tracing_map_read_sum() to update or read a given sum field for the + * tracing_map_elt. + * + * The client can at any point retrieve and traverse the current set + * of inserted tracing_map_elts in a tracing_map, via + * tracing_map_sort_entries(). Sorting can be done on any field, + * including keys. + * + * See tracing_map.h for a description of tracing_map_ops. + * + * Return: the tracing_map pointer if successful, ERR_PTR if not. + */ +struct tracing_map *tracing_map_create(unsigned int map_bits, + unsigned int key_size, + const struct tracing_map_ops *ops, + void *private_data) +{ + struct tracing_map *map; + unsigned int i; + + if (map_bits < TRACING_MAP_BITS_MIN || + map_bits > TRACING_MAP_BITS_MAX) + return ERR_PTR(-EINVAL); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return ERR_PTR(-ENOMEM); + + map->map_bits = map_bits; + map->max_elts = (1 << map_bits); + atomic_set(&map->next_elt, -1); + + map->map_size = (1 << (map_bits + 1)); + map->ops = ops; + + map->private_data = private_data; + + map->map = tracing_map_array_alloc(map->map_size, + sizeof(struct tracing_map_entry)); + if (!map->map) + goto free; + + map->key_size = key_size; + for (i = 0; i < TRACING_MAP_KEYS_MAX; i++) + map->key_idx[i] = -1; + out: + return map; + free: + tracing_map_destroy(map); + map = ERR_PTR(-ENOMEM); + + goto out; +} + +/** + * tracing_map_init - Allocate and clear a map's tracing_map_elts + * @map: The tracing_map to initialize + * + * Allocates a clears a pool of tracing_map_elts equal to the + * user-specified size of 2 ** map_bits (internally maintained as + * 'max_elts' in struct tracing_map). Before using, the map fields + * should be added to the map with tracing_map_add_sum_field() and + * tracing_map_add_key_field(). tracing_map_init() should then be + * called to allocate the array of tracing_map_elts, in order to avoid + * allocating anything in the map insertion path. The user-specified + * map size reflects the max number of elements requested by the user + * - internally we double that in order to keep the table sparse and + * keep collisions manageable. + * + * See tracing_map.h for a description of tracing_map_ops. + * + * Return: the tracing_map pointer if successful, ERR_PTR if not. + */ +int tracing_map_init(struct tracing_map *map) +{ + int err; + + if (map->n_fields < 2) + return -EINVAL; /* need at least 1 key and 1 val */ + + err = tracing_map_alloc_elts(map); + if (err) + return err; + + tracing_map_clear(map); + + return err; +} + +static int cmp_entries_dup(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + int ret = 0; + + if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size)) + ret = 1; + + return ret; +} + +static int cmp_entries_sum(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + const struct tracing_map_elt *elt_a, *elt_b; + struct tracing_map_sort_key *sort_key; + struct tracing_map_field *field; + tracing_map_cmp_fn_t cmp_fn; + void *val_a, *val_b; + int ret = 0; + + elt_a = (*a)->elt; + elt_b = (*b)->elt; + + sort_key = &elt_a->map->sort_key; + + field = &elt_a->fields[sort_key->field_idx]; + cmp_fn = field->cmp_fn; + + val_a = &elt_a->fields[sort_key->field_idx].sum; + val_b = &elt_b->fields[sort_key->field_idx].sum; + + ret = cmp_fn(val_a, val_b); + if (sort_key->descending) + ret = -ret; + + return ret; +} + +static int cmp_entries_key(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + const struct tracing_map_elt *elt_a, *elt_b; + struct tracing_map_sort_key *sort_key; + struct tracing_map_field *field; + tracing_map_cmp_fn_t cmp_fn; + void *val_a, *val_b; + int ret = 0; + + elt_a = (*a)->elt; + elt_b = (*b)->elt; + + sort_key = &elt_a->map->sort_key; + + field = &elt_a->fields[sort_key->field_idx]; + + cmp_fn = field->cmp_fn; + + val_a = elt_a->key + field->offset; + val_b = elt_b->key + field->offset; + + ret = cmp_fn(val_a, val_b); + if (sort_key->descending) + ret = -ret; + + return ret; +} + +static void destroy_sort_entry(struct tracing_map_sort_entry *entry) +{ + if (!entry) + return; + + if (entry->elt_copied) + tracing_map_elt_free(entry->elt); + + kfree(entry); +} + +/** + * tracing_map_destroy_sort_entries - Destroy an array of sort entries + * @entries: The entries to destroy + * @n_entries: The number of entries in the array + * + * Destroy the elements returned by a tracing_map_sort_entries() call. + */ +void tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries, + unsigned int n_entries) +{ + unsigned int i; + + for (i = 0; i < n_entries; i++) + destroy_sort_entry(entries[i]); + + vfree(entries); +} + +static struct tracing_map_sort_entry * +create_sort_entry(void *key, struct tracing_map_elt *elt) +{ + struct tracing_map_sort_entry *sort_entry; + + sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL); + if (!sort_entry) + return NULL; + + sort_entry->key = key; + sort_entry->elt = elt; + + return sort_entry; +} + +static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt) +{ + struct tracing_map_elt *dup_elt; + unsigned int i; + + dup_elt = tracing_map_elt_alloc(elt->map); + if (IS_ERR(dup_elt)) + return NULL; + + if (elt->map->ops && elt->map->ops->elt_copy) + elt->map->ops->elt_copy(dup_elt, elt); + + dup_elt->private_data = elt->private_data; + memcpy(dup_elt->key, elt->key, elt->map->key_size); + + for (i = 0; i < elt->map->n_fields; i++) { + atomic64_set(&dup_elt->fields[i].sum, + atomic64_read(&elt->fields[i].sum)); + dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn; + } + + return dup_elt; +} + +static int merge_dup(struct tracing_map_sort_entry **sort_entries, + unsigned int target, unsigned int dup) +{ + struct tracing_map_elt *target_elt, *elt; + bool first_dup = (target - dup) == 1; + int i; + + if (first_dup) { + elt = sort_entries[target]->elt; + target_elt = copy_elt(elt); + if (!target_elt) + return -ENOMEM; + sort_entries[target]->elt = target_elt; + sort_entries[target]->elt_copied = true; + } else + target_elt = sort_entries[target]->elt; + + elt = sort_entries[dup]->elt; + + for (i = 0; i < elt->map->n_fields; i++) + atomic64_add(atomic64_read(&elt->fields[i].sum), + &target_elt->fields[i].sum); + + sort_entries[dup]->dup = true; + + return 0; +} + +static int merge_dups(struct tracing_map_sort_entry **sort_entries, + int n_entries, unsigned int key_size) +{ + unsigned int dups = 0, total_dups = 0; + int err, i, j; + void *key; + + if (n_entries < 2) + return total_dups; + + sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))cmp_entries_dup, NULL); + + key = sort_entries[0]->key; + for (i = 1; i < n_entries; i++) { + if (!memcmp(sort_entries[i]->key, key, key_size)) { + dups++; total_dups++; + err = merge_dup(sort_entries, i - dups, i); + if (err) + return err; + continue; + } + key = sort_entries[i]->key; + dups = 0; + } + + if (!total_dups) + return total_dups; + + for (i = 0, j = 0; i < n_entries; i++) { + if (!sort_entries[i]->dup) { + sort_entries[j] = sort_entries[i]; + if (j++ != i) + sort_entries[i] = NULL; + } else { + destroy_sort_entry(sort_entries[i]); + sort_entries[i] = NULL; + } + } + + return total_dups; +} + +static bool is_key(struct tracing_map *map, unsigned int field_idx) +{ + unsigned int i; + + for (i = 0; i < map->n_keys; i++) + if (map->key_idx[i] == field_idx) + return true; + return false; +} + +static void sort_secondary(struct tracing_map *map, + const struct tracing_map_sort_entry **entries, + unsigned int n_entries, + struct tracing_map_sort_key *primary_key, + struct tracing_map_sort_key *secondary_key) +{ + int (*primary_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + int (*secondary_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + unsigned i, start = 0, n_sub = 1; + + if (is_key(map, primary_key->field_idx)) + primary_fn = cmp_entries_key; + else + primary_fn = cmp_entries_sum; + + if (is_key(map, secondary_key->field_idx)) + secondary_fn = cmp_entries_key; + else + secondary_fn = cmp_entries_sum; + + for (i = 0; i < n_entries - 1; i++) { + const struct tracing_map_sort_entry **a = &entries[i]; + const struct tracing_map_sort_entry **b = &entries[i + 1]; + + if (primary_fn(a, b) == 0) { + n_sub++; + if (i < n_entries - 2) + continue; + } + + if (n_sub < 2) { + start = i + 1; + n_sub = 1; + continue; + } + + set_sort_key(map, secondary_key); + sort(&entries[start], n_sub, + sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))secondary_fn, NULL); + set_sort_key(map, primary_key); + + start = i + 1; + n_sub = 1; + } +} + +/** + * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map + * @map: The tracing_map + * @sort_key: The sort key to use for sorting + * @sort_entries: outval: pointer to allocated and sorted array of entries + * + * tracing_map_sort_entries() sorts the current set of entries in the + * map and returns the list of tracing_map_sort_entries containing + * them to the client in the sort_entries param. The client can + * access the struct tracing_map_elt element of interest directly as + * the 'elt' field of a returned struct tracing_map_sort_entry object. + * + * The sort_key has only two fields: idx and descending. 'idx' refers + * to the index of the field added via tracing_map_add_sum_field() or + * tracing_map_add_key_field() when the tracing_map was initialized. + * 'descending' is a flag that if set reverses the sort order, which + * by default is ascending. + * + * The client should not hold on to the returned array but should use + * it and call tracing_map_destroy_sort_entries() when done. + * + * Return: the number of sort_entries in the struct tracing_map_sort_entry + * array, negative on error + */ +int tracing_map_sort_entries(struct tracing_map *map, + struct tracing_map_sort_key *sort_keys, + unsigned int n_sort_keys, + struct tracing_map_sort_entry ***sort_entries) +{ + int (*cmp_entries_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + struct tracing_map_sort_entry *sort_entry, **entries; + int i, n_entries, ret; + + entries = vmalloc(map->max_elts * sizeof(sort_entry)); + if (!entries) + return -ENOMEM; + + for (i = 0, n_entries = 0; i < map->map_size; i++) { + struct tracing_map_entry *entry; + + entry = TRACING_MAP_ENTRY(map->map, i); + + if (!entry->key || !entry->val) + continue; + + entries[n_entries] = create_sort_entry(entry->val->key, + entry->val); + if (!entries[n_entries++]) { + ret = -ENOMEM; + goto free; + } + } + + if (n_entries == 0) { + ret = 0; + goto free; + } + + if (n_entries == 1) { + *sort_entries = entries; + return 1; + } + + ret = merge_dups(entries, n_entries, map->key_size); + if (ret < 0) + goto free; + n_entries -= ret; + + if (is_key(map, sort_keys[0].field_idx)) + cmp_entries_fn = cmp_entries_key; + else + cmp_entries_fn = cmp_entries_sum; + + set_sort_key(map, &sort_keys[0]); + + sort(entries, n_entries, sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))cmp_entries_fn, NULL); + + if (n_sort_keys > 1) + sort_secondary(map, + (const struct tracing_map_sort_entry **)entries, + n_entries, + &sort_keys[0], + &sort_keys[1]); + + *sort_entries = entries; + + return n_entries; + free: + tracing_map_destroy_sort_entries(entries, n_entries); + + return ret; +} diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h new file mode 100644 index 000000000000..618838f5f30a --- /dev/null +++ b/kernel/trace/tracing_map.h @@ -0,0 +1,283 @@ +#ifndef __TRACING_MAP_H +#define __TRACING_MAP_H + +#define TRACING_MAP_BITS_DEFAULT 11 +#define TRACING_MAP_BITS_MAX 17 +#define TRACING_MAP_BITS_MIN 7 + +#define TRACING_MAP_KEYS_MAX 2 +#define TRACING_MAP_VALS_MAX 3 +#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ + TRACING_MAP_VALS_MAX) +#define TRACING_MAP_SORT_KEYS_MAX 2 + +typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); + +/* + * This is an overview of the tracing_map data structures and how they + * relate to the tracing_map API. The details of the algorithms + * aren't discussed here - this is just a general overview of the data + * structures and how they interact with the API. + * + * The central data structure of the tracing_map is an initially + * zeroed array of struct tracing_map_entry (stored in the map field + * of struct tracing_map). tracing_map_entry is a very simple data + * structure containing only two fields: a 32-bit unsigned 'key' + * variable and a pointer named 'val'. This array of struct + * tracing_map_entry is essentially a hash table which will be + * modified by a single function, tracing_map_insert(), but which can + * be traversed and read by a user at any time (though the user does + * this indirectly via an array of tracing_map_sort_entry - see the + * explanation of that data structure in the discussion of the + * sorting-related data structures below). + * + * The central function of the tracing_map API is + * tracing_map_insert(). tracing_map_insert() hashes the + * arbitrarily-sized key passed into it into a 32-bit unsigned key. + * It then uses this key, truncated to the array size, as an index + * into the array of tracing_map_entries. If the value of the 'key' + * field of the tracing_map_entry found at that location is 0, then + * that entry is considered to be free and can be claimed, by + * replacing the 0 in the 'key' field of the tracing_map_entry with + * the new 32-bit hashed key. Once claimed, that tracing_map_entry's + * 'val' field is then used to store a unique element which will be + * forever associated with that 32-bit hashed key in the + * tracing_map_entry. + * + * That unique element now in the tracing_map_entry's 'val' field is + * an instance of tracing_map_elt, where 'elt' in the latter part of + * that variable name is short for 'element'. The purpose of a + * tracing_map_elt is to hold values specific to the particular + * 32-bit hashed key it's assocated with. Things such as the unique + * set of aggregated sums associated with the 32-bit hashed key, along + * with a copy of the full key associated with the entry, and which + * was used to produce the 32-bit hashed key. + * + * When tracing_map_create() is called to create the tracing map, the + * user specifies (indirectly via the map_bits param, the details are + * unimportant for this discussion) the maximum number of elements + * that the map can hold (stored in the max_elts field of struct + * tracing_map). This is the maximum possible number of + * tracing_map_entries in the tracing_map_entry array which can be + * 'claimed' as described in the above discussion, and therefore is + * also the maximum number of tracing_map_elts that can be associated + * with the tracing_map_entry array in the tracing_map. Because of + * the way the insertion algorithm works, the size of the allocated + * tracing_map_entry array is always twice the maximum number of + * elements (2 * max_elts). This value is stored in the map_size + * field of struct tracing_map. + * + * Because tracing_map_insert() needs to work from any context, + * including from within the memory allocation functions themselves, + * both the tracing_map_entry array and a pool of max_elts + * tracing_map_elts are pre-allocated before any call is made to + * tracing_map_insert(). + * + * The tracing_map_entry array is allocated as a single block by + * tracing_map_create(). + * + * Because the tracing_map_elts are much larger objects and can't + * generally be allocated together as a single large array without + * failure, they're allocated individually, by tracing_map_init(). + * + * The pool of tracing_map_elts are allocated by tracing_map_init() + * rather than by tracing_map_create() because at the time + * tracing_map_create() is called, there isn't enough information to + * create the tracing_map_elts. Specifically,the user first needs to + * tell the tracing_map implementation how many fields the + * tracing_map_elts contain, and which types of fields they are (key + * or sum). The user does this via the tracing_map_add_sum_field() + * and tracing_map_add_key_field() functions, following which the user + * calls tracing_map_init() to finish up the tracing map setup. The + * array holding the pointers which make up the pre-allocated pool of + * tracing_map_elts is allocated as a single block and is stored in + * the elts field of struct tracing_map. + * + * There is also a set of structures used for sorting that might + * benefit from some minimal explanation. + * + * struct tracing_map_sort_key is used to drive the sort at any given + * time. By 'any given time' we mean that a different + * tracing_map_sort_key will be used at different times depending on + * whether the sort currently being performed is a primary or a + * secondary sort. + * + * The sort key is very simple, consisting of the field index of the + * tracing_map_elt field to sort on (which the user saved when adding + * the field), and whether the sort should be done in an ascending or + * descending order. + * + * For the convenience of the sorting code, a tracing_map_sort_entry + * is created for each tracing_map_elt, again individually allocated + * to avoid failures that might be expected if allocated as a single + * large array of struct tracing_map_sort_entry. + * tracing_map_sort_entry instances are the objects expected by the + * various internal sorting functions, and are also what the user + * ultimately receives after calling tracing_map_sort_entries(). + * Because it doesn't make sense for users to access an unordered and + * sparsely populated tracing_map directly, the + * tracing_map_sort_entries() function is provided so that users can + * retrieve a sorted list of all existing elements. In addition to + * the associated tracing_map_elt 'elt' field contained within the + * tracing_map_sort_entry, which is the object of interest to the + * user, tracing_map_sort_entry objects contain a number of additional + * fields which are used for caching and internal purposes and can + * safely be ignored. +*/ + +struct tracing_map_field { + tracing_map_cmp_fn_t cmp_fn; + union { + atomic64_t sum; + unsigned int offset; + }; +}; + +struct tracing_map_elt { + struct tracing_map *map; + struct tracing_map_field *fields; + void *key; + void *private_data; +}; + +struct tracing_map_entry { + u32 key; + struct tracing_map_elt *val; +}; + +struct tracing_map_sort_key { + unsigned int field_idx; + bool descending; +}; + +struct tracing_map_sort_entry { + void *key; + struct tracing_map_elt *elt; + bool elt_copied; + bool dup; +}; + +struct tracing_map_array { + unsigned int entries_per_page; + unsigned int entry_size_shift; + unsigned int entry_shift; + unsigned int entry_mask; + unsigned int n_pages; + void **pages; +}; + +#define TRACING_MAP_ARRAY_ELT(array, idx) \ + (array->pages[idx >> array->entry_shift] + \ + ((idx & array->entry_mask) << array->entry_size_shift)) + +#define TRACING_MAP_ENTRY(array, idx) \ + ((struct tracing_map_entry *)TRACING_MAP_ARRAY_ELT(array, idx)) + +#define TRACING_MAP_ELT(array, idx) \ + ((struct tracing_map_elt **)TRACING_MAP_ARRAY_ELT(array, idx)) + +struct tracing_map { + unsigned int key_size; + unsigned int map_bits; + unsigned int map_size; + unsigned int max_elts; + atomic_t next_elt; + struct tracing_map_array *elts; + struct tracing_map_array *map; + const struct tracing_map_ops *ops; + void *private_data; + struct tracing_map_field fields[TRACING_MAP_FIELDS_MAX]; + unsigned int n_fields; + int key_idx[TRACING_MAP_KEYS_MAX]; + unsigned int n_keys; + struct tracing_map_sort_key sort_key; + atomic64_t hits; + atomic64_t drops; +}; + +/** + * struct tracing_map_ops - callbacks for tracing_map + * + * The methods in this structure define callback functions for various + * operations on a tracing_map or objects related to a tracing_map. + * + * For a detailed description of tracing_map_elt objects please see + * the overview of tracing_map data structures at the beginning of + * this file. + * + * All the methods below are optional. + * + * @elt_alloc: When a tracing_map_elt is allocated, this function, if + * defined, will be called and gives clients the opportunity to + * allocate additional data and attach it to the element + * (tracing_map_elt->private_data is meant for that purpose). + * Element allocation occurs before tracing begins, when the + * tracing_map_init() call is made by client code. + * + * @elt_copy: At certain points in the lifetime of an element, it may + * need to be copied. The copy should include a copy of the + * client-allocated data, which can be copied into the 'to' + * element from the 'from' element. + * + * @elt_free: When a tracing_map_elt is freed, this function is called + * and allows client-allocated per-element data to be freed. + * + * @elt_clear: This callback allows per-element client-defined data to + * be cleared, if applicable. + * + * @elt_init: This callback allows per-element client-defined data to + * be initialized when used i.e. when the element is actually + * claimed by tracing_map_insert() in the context of the map + * insertion. + */ +struct tracing_map_ops { + int (*elt_alloc)(struct tracing_map_elt *elt); + void (*elt_copy)(struct tracing_map_elt *to, + struct tracing_map_elt *from); + void (*elt_free)(struct tracing_map_elt *elt); + void (*elt_clear)(struct tracing_map_elt *elt); + void (*elt_init)(struct tracing_map_elt *elt); +}; + +extern struct tracing_map * +tracing_map_create(unsigned int map_bits, + unsigned int key_size, + const struct tracing_map_ops *ops, + void *private_data); +extern int tracing_map_init(struct tracing_map *map); + +extern int tracing_map_add_sum_field(struct tracing_map *map); +extern int tracing_map_add_key_field(struct tracing_map *map, + unsigned int offset, + tracing_map_cmp_fn_t cmp_fn); + +extern void tracing_map_destroy(struct tracing_map *map); +extern void tracing_map_clear(struct tracing_map *map); + +extern struct tracing_map_elt * +tracing_map_insert(struct tracing_map *map, void *key); +extern struct tracing_map_elt * +tracing_map_lookup(struct tracing_map *map, void *key); + +extern tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size, + int field_is_signed); +extern int tracing_map_cmp_string(void *val_a, void *val_b); +extern int tracing_map_cmp_none(void *val_a, void *val_b); + +extern void tracing_map_update_sum(struct tracing_map_elt *elt, + unsigned int i, u64 n); +extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); +extern void tracing_map_set_field_descr(struct tracing_map *map, + unsigned int i, + unsigned int key_offset, + tracing_map_cmp_fn_t cmp_fn); +extern int +tracing_map_sort_entries(struct tracing_map *map, + struct tracing_map_sort_key *sort_keys, + unsigned int n_sort_keys, + struct tracing_map_sort_entry ***sort_entries); + +extern void +tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries, + unsigned int n_entries); +#endif /* __TRACING_MAP_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9bafc211930c..68f594212759 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns) return allowed; } +/* + * Returns true if @ns is the same namespace as or a descendant of + * @target_ns. + */ +bool current_in_userns(const struct user_namespace *target_ns) +{ + struct user_namespace *ns; + for (ns = current_user_ns(); ns; ns = ns->parent) { + if (ns == target_ns) + return true; + } + return false; +} + static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f5068e94003..ef071ca73fc3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -433,54 +433,28 @@ static void *work_debug_hint(void *addr) return ((struct work_struct *) addr)->func; } -/* - * fixup_init is called when: - * - an active object is initialized - */ -static int work_fixup_init(void *addr, enum debug_obj_state state) +static bool work_is_static_object(void *addr) { struct work_struct *work = addr; - switch (state) { - case ODEBUG_STATE_ACTIVE: - cancel_work_sync(work); - debug_object_init(work, &work_debug_descr); - return 1; - default: - return 0; - } + return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); } /* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * fixup_init is called when: + * - an active object is initialized */ -static int work_fixup_activate(void *addr, enum debug_obj_state state) +static bool work_fixup_init(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The work struct was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { - debug_object_init(work, &work_debug_descr); - debug_object_activate(work, &work_debug_descr); - return 0; - } - WARN_ON_ONCE(1); - return 0; - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - + cancel_work_sync(work); + debug_object_init(work, &work_debug_descr); + return true; default: - return 0; + return false; } } @@ -488,7 +462,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int work_fixup_free(void *addr, enum debug_obj_state state) +static bool work_fixup_free(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; @@ -496,17 +470,17 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_free(work, &work_debug_descr); - return 1; + return true; default: - return 0; + return false; } } static struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, + .is_static_object = work_is_static_object, .fixup_init = work_fixup_init, - .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, }; @@ -4395,8 +4369,8 @@ static void show_pwq(struct pool_workqueue *pwq) /** * show_workqueue_state - dump workqueue state * - * Called from a sysrq handler and prints out all busy workqueues and - * pools. + * Called from a sysrq handler or try_to_freeze_tasks() and prints out + * all busy workqueues and pools. */ void show_workqueue_state(void) { @@ -4626,95 +4600,72 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) return; - /* is @cpu the only online CPU? */ cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); - if (cpumask_weight(&cpumask) != 1) - return; /* as we're called from CPU_ONLINE, the following shouldn't fail */ for_each_pool_worker(worker, pool) - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, - pool->attrs->cpumask) < 0); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); } -/* - * Workqueues should be brought up before normal priority CPU notifiers. - * This will be registered high priority CPU notifier. - */ -static int workqueue_cpu_up_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +int workqueue_prepare_cpu(unsigned int cpu) +{ + struct worker_pool *pool; + + for_each_cpu_worker_pool(pool, cpu) { + if (pool->nr_workers) + continue; + if (!create_worker(pool)) + return -ENOMEM; + } + return 0; +} + +int workqueue_online_cpu(unsigned int cpu) { - int cpu = (unsigned long)hcpu; struct worker_pool *pool; struct workqueue_struct *wq; int pi; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - for_each_cpu_worker_pool(pool, cpu) { - if (pool->nr_workers) - continue; - if (!create_worker(pool)) - return NOTIFY_BAD; - } - break; - - case CPU_DOWN_FAILED: - case CPU_ONLINE: - mutex_lock(&wq_pool_mutex); + mutex_lock(&wq_pool_mutex); - for_each_pool(pool, pi) { - mutex_lock(&pool->attach_mutex); + for_each_pool(pool, pi) { + mutex_lock(&pool->attach_mutex); - if (pool->cpu == cpu) - rebind_workers(pool); - else if (pool->cpu < 0) - restore_unbound_workers_cpumask(pool, cpu); + if (pool->cpu == cpu) + rebind_workers(pool); + else if (pool->cpu < 0) + restore_unbound_workers_cpumask(pool, cpu); - mutex_unlock(&pool->attach_mutex); - } + mutex_unlock(&pool->attach_mutex); + } - /* update NUMA affinity of unbound workqueues */ - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, true); + /* update NUMA affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, true); - mutex_unlock(&wq_pool_mutex); - break; - } - return NOTIFY_OK; + mutex_unlock(&wq_pool_mutex); + return 0; } -/* - * Workqueues should be brought down after normal priority CPU notifiers. - * This will be registered as low priority CPU notifier. - */ -static int workqueue_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +int workqueue_offline_cpu(unsigned int cpu) { - int cpu = (unsigned long)hcpu; struct work_struct unbind_work; struct workqueue_struct *wq; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); - - /* update NUMA affinity of unbound workqueues */ - mutex_lock(&wq_pool_mutex); - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, false); - mutex_unlock(&wq_pool_mutex); - - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); - break; - } - return NOTIFY_OK; + /* unbinding per-cpu workers should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); + queue_work_on(cpu, system_highpri_wq, &unbind_work); + + /* update NUMA affinity of unbound workqueues */ + mutex_lock(&wq_pool_mutex); + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, false); + mutex_unlock(&wq_pool_mutex); + + /* wait for per-cpu unbinding to finish */ + flush_work(&unbind_work); + destroy_work_on_stack(&unbind_work); + return 0; } #ifdef CONFIG_SMP @@ -5516,9 +5467,6 @@ static int __init init_workqueues(void) pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); - cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); - hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); - wq_numa_init(); /* initialize CPU pools */ |
