Diffstat (limited to 'kernel')
107 files changed, 5543 insertions, 2336 deletions
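A large share of the kernel/bpf hunks below convert helpers from the raw `u64 fn(u64 r1, ..., u64 r5)` signature to the `BPF_CALL_0()`..`BPF_CALL_5()` macros from `<linux/filter.h>`. As orientation, here is a deliberately simplified two-argument sketch of the wrapper pattern such macros generate; it is not the kernel's actual macro (which covers zero to five arguments and handles inlining), and every `sketch_`-prefixed name is illustrative only. The point of the conversion is that the eBPF calling convention always passes five `u64` registers, and the macro emits the cast-and-forward boilerplate so the helper body can use typed parameters instead of open-coded casts.

```c
/* Simplified sketch of the BPF_CALL_x idea; the real macros live in
 * <linux/filter.h>.  Names prefixed with "sketch_" are hypothetical.
 */
typedef unsigned long long u64;

#define SKETCH_BPF_CALL_2(name, t1, a1, t2, a2)				\
	static u64 ____##name(t1 a1, t2 a2);				\
	u64 name(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)		\
	{								\
		/* cast the raw registers to the declared types */	\
		return ____##name((t1)(unsigned long)r1,		\
				  (t2)(unsigned long)r2);		\
	}								\
	static u64 ____##name(t1 a1, t2 a2)

struct sketch_map {
	void *(*lookup)(struct sketch_map *map, void *key);
};

/* The helper body sees typed arguments, mirroring how
 * bpf_map_lookup_elem() ends up looking in the helpers.c hunk below.
 */
SKETCH_BPF_CALL_2(sketch_map_lookup_elem, struct sketch_map *, map, void *, key)
{
	return (unsigned long)map->lookup(map, key);
}
```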
diff --git a/kernel/Makefile b/kernel/Makefile index e2ec54e2b952..eb26e12c6c2a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o + async.o range.o smpboot.o ucount.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/audit.c b/kernel/audit.c index a8a91bd2b2a9..f1ca11613379 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -877,6 +877,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } if (s.mask & AUDIT_STATUS_PID) { + /* NOTE: we are using task_tgid_vnr() below because + * the s.pid value is relative to the namespace + * of the caller; at present this doesn't matter + * much since you can really only run auditd + * from the initial pid namespace, but something + * to keep in mind if this changes */ int new_pid = s.pid; pid_t requesting_pid = task_tgid_vnr(current); @@ -1917,7 +1923,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(tsk), - task_pid_nr(tsk), + task_tgid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5abf1dc1f91c..2cd5256dbff7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -457,7 +457,7 @@ static int audit_filter_rules(struct task_struct *tsk, switch (f->type) { case AUDIT_PID: - pid = task_pid_nr(tsk); + pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: @@ -1993,7 +1993,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, loginuid = from_kuid(&init_user_ns, kloginuid), tty = audit_get_tty(current); - audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", oldloginuid, loginuid, tty ? 
tty_name(tty) : "(none)", @@ -2220,7 +2220,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; - context->target_pid = task_pid_nr(t); + context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2245,7 +2245,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = task_pid_nr(tsk); + audit_sig_pid = task_tgid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2345,7 +2345,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = current->audit_context; - context->capset.pid = task_pid_nr(current); + context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; @@ -2377,7 +2377,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); + audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 633a650d7aeb..a2ac051c342f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void) } late_initcall(register_perf_event_array_map); -#ifdef CONFIG_SOCK_CGROUP_DATA +#ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int fd) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 03fd23d4d587..aa6d98154106 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1018,7 +1018,7 @@ void bpf_user_rnd_init_once(void) prandom_init_once(&bpf_user_rnd_state); } -u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that @@ -1031,7 +1031,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) state = &get_cpu_var(bpf_user_rnd_state); res = prandom_u32_state(state); - put_cpu_var(state); + put_cpu_var(bpf_user_rnd_state); return res; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ea3afba1a4f..39918402e6e9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -16,6 +16,7 @@ #include <linux/ktime.h> #include <linux/sched.h> #include <linux/uidgid.h> +#include <linux/filter.h> /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -26,48 +27,32 @@ * if program is allowed to access maps, so check rcu_read_lock_held in * all three functions. 
*/ -static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - /* verifier checked that R1 contains a valid pointer to bpf_map - * and R2 points to a program stack and map->key_size bytes were - * initialized - */ - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value; - WARN_ON_ONCE(!rcu_read_lock_held()); - - value = map->ops->map_lookup_elem(map, key); - - /* lookup() returns either pointer to element value or NULL - * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type - */ - return (unsigned long) value; + return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; -static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, + void *, value, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - void *value = (void *) (unsigned long) r3; - WARN_ON_ONCE(!rcu_read_lock_held()); - - return map->ops->map_update_elem(map, key, value, r4); + return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -75,19 +60,16 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; - void *key = (void *) (unsigned long) r2; - WARN_ON_ONCE(!rcu_read_lock_held()); - return map->ops->map_delete_elem(map, key); } const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, @@ -99,7 +81,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } @@ -110,7 +92,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ return ktime_get_mono_fast_ns(); @@ -122,11 +104,11 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; - if (!task) + if (unlikely(!task)) return -EINVAL; return (u64) task->tgid << 32 | task->pid; @@ -138,18 +120,18 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; kgid_t gid; - if (!task) + if (unlikely(!task)) return 
-EINVAL; current_uid_gid(&uid, &gid); return (u64) from_kgid(&init_user_ns, gid) << 32 | - from_kuid(&init_user_ns, uid); + from_kuid(&init_user_ns, uid); } const struct bpf_func_proto bpf_get_current_uid_gid_proto = { @@ -158,10 +140,9 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .ret_type = RET_INTEGER, }; -static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; - char *buf = (char *) (long) r1; if (unlikely(!task)) goto err_clear; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index bf4495fcd25d..732ae16d12b7 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -116,10 +116,9 @@ free_smap: return ERR_PTR(err); } -u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index daea765d72e6..99a7e5b388f2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/bpf.h> +#include <linux/bpf_verifier.h> #include <linux/filter.h> #include <net/netlink.h> #include <linux/file.h> @@ -126,76 +127,16 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ -struct reg_state { - enum bpf_reg_type type; - union { - /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ - s64 imm; - - /* valid when type == PTR_TO_PACKET* */ - struct { - u32 id; - u16 off; - u16 range; - }; - - /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | - * PTR_TO_MAP_VALUE_OR_NULL - */ - struct bpf_map *map_ptr; - }; -}; - -enum bpf_stack_slot_type { - STACK_INVALID, /* nothing was stored in this stack slot */ - STACK_SPILL, /* register spilled into stack */ - STACK_MISC /* BPF program wrote some data into this slot */ -}; - -#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ - -/* state of the program: - * type of all registers and stack info - */ -struct verifier_state { - struct reg_state regs[MAX_BPF_REG]; - u8 stack_slot_type[MAX_BPF_STACK]; - struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; -}; - -/* linked list of verifier states used to prune search */ -struct verifier_state_list { - struct verifier_state state; - struct verifier_state_list *next; -}; - /* verifier_state + insn_idx are pushed to stack when branch is encountered */ -struct verifier_stack_elem { +struct bpf_verifier_stack_elem { /* verifer state is 'st' * before processing instruction 'insn_idx' * and after processing instruction 'prev_insn_idx' */ - struct verifier_state st; + struct bpf_verifier_state st; int insn_idx; int prev_insn_idx; - struct verifier_stack_elem *next; -}; - -#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ - -/* single container for all structs - * one verifier_env per bpf_check() call - */ -struct verifier_env { - struct bpf_prog *prog; /* eBPF program being verified */ - struct verifier_stack_elem *head; /* stack of verifier states to be processed */ - int stack_size; /* number of states to be processed */ - struct verifier_state cur_state; /* current verifier state */ - 
struct verifier_state_list **explored_states; /* search pruning optimization */ - struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ - u32 used_map_cnt; /* number of used maps */ - u32 id_gen; /* used to generate unique reg IDs */ - bool allow_ptr_leaks; + struct bpf_verifier_stack_elem *next; }; #define BPF_COMPLEXITY_LIMIT_INSNS 65536 @@ -204,6 +145,7 @@ struct verifier_env { struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; + bool pkt_access; int regno; int access_size; }; @@ -240,6 +182,7 @@ static const char * const reg_type_str[] = { [CONST_PTR_TO_MAP] = "map_ptr", [PTR_TO_MAP_VALUE] = "map_value", [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", + [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj", [FRAME_PTR] = "fp", [PTR_TO_STACK] = "fp", [CONST_IMM] = "imm", @@ -247,9 +190,9 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; -static void print_verifier_state(struct verifier_state *state) +static void print_verifier_state(struct bpf_verifier_state *state) { - struct reg_state *reg; + struct bpf_reg_state *reg; enum bpf_reg_type t; int i; @@ -267,10 +210,17 @@ static void print_verifier_state(struct verifier_state *state) else if (t == UNKNOWN_VALUE && reg->imm) verbose("%lld", reg->imm); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL) + t == PTR_TO_MAP_VALUE_OR_NULL || + t == PTR_TO_MAP_VALUE_ADJ) verbose("(ks=%d,vs=%d)", reg->map_ptr->key_size, reg->map_ptr->value_size); + if (reg->min_value != BPF_REGISTER_MIN_RANGE) + verbose(",min_value=%llu", + (unsigned long long)reg->min_value); + if (reg->max_value != BPF_REGISTER_MAX_RANGE) + verbose(",max_value=%llu", + (unsigned long long)reg->max_value); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) @@ -425,9 +375,9 @@ static void print_bpf_insn(struct bpf_insn *insn) } } -static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) { - struct verifier_stack_elem *elem; + struct bpf_verifier_stack_elem *elem; int insn_idx; if (env->head == NULL) @@ -444,12 +394,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx) return insn_idx; } -static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, - int prev_insn_idx) +static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) { - struct verifier_stack_elem *elem; + struct bpf_verifier_stack_elem *elem; - elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); + elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; @@ -475,13 +425,15 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void init_reg_state(struct reg_state *regs) +static void init_reg_state(struct bpf_reg_state *regs) { int i; for (i = 0; i < MAX_BPF_REG; i++) { regs[i].type = NOT_INIT; regs[i].imm = 0; + regs[i].min_value = BPF_REGISTER_MIN_RANGE; + regs[i].max_value = BPF_REGISTER_MAX_RANGE; } /* frame pointer */ @@ -491,20 +443,26 @@ static void init_reg_state(struct reg_state *regs) regs[BPF_REG_1].type = PTR_TO_CTX; } -static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) +static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; regs[regno].imm = 0; } +static void 
reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) +{ + regs[regno].min_value = BPF_REGISTER_MIN_RANGE; + regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ DST_OP_NO_MARK /* same as above, check only, don't mark */ }; -static int check_reg_arg(struct reg_state *regs, u32 regno, +static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, enum reg_arg_type t) { if (regno >= MAX_BPF_REG) { @@ -564,8 +522,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct verifier_state *state, int off, int size, - int value_regno) +static int check_stack_write(struct bpf_verifier_state *state, int off, + int size, int value_regno) { int i; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -590,7 +548,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, } else { /* regular write of data into stack */ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = - (struct reg_state) {}; + (struct bpf_reg_state) {}; for (i = 0; i < size; i++) state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; @@ -598,7 +556,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, return 0; } -static int check_stack_read(struct verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; @@ -639,7 +597,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size, } /* check read/write into map element returned by bpf_map_lookup_elem() */ -static int check_map_access(struct verifier_env *env, u32 regno, int off, +static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; @@ -654,24 +612,31 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff -static bool may_write_pkt_data(enum bpf_prog_type type) +static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, + const struct bpf_call_arg_meta *meta) { - switch (type) { + switch (env->prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + if (meta) + return meta->pkt_access; + + env->seen_direct_write = true; return true; default: return false; } } -static int check_packet_access(struct verifier_env *env, u32 regno, int off, +static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *reg = ®s[regno]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *reg = ®s[regno]; off += reg->off; - if (off < 0 || off + size > reg->range) { + if (off < 0 || size <= 0 || off + size > reg->range) { verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; @@ -680,9 +645,13 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, } /* check access to 'struct bpf_context' fields */ -static int check_ctx_access(struct verifier_env *env, int off, int size, +static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, enum bpf_access_type t, 
enum bpf_reg_type *reg_type) { + /* for analyzer ctx accesses are already validated and converted */ + if (env->analyzer_ops) + return 0; + if (env->prog->aux->ops->is_valid_access && env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { /* remember the offset of last byte accessed in ctx */ @@ -695,7 +664,7 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, return -EACCES; } -static bool is_pointer_value(struct verifier_env *env, int regno) +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { if (env->allow_ptr_leaks) return false; @@ -709,28 +678,19 @@ static bool is_pointer_value(struct verifier_env *env, int regno) } } -static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, - int off, int size) +static int check_ptr_alignment(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int off, int size) { - if (reg->type != PTR_TO_PACKET) { + if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) { if (off % size != 0) { - verbose("misaligned access off %d size %d\n", off, size); + verbose("misaligned access off %d size %d\n", + off, size); return -EACCES; } else { return 0; } } - switch (env->prog->type) { - case BPF_PROG_TYPE_SCHED_CLS: - case BPF_PROG_TYPE_SCHED_ACT: - case BPF_PROG_TYPE_XDP: - break; - default: - verbose("verifier is misconfigured\n"); - return -EACCES; - } - if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) /* misaligned access to packet is ok on x86,arm,arm64 */ return 0; @@ -741,7 +701,8 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, } /* skb->data is NET_IP_ALIGN-ed */ - if ((NET_IP_ALIGN + reg->off + off) % size != 0) { + if (reg->type == PTR_TO_PACKET && + (NET_IP_ALIGN + reg->off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", NET_IP_ALIGN, reg->off, off, size); return -EACCES; @@ -755,12 +716,12 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct verifier_env *env, u32 regno, int off, +static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno) { - struct verifier_state *state = &env->cur_state; - struct reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *reg = &state->regs[regno]; int size, err = 0; if (reg->type == PTR_TO_STACK) @@ -774,12 +735,52 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, if (err) return err; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_VALUE || + reg->type == PTR_TO_MAP_VALUE_ADJ) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into map\n", value_regno); return -EACCES; } + + /* If we adjusted the register to this map value at all then we + * need to change off and size to min_value and max_value + * respectively to make sure our theoretical access will be + * safe. + */ + if (reg->type == PTR_TO_MAP_VALUE_ADJ) { + if (log_level) + print_verifier_state(state); + env->varlen_map_value_access = true; + /* The minimum value is only important with signed + * comparisons where we can't assume the floor of a + * value is 0. 
If we are using signed variables for our + * index'es we need to make sure that whatever we use + * will have a set floor within our range. + */ + if ((s64)reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = check_map_access(env, regno, reg->min_value + off, + size); + if (err) { + verbose("R%d min value is outside of the array range\n", + regno); + return err; + } + + /* If we haven't set a max value then we need to bail + * since we can't be sure we won't do bad things. + */ + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + regno); + return -EACCES; + } + off += reg->max_value; + } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); @@ -795,9 +796,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_ctx_access(env, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { mark_reg_unknown_value(state->regs, value_regno); - if (env->allow_ptr_leaks) - /* note that reg.[id|off|range] == 0 */ - state->regs[value_regno].type = reg_type; + /* note that reg.[id|off|range] == 0 */ + state->regs[value_regno].type = reg_type; } } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { @@ -817,7 +817,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -846,9 +846,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, return err; } -static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; int err; if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || @@ -882,12 +882,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, int regno, +static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *regs = state->regs; int off, i; if (regs[regno].type != PTR_TO_STACK) { @@ -926,18 +926,18 @@ static int check_stack_boundary(struct verifier_env *env, int regno, return 0; } -static int check_func_arg(struct verifier_env *env, u32 regno, +static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct reg_state *reg = env->cur_state.regs + regno; - enum bpf_reg_type expected_type; + struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + enum bpf_reg_type expected_type, type = reg->type; int err = 0; if (arg_type == ARG_DONTCARE) return 0; - if (reg->type == NOT_INIT) { + if 
(type == NOT_INIT) { verbose("R%d !read_ok\n", regno); return -EACCES; } @@ -950,16 +950,29 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } + if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + verbose("helper access to the packet is not allowed\n"); + return -EACCES; + } + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; + if (type != PTR_TO_PACKET && type != expected_type) + goto err_type; } else if (arg_type == ARG_CONST_STACK_SIZE || arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_RAW_STACK) { expected_type = PTR_TO_STACK; @@ -967,20 +980,16 @@ static int check_func_arg(struct verifier_env *env, u32 regno, * passed in as argument, it's a CONST_IMM type. Final test * happens during stack boundary checking. */ - if (reg->type == CONST_IMM && reg->imm == 0) - expected_type = CONST_IMM; + if (type == CONST_IMM && reg->imm == 0) + /* final test in check_stack_boundary() */; + else if (type != PTR_TO_PACKET && type != expected_type) + goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; } - if (reg->type != expected_type) { - verbose("R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], reg_type_str[expected_type]); - return -EACCES; - } - if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ meta->map_ptr = reg->map_ptr; @@ -998,8 +1007,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, meta->map_ptr->key_size, - false, NULL); + if (type == PTR_TO_PACKET) + err = check_packet_access(env, regno, 0, + meta->map_ptr->key_size); + else + err = check_stack_boundary(env, regno, + meta->map_ptr->key_size, + false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -1009,9 +1023,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, - meta->map_ptr->value_size, - false, NULL); + if (type == PTR_TO_PACKET) + err = check_packet_access(env, regno, 0, + meta->map_ptr->value_size); + else + err = check_stack_boundary(env, regno, + meta->map_ptr->value_size, + false, NULL); } else if (arg_type == ARG_CONST_STACK_SIZE || arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); @@ -1025,11 +1043,18 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - err = check_stack_boundary(env, regno - 1, reg->imm, - zero_size_allowed, meta); + if (regs[regno - 1].type == PTR_TO_PACKET) + err = check_packet_access(env, regno - 1, 0, reg->imm); + else + err = check_stack_boundary(env, regno - 1, reg->imm, + zero_size_allowed, meta); } return err; +err_type: + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[type], 
reg_type_str[expected_type]); + return -EACCES; } static int check_map_func_compatibility(struct bpf_map *map, int func_id) @@ -1053,7 +1078,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: - if (func_id != BPF_FUNC_skb_under_cgroup) + if (func_id != BPF_FUNC_skb_under_cgroup && + func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; default: @@ -1075,6 +1101,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; + case BPF_FUNC_current_task_under_cgroup: case BPF_FUNC_skb_under_cgroup: if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; @@ -1108,10 +1135,10 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -static void clear_all_pkt_pointers(struct verifier_env *env) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs, *reg; + struct bpf_verifier_state *state = &env->cur_state; + struct bpf_reg_state *regs = state->regs, *reg; int i; for (i = 0; i < MAX_BPF_REG; i++) @@ -1131,12 +1158,12 @@ static void clear_all_pkt_pointers(struct verifier_env *env) } } -static int check_call(struct verifier_env *env, int func_id) +static int check_call(struct bpf_verifier_env *env, int func_id) { - struct verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct reg_state *regs = state->regs; - struct reg_state *reg; + struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1164,6 +1191,7 @@ static int check_call(struct verifier_env *env, int func_id) changes_data = bpf_helper_changes_skb_data(fn->func); memset(&meta, 0, sizeof(meta)); + meta.pkt_access = fn->pkt_access; /* We only support one arg being in raw mode at the moment, which * is sufficient for the helper functions we have right now. 
@@ -1214,6 +1242,7 @@ static int check_call(struct verifier_env *env, int func_id) regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() @@ -1238,12 +1267,13 @@ static int check_call(struct verifier_env *env, int func_id) return 0; } -static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) +static int check_packet_ptr_add(struct bpf_verifier_env *env, + struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; - struct reg_state *src_reg = ®s[insn->src_reg]; - struct reg_state tmp_reg; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state tmp_reg; s32 imm; if (BPF_SRC(insn->code) == BPF_K) { @@ -1311,10 +1341,10 @@ add_imm: return 0; } -static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) +static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; u8 opcode = BPF_OP(insn->code); s64 imm_log2; @@ -1324,7 +1354,7 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) */ if (BPF_SRC(insn->code) == BPF_X) { - struct reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && dst_reg->imm && opcode == BPF_ADD) { @@ -1413,11 +1443,12 @@ static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) return 0; } -static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) +static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; - struct reg_state *dst_reg = ®s[insn->dst_reg]; - struct reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. 
@@ -1433,10 +1464,110 @@ static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) return 0; } +static void check_reg_overflow(struct bpf_reg_state *reg) +{ + if (reg->max_value > BPF_REGISTER_MAX_RANGE) + reg->max_value = BPF_REGISTER_MAX_RANGE; + if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) + reg->min_value = BPF_REGISTER_MIN_RANGE; +} + +static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; + bool min_set = false, max_set = false; + u8 opcode = BPF_OP(insn->code); + + dst_reg = ®s[insn->dst_reg]; + if (BPF_SRC(insn->code) == BPF_X) { + check_reg_overflow(®s[insn->src_reg]); + min_val = regs[insn->src_reg].min_value; + max_val = regs[insn->src_reg].max_value; + + /* If the source register is a random pointer then the + * min_value/max_value values represent the range of the known + * accesses into that value, not the actual min/max value of the + * register itself. In this case we have to reset the reg range + * values so we know it is not safe to look at. + */ + if (regs[insn->src_reg].type != CONST_IMM && + regs[insn->src_reg].type != UNKNOWN_VALUE) { + min_val = BPF_REGISTER_MIN_RANGE; + max_val = BPF_REGISTER_MAX_RANGE; + } + } else if (insn->imm < BPF_REGISTER_MAX_RANGE && + (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { + min_val = max_val = insn->imm; + min_set = max_set = true; + } + + /* We don't know anything about what was done to this register, mark it + * as unknown. + */ + if (min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) { + reset_reg_range_values(regs, insn->dst_reg); + return; + } + + switch (opcode) { + case BPF_ADD: + dst_reg->min_value += min_val; + dst_reg->max_value += max_val; + break; + case BPF_SUB: + dst_reg->min_value -= min_val; + dst_reg->max_value -= max_val; + break; + case BPF_MUL: + dst_reg->min_value *= min_val; + dst_reg->max_value *= max_val; + break; + case BPF_AND: + /* & is special since it could end up with 0 bits set. */ + dst_reg->min_value &= min_val; + dst_reg->max_value = max_val; + break; + case BPF_LSH: + /* Gotta have special overflow logic here, if we're shifting + * more than MAX_RANGE then just assume we have an invalid + * range. + */ + if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value <<= min_val; + + if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + else + dst_reg->max_value <<= max_val; + break; + case BPF_RSH: + dst_reg->min_value >>= min_val; + dst_reg->max_value >>= max_val; + break; + case BPF_MOD: + /* % is special since it is an unsigned modulus, so the floor + * will always be 0. 
+ */ + dst_reg->min_value = 0; + dst_reg->max_value = max_val - 1; + break; + default: + reset_reg_range_values(regs, insn->dst_reg); + break; + } + + check_reg_overflow(dst_reg); +} + /* check validity of 32-bit and 64-bit arithmetic operations */ -static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) +static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1496,6 +1627,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) if (err) return err; + /* we are setting our register to something new, we need to + * reset its range values. + */ + reset_reg_range_values(regs, insn->dst_reg); + if (BPF_SRC(insn->code) == BPF_X) { if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 @@ -1517,6 +1653,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) */ regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = insn->imm; + regs[insn->dst_reg].max_value = insn->imm; + regs[insn->dst_reg].min_value = insn->imm; } } else if (opcode > BPF_END) { @@ -1569,6 +1707,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) dst_reg = ®s[insn->dst_reg]; + /* first we want to adjust our ranges. */ + adjust_reg_min_max_vals(env, insn); + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { @@ -1603,28 +1744,58 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EACCES; } - /* mark dest operand */ - mark_reg_unknown_value(regs, insn->dst_reg); + /* If we did pointer math on a map value then just set it to our + * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or + * loads to this register appropriately, otherwise just mark the + * register as unknown. + */ + if (env->allow_ptr_leaks && + (dst_reg->type == PTR_TO_MAP_VALUE || + dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) + dst_reg->type = PTR_TO_MAP_VALUE_ADJ; + else + mark_reg_unknown_value(regs, insn->dst_reg); } return 0; } -static void find_good_pkt_pointers(struct verifier_env *env, - struct reg_state *dst_reg) +static void find_good_pkt_pointers(struct bpf_verifier_state *state, + struct bpf_reg_state *dst_reg) { - struct verifier_state *state = &env->cur_state; - struct reg_state *regs = state->regs, *reg; + struct bpf_reg_state *regs = state->regs, *reg; int i; - /* r2 = r3; - * r2 += 8 - * if (r2 > pkt_end) goto somewhere - * r2 == dst_reg, pkt_end == src_reg, - * r2=pkt(id=n,off=8,r=0) - * r3=pkt(id=n,off=0,r=0) - * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) - * so that range of bytes [r3, r3 + 8) is safe to access + + /* LLVM can generate two kind of checks: + * + * Type 1: + * + * r2 = r3; + * r2 += 8; + * if (r2 > pkt_end) goto <handle exception> + * <access okay> + * + * Where: + * r2 == dst_reg, pkt_end == src_reg + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * + * Type 2: + * + * r2 = r3; + * r2 += 8; + * if (pkt_end >= r2) goto <access okay> + * <handle exception> + * + * Where: + * pkt_end == dst_reg, r2 == src_reg + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * + * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) + * so that range of bytes [r3, r3 + 8) is safe to access. 
*/ + for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) regs[i].range = dst_reg->off; @@ -1638,11 +1809,109 @@ static void find_good_pkt_pointers(struct verifier_env *env, } } -static int check_cond_jmp_op(struct verifier_env *env, +/* Adjusts the register min/max values in the case that the dst_reg is the + * variable register that we are working on, and src_reg is a constant or we're + * simply doing a BPF_K check. + */ +static void reg_set_min_max(struct bpf_reg_state *true_reg, + struct bpf_reg_state *false_reg, u64 val, + u8 opcode) +{ + switch (opcode) { + case BPF_JEQ: + /* If this is false then we know nothing Jon Snow, but if it is + * true then we know for sure. + */ + true_reg->max_value = true_reg->min_value = val; + break; + case BPF_JNE: + /* If this is true we know nothing Jon Snow, but if it is false + * we know the value for sure; + */ + false_reg->max_value = false_reg->min_value = val; + break; + case BPF_JGT: + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + case BPF_JSGT: + /* If this is false then we know the maximum val is val, + * otherwise we know the min val is val+1. + */ + false_reg->max_value = val; + true_reg->min_value = val + 1; + break; + case BPF_JGE: + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + case BPF_JSGE: + /* If this is false then we know the maximum value is val - 1, + * otherwise we know the mimimum value is val. + */ + false_reg->max_value = val - 1; + true_reg->min_value = val; + break; + default: + break; + } + + check_reg_overflow(false_reg); + check_reg_overflow(true_reg); +} + +/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg + * is the variable reg. + */ +static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, + struct bpf_reg_state *false_reg, u64 val, + u8 opcode) +{ + switch (opcode) { + case BPF_JEQ: + /* If this is false then we know nothing Jon Snow, but if it is + * true then we know for sure. + */ + true_reg->max_value = true_reg->min_value = val; + break; + case BPF_JNE: + /* If this is true we know nothing Jon Snow, but if it is false + * we know the value for sure; + */ + false_reg->max_value = false_reg->min_value = val; + break; + case BPF_JGT: + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + case BPF_JSGT: + /* + * If this is false, then the val is <= the register, if it is + * true the register <= to the val. + */ + false_reg->min_value = val; + true_reg->max_value = val - 1; + break; + case BPF_JGE: + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + case BPF_JSGE: + /* If this is false then constant < register, if it is true then + * the register < constant. 
+ */ + false_reg->min_value = val + 1; + true_reg->max_value = val; + break; + default: + break; + } + + check_reg_overflow(false_reg); + check_reg_overflow(true_reg); +} + +static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct reg_state *regs = env->cur_state.regs, *dst_reg; - struct verifier_state *other_branch; + struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1704,7 +1973,24 @@ static int check_cond_jmp_op(struct verifier_env *env, if (!other_branch) return -EFAULT; - /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ + /* detect if we are comparing against a constant value so we can adjust + * our min/max values for our dst register. + */ + if (BPF_SRC(insn->code) == BPF_X) { + if (regs[insn->src_reg].type == CONST_IMM) + reg_set_min_max(&other_branch->regs[insn->dst_reg], + dst_reg, regs[insn->src_reg].imm, + opcode); + else if (dst_reg->type == CONST_IMM) + reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + ®s[insn->src_reg], dst_reg->imm, + opcode); + } else { + reg_set_min_max(&other_branch->regs[insn->dst_reg], + dst_reg, insn->imm, opcode); + } + + /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { @@ -1723,13 +2009,17 @@ static int check_cond_jmp_op(struct verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(env, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + dst_reg->type == PTR_TO_PACKET_END && + regs[insn->src_reg].type == PTR_TO_PACKET) { + find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; } if (log_level) - print_verifier_state(&env->cur_state); + print_verifier_state(this_branch); return 0; } @@ -1742,9 +2032,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) } /* verify BPF_LD_IMM64 instruction */ -static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -1760,9 +2050,19 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) if (err) return err; - if (insn->src_reg == 0) - /* generic move 64-bit immediate into a register */ + if (insn->src_reg == 0) { + /* generic move 64-bit immediate into a register, + * only analyzer needs to collect the ld_imm value. 
+ */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + + if (!env->analyzer_ops) + return 0; + + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = imm; return 0; + } /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); @@ -1799,11 +2099,11 @@ static bool may_access_skb(enum bpf_prog_type type) * Output: * R0 - 8/16/32-bit skb data converted to cpu endianness */ -static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct reg_state *reg; + struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -1889,7 +2189,7 @@ enum { BRANCH = 2, }; -#define STATE_LIST_MARK ((struct verifier_state_list *) -1L) +#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) static int *insn_stack; /* stack of insns to process */ static int cur_stack; /* current stack index */ @@ -1900,7 +2200,7 @@ static int *insn_state; * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct verifier_env *env) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return 0; @@ -1941,7 +2241,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env) /* non-recursive depth-first-search to detect loops in BPF program * loop == back-edge in directed graph */ -static int check_cfg(struct verifier_env *env) +static int check_cfg(struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2050,7 +2350,8 @@ err_free: /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) +static bool compare_ptrs_to_packet(struct bpf_reg_state *old, + struct bpf_reg_state *cur) { if (old->id != cur->id) return false; @@ -2125,9 +2426,11 @@ static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct verifier_state *old, struct verifier_state *cur) +static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) { - struct reg_state *rold, *rcur; + struct bpf_reg_state *rold, *rcur; int i; for (i = 0; i < MAX_BPF_REG; i++) { @@ -2137,6 +2440,13 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) if (memcmp(rold, rcur, sizeof(*rold)) == 0) continue; + /* If the ranges were not the same, but everything else was and + * we didn't do a variable access into a map then we are a-ok. + */ + if (!env->varlen_map_value_access && + rold->type == rcur->type && rold->imm == rcur->imm) + continue; + if (rold->type == NOT_INIT || (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) continue; @@ -2167,9 +2477,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) * the same, check that stored pointers types * are the same as well. 
* Ex: explored safe path could have stored - * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} + * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8} * but current path has stored: - * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} + * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16} * such verifier states are not equivalent. * return false to continue verification of this path */ @@ -2180,10 +2490,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) return true; } -static int is_state_visited(struct verifier_env *env, int insn_idx) +static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { - struct verifier_state_list *new_sl; - struct verifier_state_list *sl; + struct bpf_verifier_state_list *new_sl; + struct bpf_verifier_state_list *sl; sl = env->explored_states[insn_idx]; if (!sl) @@ -2193,7 +2503,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(&sl->state, &env->cur_state)) + if (states_equal(env, &sl->state, &env->cur_state)) /* reached equivalent register/stack state, * prune the search */ @@ -2207,7 +2517,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) * it will be rejected. Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); + new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); if (!new_sl) return -ENOMEM; @@ -2218,11 +2528,20 @@ static int is_state_visited(struct verifier_env *env, int insn_idx) return 0; } -static int do_check(struct verifier_env *env) +static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) +{ + if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) + return 0; + + return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); +} + +static int do_check(struct bpf_verifier_env *env) { - struct verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = &env->cur_state; struct bpf_insn *insns = env->prog->insnsi; - struct reg_state *regs = state->regs; + struct bpf_reg_state *regs = state->regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; @@ -2230,6 +2549,7 @@ static int do_check(struct verifier_env *env) init_reg_state(regs); insn_idx = 0; + env->varlen_map_value_access = false; for (;;) { struct bpf_insn *insn; u8 class; @@ -2276,13 +2596,17 @@ static int do_check(struct verifier_env *env) print_bpf_insn(insn); } + err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); + if (err) + return err; + if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type src_reg_type; + enum bpf_reg_type *prev_src_type, src_reg_type; /* check for reserved fields is already done */ @@ -2306,21 +2630,25 @@ static int do_check(struct verifier_env *env) if (err) return err; - if (BPF_SIZE(insn->code) != BPF_W) { + reset_reg_range_values(regs, insn->dst_reg); + if (BPF_SIZE(insn->code) != BPF_W && + BPF_SIZE(insn->code) != BPF_DW) { insn_idx++; continue; } - if (insn->imm == 0) { + prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + + if (*prev_src_type == NOT_INIT) { /* saw a valid insn * dst_reg = *(u32 *)(src_reg + off) - * use reserved 'imm' field to mark this insn + * save type to validate intersecting paths */ - insn->imm = src_reg_type; + *prev_src_type = 
src_reg_type; - } else if (src_reg_type != insn->imm && + } else if (src_reg_type != *prev_src_type && (src_reg_type == PTR_TO_CTX || - insn->imm == PTR_TO_CTX)) { + *prev_src_type == PTR_TO_CTX)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: @@ -2333,7 +2661,7 @@ static int do_check(struct verifier_env *env) } } else if (class == BPF_STX) { - enum bpf_reg_type dst_reg_type; + enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); @@ -2361,11 +2689,13 @@ static int do_check(struct verifier_env *env) if (err) return err; - if (insn->imm == 0) { - insn->imm = dst_reg_type; - } else if (dst_reg_type != insn->imm && + prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + + if (*prev_dst_type == NOT_INIT) { + *prev_dst_type = dst_reg_type; + } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || - insn->imm == PTR_TO_CTX)) { + *prev_dst_type == PTR_TO_CTX)) { verbose("same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -2471,6 +2801,7 @@ process_bpf_exit: verbose("invalid BPF_LD mode\n"); return -EINVAL; } + reset_reg_range_values(regs, insn->dst_reg); } else { verbose("unknown insn class %d\n", class); return -EINVAL; @@ -2483,14 +2814,28 @@ process_bpf_exit: return 0; } +static int check_map_prog_compatibility(struct bpf_map *map, + struct bpf_prog *prog) + +{ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT && + (map->map_type == BPF_MAP_TYPE_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && + (map->map_flags & BPF_F_NO_PREALLOC)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + return 0; +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ -static int replace_map_fd_with_map_ptr(struct verifier_env *env) +static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; - int i, j; + int i, j, err; for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && @@ -2534,6 +2879,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) return PTR_ERR(map); } + err = check_map_prog_compatibility(map, env->prog); + if (err) { + fdput(f); + return err; + } + /* store map pointer inside BPF_LD_IMM64 instruction */ insn[0].imm = (u32) (unsigned long) map; insn[1].imm = ((u64) (unsigned long) map) >> 32; @@ -2577,7 +2928,7 @@ next_insn: } /* drop refcnt of maps used by the rejected program */ -static void release_maps(struct verifier_env *env) +static void release_maps(struct bpf_verifier_env *env) { int i; @@ -2586,7 +2937,7 @@ static void release_maps(struct verifier_env *env) } /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ -static void convert_pseudo_ld_imm64(struct verifier_env *env) +static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -2600,62 +2951,74 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ -static int convert_ctx_accesses(struct verifier_env *env) +static int convert_ctx_accesses(struct bpf_verifier_env *env) { - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16]; + const 
struct bpf_verifier_ops *ops = env->prog->aux->ops; + const int insn_cnt = env->prog->len; + struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i; + int i, cnt, delta = 0; - if (!env->prog->aux->ops->convert_ctx_access) + if (ops->gen_prologue) { + cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, + env->prog); + if (cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (cnt) { + new_prog = bpf_patch_insn_single(env->prog, 0, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + delta += cnt - 1; + } + } + + if (!ops->convert_ctx_access) return 0; - for (i = 0; i < insn_cnt; i++, insn++) { - u32 insn_delta, cnt; + insn = env->prog->insnsi + delta; - if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) + else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || + insn->code == (BPF_STX | BPF_MEM | BPF_DW)) type = BPF_WRITE; else continue; - if (insn->imm != PTR_TO_CTX) { - /* clear internal mark */ - insn->imm = 0; + if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) continue; - } - cnt = env->prog->aux->ops-> - convert_ctx_access(type, insn->dst_reg, insn->src_reg, - insn->off, insn_buf, env->prog); + cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, + insn->off, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; } - new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); + new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, + cnt); if (!new_prog) return -ENOMEM; - insn_delta = cnt - 1; + delta += cnt - 1; /* keep walking new program and skip insns we just inserted */ env->prog = new_prog; - insn = new_prog->insnsi + i + insn_delta; - - insn_cnt += insn_delta; - i += insn_delta; + insn = new_prog->insnsi + i + delta; } return 0; } -static void free_states(struct verifier_env *env) +static void free_states(struct bpf_verifier_env *env) { - struct verifier_state_list *sl, *sln; + struct bpf_verifier_state_list *sl, *sln; int i; if (!env->explored_states) @@ -2678,19 +3041,24 @@ static void free_states(struct verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; - struct verifier_env *env; + struct bpf_verifier_env *env; int ret = -EINVAL; if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; - /* 'struct verifier_env' can be global, but since it's not small, + /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; + env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * + (*prog)->len); + ret = -ENOMEM; + if (!env->insn_aux_data) + goto err_free_env; env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ @@ -2709,12 +3077,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* log_* values have to be sane */ if (log_size < 128 || log_size > UINT_MAX >> 8 || log_level == 0 || log_ubuf == NULL) - goto free_env; + goto err_unlock; ret = -ENOMEM; log_buf = vmalloc(log_size); if (!log_buf) - goto free_env; + goto err_unlock; } 
else { log_level = 0; } @@ -2724,7 +3092,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) goto skip_full_check; env->explored_states = kcalloc(env->prog->len, - sizeof(struct verifier_state_list *), + sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; if (!env->explored_states) @@ -2783,14 +3151,67 @@ skip_full_check: free_log_buf: if (log_level) vfree(log_buf); -free_env: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ release_maps(env); *prog = env->prog; +err_unlock: + mutex_unlock(&bpf_verifier_lock); + vfree(env->insn_aux_data); +err_free_env: kfree(env); + return ret; +} + +int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, + void *priv) +{ + struct bpf_verifier_env *env; + int ret; + + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * + prog->len); + ret = -ENOMEM; + if (!env->insn_aux_data) + goto err_free_env; + env->prog = prog; + env->analyzer_ops = ops; + env->analyzer_priv = priv; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + log_level = 0; + + env->explored_states = kcalloc(env->prog->len, + sizeof(struct bpf_verifier_state_list *), + GFP_KERNEL); + ret = -ENOMEM; + if (!env->explored_states) + goto skip_full_check; + + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + + ret = do_check(env); + +skip_full_check: + while (pop_stack(env, NULL) >= 0); + free_states(env); + mutex_unlock(&bpf_verifier_lock); + vfree(env->insn_aux_data); +err_free_env: + kfree(env); return ret; } +EXPORT_SYMBOL_GPL(bpf_analyzer); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..44066158f0d1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3446,9 +3446,28 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ - if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { - ret = -EBUSY; - goto out_unlock; + if (enable && cgroup_parent(cgrp)) { + struct cgrp_cset_link *link; + + /* + * Because namespaces pin csets too, @cgrp->cset_links + * might not be empty even when @cgrp is empty. Walk and + * verify each cset. + */ + spin_lock_irq(&css_set_lock); + + ret = 0; + list_for_each_entry(link, &cgrp->cset_links, cset_link) { + if (css_set_populated(link->cset)) { + ret = -EBUSY; + break; + } + } + + spin_unlock_irq(&css_set_lock); + + if (ret) + goto out_unlock; } /* save and update control masks and prepare csses */ @@ -3899,7 +3918,9 @@ void cgroup_file_notify(struct cgroup_file *cfile) * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * - * Return the number of tasks in the cgroup. + * Return the number of tasks in the cgroup. The returned number can be + * higher than the actual number of tasks due to css_set references from + * namespace roots and temporary usages. 
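A note on the two cgroup changes above: because cgroup namespaces pin css_sets, a cgroup's cset_links list can be non-empty even when the cgroup itself has no tasks, so the subtree_control write path now walks each linked css_set and only refuses with -EBUSY when one of them is actually populated. The userspace sketch below models just that walk; the fake_cset type and its populated flag are stand-ins for illustration, not the kernel's css_set API.

#include <stdio.h>
#include <errno.h>

/* Stand-in for a css_set linked into a cgroup; only the population
 * state matters for this check. */
struct fake_cset {
	int populated;		/* 1 if tasks are charged to this cset */
};

/* Model of the new check: refuse only if some linked cset actually
 * holds tasks, instead of treating "has links" as "has tasks". */
static int may_enable_subtree_control(const struct fake_cset *links, int nr_links)
{
	for (int i = 0; i < nr_links; i++) {
		if (links[i].populated)
			return -EBUSY;	/* child controllers would compete with tasks */
	}
	return 0;
}

int main(void)
{
	/* Two csets pinned only by a cgroup namespace, none populated. */
	struct fake_cset pinned_only[] = { { 0 }, { 0 } };
	/* One cset that really has tasks. */
	struct fake_cset with_tasks[] = { { 0 }, { 1 } };

	printf("namespace-pinned only: %d\n",
	       may_enable_subtree_control(pinned_only, 2));	/* 0 */
	printf("cset with tasks:       %d\n",
	       may_enable_subtree_control(with_tasks, 2));	/* -EBUSY */
	return 0;
}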
*/ static int cgroup_task_count(const struct cgroup *cgrp) { @@ -5606,6 +5627,12 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + /* + * The latency of the synchronize_sched() is too high for cgroups, + * avoid it at the cost of forcing all readers into the slow path. + */ + rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); + get_user_ns(init_cgroup_ns.user_ns); mutex_lock(&cgroup_mutex); @@ -6270,6 +6297,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) if (cgroup_sk_alloc_disabled) return; + /* Socket clone path */ + if (skcd->val) { + cgroup_get(sock_cgroup_ptr(skcd)); + return; + } + rcu_read_lock(); while (true) { @@ -6295,6 +6328,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) /* cgroup namespaces */ +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); +} + +static void dec_cgroup_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); +} + static struct cgroup_namespace *alloc_cgroup_ns(void) { struct cgroup_namespace *new_ns; @@ -6316,6 +6359,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) void free_cgroup_ns(struct cgroup_namespace *ns) { put_css_set(ns->root_cset); + dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); @@ -6327,6 +6371,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; + struct ucounts *ucounts; struct css_set *cset; BUG_ON(!old_ns); @@ -6340,6 +6385,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); + ucounts = inc_cgroup_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); cset = task_css_set(current); @@ -6349,10 +6398,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { put_css_set(cset); + dec_cgroup_namespaces(ucounts); return new_ns; } new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; new_ns->root_cset = cset; return new_ns; @@ -6403,12 +6454,18 @@ static void cgroupns_put(struct ns_common *ns) put_cgroup_ns(to_cg_ns(ns)); } +static struct user_namespace *cgroupns_owner(struct ns_common *ns) +{ + return to_cg_ns(ns)->user_ns; +} + const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, + .owner = cgroupns_owner, }; static __init int cgroup_namespaces_init(void) diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config new file mode 100644 index 000000000000..8d9643767142 --- /dev/null +++ b/kernel/configs/kvm_guest.config @@ -0,0 +1,32 @@ +CONFIG_NET=y +CONFIG_NET_CORE=y +CONFIG_NETDEVICES=y +CONFIG_BLOCK=y +CONFIG_BLK_DEV=y +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_INET=y +CONFIG_TTY=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_BINFMT_ELF=y +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_DEBUG_KERNEL=y +CONFIG_VIRTUALIZATION=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_KVM_GUEST=y +CONFIG_VIRTIO=y +CONFIG_VIRTIO_PCI=y +CONFIG_VIRTIO_BLK=y +CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_NET=y +CONFIG_9P_FS=y +CONFIG_NET_9P=y +CONFIG_NET_9P_VIRTIO=y 
+CONFIG_SCSI_LOWLEVEL=y +CONFIG_SCSI_VIRTIO=y +CONFIG_VIRTIO_INPUT=y +CONFIG_DRM_VIRTIO_GPU=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 341bf80f80bd..5df20d6d1520 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -23,6 +23,8 @@ #include <linux/tick.h> #include <linux/irq.h> #include <linux/smpboot.h> +#include <linux/relay.h> +#include <linux/slab.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS @@ -37,8 +39,9 @@ * @thread: Pointer to the hotplug thread * @should_run: Thread should execute * @rollback: Perform a rollback - * @cb_stat: The state for a single callback (install/uninstall) - * @cb: Single callback function (install/uninstall) + * @single: Single callback invocation + * @bringup: Single callback bringup or teardown selector + * @cb_state: The state for a single callback (install/uninstall) * @result: Result of the operation * @done: Signal completion to the issuer of the task */ @@ -49,8 +52,10 @@ struct cpuhp_cpu_state { struct task_struct *thread; bool should_run; bool rollback; + bool single; + bool bringup; + struct hlist_node *node; enum cpuhp_state cb_state; - int (*cb)(unsigned int cpu); int result; struct completion done; #endif @@ -68,35 +73,103 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); * @cant_stop: Bringup/teardown can't be stopped at this step */ struct cpuhp_step { - const char *name; - int (*startup)(unsigned int cpu); - int (*teardown)(unsigned int cpu); - bool skip_onerr; - bool cant_stop; + const char *name; + union { + int (*single)(unsigned int cpu); + int (*multi)(unsigned int cpu, + struct hlist_node *node); + } startup; + union { + int (*single)(unsigned int cpu); + int (*multi)(unsigned int cpu, + struct hlist_node *node); + } teardown; + struct hlist_head list; + bool skip_onerr; + bool cant_stop; + bool multi_instance; }; static DEFINE_MUTEX(cpuhp_state_mutex); static struct cpuhp_step cpuhp_bp_states[]; static struct cpuhp_step cpuhp_ap_states[]; +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitly in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; +} + +static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) +{ + struct cpuhp_step *sp; + + sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; + return sp + state; +} + /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state * @cpu: The cpu for which the callback should be invoked * @step: The step in the state machine - * @cb: The callback function to invoke + * @bringup: True if the bringup callback should be invoked * - * Called from cpu hotplug and from the state register machinery + * Called from cpu hotplug and from the state register machinery. */ -static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, - int (*cb)(unsigned int)) +static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, + bool bringup, struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - int ret = 0; - - if (cb) { - trace_cpuhp_enter(cpu, st->target, step, cb); + struct cpuhp_step *step = cpuhp_get_step(state); + int (*cbm)(unsigned int cpu, struct hlist_node *node); + int (*cb)(unsigned int cpu); + int ret, cnt; + + if (!step->multi_instance) { + cb = bringup ? 
step->startup.single : step->teardown.single; + if (!cb) + return 0; + trace_cpuhp_enter(cpu, st->target, state, cb); ret = cb(cpu); - trace_cpuhp_exit(cpu, st->state, step, ret); + trace_cpuhp_exit(cpu, st->state, state, ret); + return ret; + } + cbm = bringup ? step->startup.multi : step->teardown.multi; + if (!cbm) + return 0; + + /* Single invocation for instance add/remove */ + if (node) { + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + return ret; + } + + /* State transition. Invoke on all instances */ + cnt = 0; + hlist_for_each(node, &step->list) { + trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node); + ret = cbm(cpu, node); + trace_cpuhp_exit(cpu, st->state, state, ret); + if (ret) + goto err; + cnt++; + } + return 0; +err: + /* Rollback the instances if one failed */ + cbm = !bringup ? step->startup.multi : step->teardown.multi; + if (!cbm) + return ret; + + hlist_for_each(node, &step->list) { + if (!cnt--) + break; + cbm(cpu, node); } return ret; } @@ -260,10 +333,17 @@ void cpu_hotplug_disable(void) } EXPORT_SYMBOL_GPL(cpu_hotplug_disable); +static void __cpu_hotplug_enable(void) +{ + if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n")) + return; + cpu_hotplug_disabled--; +} + void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); @@ -330,12 +410,6 @@ static int notify_online(unsigned int cpu) return 0; } -static int notify_starting(unsigned int cpu) -{ - cpu_notify(CPU_STARTING, cpu); - return 0; -} - static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -349,8 +423,16 @@ static int bringup_cpu(unsigned int cpu) struct task_struct *idle = idle_thread_get(cpu); int ret; + /* + * Some architectures have to walk the irq descriptors to + * setup the vector space for the cpu which comes online. + * Prevent irq alloc/free across the bringup. + */ + irq_lock_sparse(); + /* Arch-specific enabling code. 
*/ ret = __cpu_up(cpu, idle); + irq_unlock_sparse(); if (ret) { cpu_notify(CPU_UP_CANCELED, cpu); return ret; @@ -363,62 +445,55 @@ static int bringup_cpu(unsigned int cpu) /* * Hotplug state machine related functions */ -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps) +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = steps + st->state; + struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->startup); + cpuhp_invoke_callback(cpu, st->state, true, NULL); } } static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps, enum cpuhp_state target) + enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; for (; st->state > target; st->state--) { - struct cpuhp_step *step = steps + st->state; - - ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + ret = cpuhp_invoke_callback(cpu, st->state, false, NULL); if (ret) { st->target = prev_state; - undo_cpu_down(cpu, st, steps); + undo_cpu_down(cpu, st); break; } } return ret; } -static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps) +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state--; st->state > st->target; st->state--) { - struct cpuhp_step *step = steps + st->state; + struct cpuhp_step *step = cpuhp_get_step(st->state); if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->teardown); + cpuhp_invoke_callback(cpu, st->state, false, NULL); } } static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, - struct cpuhp_step *steps, enum cpuhp_state target) + enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; int ret = 0; while (st->state < target) { - struct cpuhp_step *step; - st->state++; - step = steps + st->state; - ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + ret = cpuhp_invoke_callback(cpu, st->state, true, NULL); if (ret) { st->target = prev_state; - undo_cpu_up(cpu, st, steps); + undo_cpu_up(cpu, st); break; } } @@ -447,13 +522,13 @@ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) { enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); - return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); + return cpuhp_down_callbacks(cpu, st, target); } /* Execute the online startup callbacks. Used to be CPU_ONLINE */ static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) { - return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); + return cpuhp_up_callbacks(cpu, st, st->target); } /* @@ -476,18 +551,20 @@ static void cpuhp_thread_fun(unsigned int cpu) st->should_run = false; /* Single callback invocation for [un]install ? */ - if (st->cb) { + if (st->single) { if (st->cb_state < CPUHP_AP_ONLINE) { local_irq_disable(); - ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + ret = cpuhp_invoke_callback(cpu, st->cb_state, + st->bringup, st->node); local_irq_enable(); } else { - ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + ret = cpuhp_invoke_callback(cpu, st->cb_state, + st->bringup, st->node); } } else if (st->rollback) { BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); - undo_cpu_down(cpu, st, cpuhp_ap_states); + undo_cpu_down(cpu, st); /* * This is a momentary workaround to keep the notifier users * happy. 
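The reworked cpuhp_invoke_callback() above dispatches either a single per-CPU callback or, for multi_instance states, the per-instance callback for every node on the step's hlist, rolling the already-invoked instances back if one of them fails. The sketch below models only that loop-plus-rollback shape in userspace; the instance array and the inst_bringup/inst_teardown callbacks are hypothetical, not the kernel's hotplug API.

#include <stdio.h>

#define NR_INSTANCES 3

/* Hypothetical per-instance bringup/teardown callbacks. */
static int inst_bringup(int cpu, int inst)
{
	printf("bringup  cpu%d inst%d\n", cpu, inst);
	return inst == 2 ? -1 : 0;	/* pretend the third instance fails */
}

static void inst_teardown(int cpu, int inst)
{
	printf("teardown cpu%d inst%d\n", cpu, inst);
}

/* Model of the multi-instance invocation: walk all instances, count the
 * successful ones, and on failure undo exactly those again. */
static int invoke_multi(int cpu)
{
	int cnt = 0;

	for (int inst = 0; inst < NR_INSTANCES; inst++) {
		int ret = inst_bringup(cpu, inst);

		if (ret) {
			/* Roll back the instances that already succeeded. */
			while (cnt--)
				inst_teardown(cpu, cnt);
			return ret;
		}
		cnt++;
	}
	return 0;
}

int main(void)
{
	return invoke_multi(0) ? 1 : 0;
}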
Will go away once we got rid of the notifiers. @@ -509,8 +586,9 @@ static void cpuhp_thread_fun(unsigned int cpu) } /* Invoke a single callback on a remote cpu */ -static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, - int (*cb)(unsigned int)) +static int +cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, + struct hlist_node *node) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -522,10 +600,13 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, * we invoke the thread function directly. */ if (!st->thread) - return cpuhp_invoke_callback(cpu, state, cb); + return cpuhp_invoke_callback(cpu, state, bringup, node); st->cb_state = state; - st->cb = cb; + st->single = true; + st->bringup = bringup; + st->node = node; + /* * Make sure the above stores are visible before should_run becomes * true. Paired with the mb() above in cpuhp_thread_fun() @@ -541,7 +622,7 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) { st->result = 0; - st->cb = NULL; + st->single = false; /* * Make sure the above stores are visible before should_run becomes * true. Paired with the mb() above in cpuhp_thread_fun() @@ -674,12 +755,6 @@ static int notify_down_prepare(unsigned int cpu) return err; } -static int notify_dying(unsigned int cpu) -{ - cpu_notify(CPU_DYING, cpu); - return 0; -} - /* Take this CPU down. */ static int take_cpu_down(void *_param) { @@ -692,12 +767,16 @@ static int take_cpu_down(void *_param) if (err < 0) return err; + /* + * We get here while we are in CPUHP_TEARDOWN_CPU state and we must not + * do this step again. + */ + WARN_ON(st->state != CPUHP_TEARDOWN_CPU); + st->state--; /* Invoke the former CPU_DYING callbacks */ - for (; st->state > target; st->state--) { - struct cpuhp_step *step = cpuhp_ap_states + st->state; + for (; st->state > target; st->state--) + cpuhp_invoke_callback(cpu, st->state, false, NULL); - cpuhp_invoke_callback(cpu, st->state, step->teardown); - } /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ @@ -734,7 +813,7 @@ static int takedown_cpu(unsigned int cpu) BUG_ON(cpu_online(cpu)); /* - * The migration_call() CPU_DYING callback will have removed all + * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all * runnable tasks from the cpu, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. * @@ -787,7 +866,6 @@ void cpuhp_report_idle_dead(void) #define notify_down_prepare NULL #define takedown_cpu NULL #define notify_dead NULL -#define notify_dying NULL #endif #ifdef CONFIG_HOTPLUG_CPU @@ -836,7 +914,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need * to do the further cleanups. */ - ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); + ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { st->target = prev_state; st->rollback = true; @@ -877,10 +955,9 @@ EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ /** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU * @cpu: cpu that just started * - * This function calls the cpu_chain notifiers with CPU_STARTING. 
* It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ @@ -889,12 +966,10 @@ void notify_cpu_starting(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ while (st->state < target) { - struct cpuhp_step *step; - st->state++; - step = cpuhp_ap_states + st->state; - cpuhp_invoke_callback(cpu, st->state, step->startup); + cpuhp_invoke_callback(cpu, st->state, true, NULL); } } @@ -979,7 +1054,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) * responsible for bringing it up to the target state. */ target = min((int)target, CPUHP_BRINGUP_CPU); - ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); + ret = cpuhp_up_callbacks(cpu, st, target); out: cpu_hotplug_done(); return ret; @@ -1024,12 +1099,13 @@ EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; -int disable_nonboot_cpus(void) +int freeze_secondary_cpus(int primary) { - int cpu, first_cpu, error = 0; + int cpu, error = 0; cpu_maps_update_begin(); - first_cpu = cpumask_first(cpu_online_mask); + if (!cpu_online(primary)) + primary = cpumask_first(cpu_online_mask); /* * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time @@ -1038,7 +1114,7 @@ int disable_nonboot_cpus(void) pr_info("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { - if (cpu == first_cpu) + if (cpu == primary) continue; trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1, CPUHP_OFFLINE); @@ -1081,7 +1157,7 @@ void enable_nonboot_cpus(void) /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - WARN_ON(--cpu_hotplug_disabled < 0); + __cpu_hotplug_enable(); if (cpumask_empty(frozen_cpus)) goto out; @@ -1170,40 +1246,50 @@ core_initcall(cpu_hotplug_pm_sync_init); static struct cpuhp_step cpuhp_bp_states[] = { [CPUHP_OFFLINE] = { .name = "offline", - .startup = NULL, - .teardown = NULL, + .startup.single = NULL, + .teardown.single = NULL, }, #ifdef CONFIG_SMP [CPUHP_CREATE_THREADS]= { - .name = "threads:create", - .startup = smpboot_create_threads, - .teardown = NULL, + .name = "threads:prepare", + .startup.single = smpboot_create_threads, + .teardown.single = NULL, .cant_stop = true, }, [CPUHP_PERF_PREPARE] = { - .name = "perf prepare", - .startup = perf_event_init_cpu, - .teardown = perf_event_exit_cpu, + .name = "perf:prepare", + .startup.single = perf_event_init_cpu, + .teardown.single = perf_event_exit_cpu, }, [CPUHP_WORKQUEUE_PREP] = { - .name = "workqueue prepare", - .startup = workqueue_prepare_cpu, - .teardown = NULL, + .name = "workqueue:prepare", + .startup.single = workqueue_prepare_cpu, + .teardown.single = NULL, }, [CPUHP_HRTIMERS_PREPARE] = { - .name = "hrtimers prepare", - .startup = hrtimers_prepare_cpu, - .teardown = hrtimers_dead_cpu, + .name = "hrtimers:prepare", + .startup.single = hrtimers_prepare_cpu, + .teardown.single = hrtimers_dead_cpu, }, [CPUHP_SMPCFD_PREPARE] = { - .name = "SMPCFD prepare", - .startup = smpcfd_prepare_cpu, - .teardown = smpcfd_dead_cpu, + .name = "smpcfd:prepare", + .startup.single = smpcfd_prepare_cpu, + .teardown.single = smpcfd_dead_cpu, + }, + [CPUHP_RELAY_PREPARE] = { + .name = "relay:prepare", + .startup.single = relay_prepare_cpu, + .teardown.single = NULL, + }, + 
[CPUHP_SLAB_PREPARE] = { + .name = "slab:prepare", + .startup.single = slab_prepare_cpu, + .teardown.single = slab_dead_cpu, }, [CPUHP_RCUTREE_PREP] = { - .name = "RCU-tree prepare", - .startup = rcutree_prepare_cpu, - .teardown = rcutree_dead_cpu, + .name = "RCU/tree:prepare", + .startup.single = rcutree_prepare_cpu, + .teardown.single = rcutree_dead_cpu, }, /* * Preparatory and dead notifiers. Will be replaced once the notifiers @@ -1211,8 +1297,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { */ [CPUHP_NOTIFY_PREPARE] = { .name = "notify:prepare", - .startup = notify_prepare, - .teardown = notify_dead, + .startup.single = notify_prepare, + .teardown.single = notify_dead, .skip_onerr = true, .cant_stop = true, }, @@ -1222,20 +1308,21 @@ static struct cpuhp_step cpuhp_bp_states[] = { * otherwise a RCU stall occurs. */ [CPUHP_TIMERS_DEAD] = { - .name = "timers dead", - .startup = NULL, - .teardown = timers_dead_cpu, + .name = "timers:dead", + .startup.single = NULL, + .teardown.single = timers_dead_cpu, }, /* Kicks the plugged cpu into life */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", - .startup = bringup_cpu, - .teardown = NULL, + .startup.single = bringup_cpu, + .teardown.single = NULL, .cant_stop = true, }, [CPUHP_AP_SMPCFD_DYING] = { - .startup = NULL, - .teardown = smpcfd_dying_cpu, + .name = "smpcfd:dying", + .startup.single = NULL, + .teardown.single = smpcfd_dying_cpu, }, /* * Handled on controll processor until the plugged processor manages @@ -1243,8 +1330,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { */ [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", - .startup = NULL, - .teardown = takedown_cpu, + .startup.single = NULL, + .teardown.single = takedown_cpu, .cant_stop = true, }, #else @@ -1270,24 +1357,13 @@ static struct cpuhp_step cpuhp_ap_states[] = { /* First state is scheduler control. Interrupts are disabled */ [CPUHP_AP_SCHED_STARTING] = { .name = "sched:starting", - .startup = sched_cpu_starting, - .teardown = sched_cpu_dying, + .startup.single = sched_cpu_starting, + .teardown.single = sched_cpu_dying, }, [CPUHP_AP_RCUTREE_DYING] = { - .startup = NULL, - .teardown = rcutree_dying_cpu, - }, - /* - * Low level startup/teardown notifiers. Run with interrupts - * disabled. Will be removed once the notifiers are converted to - * states. - */ - [CPUHP_AP_NOTIFY_STARTING] = { - .name = "notify:starting", - .startup = notify_starting, - .teardown = notify_dying, - .skip_onerr = true, - .cant_stop = true, + .name = "RCU/tree:dying", + .startup.single = NULL, + .teardown.single = rcutree_dying_cpu, }, /* Entry state on starting. Interrupts enabled from here on. 
Transient * state for synchronsization */ @@ -1296,24 +1372,24 @@ static struct cpuhp_step cpuhp_ap_states[] = { }, /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { - .name = "smpboot:threads", - .startup = smpboot_unpark_threads, - .teardown = NULL, + .name = "smpboot/threads:online", + .startup.single = smpboot_unpark_threads, + .teardown.single = NULL, }, [CPUHP_AP_PERF_ONLINE] = { - .name = "perf online", - .startup = perf_event_init_cpu, - .teardown = perf_event_exit_cpu, + .name = "perf:online", + .startup.single = perf_event_init_cpu, + .teardown.single = perf_event_exit_cpu, }, [CPUHP_AP_WORKQUEUE_ONLINE] = { - .name = "workqueue online", - .startup = workqueue_online_cpu, - .teardown = workqueue_offline_cpu, + .name = "workqueue:online", + .startup.single = workqueue_online_cpu, + .teardown.single = workqueue_offline_cpu, }, [CPUHP_AP_RCUTREE_ONLINE] = { - .name = "RCU-tree online", - .startup = rcutree_online_cpu, - .teardown = rcutree_offline_cpu, + .name = "RCU/tree:online", + .startup.single = rcutree_online_cpu, + .teardown.single = rcutree_offline_cpu, }, /* @@ -1322,8 +1398,8 @@ static struct cpuhp_step cpuhp_ap_states[] = { */ [CPUHP_AP_NOTIFY_ONLINE] = { .name = "notify:online", - .startup = notify_online, - .teardown = notify_down_prepare, + .startup.single = notify_online, + .teardown.single = notify_down_prepare, .skip_onerr = true, }, #endif @@ -1335,16 +1411,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { /* Last state is scheduler control setting the cpu active */ [CPUHP_AP_ACTIVE] = { .name = "sched:active", - .startup = sched_cpu_activate, - .teardown = sched_cpu_deactivate, + .startup.single = sched_cpu_activate, + .teardown.single = sched_cpu_deactivate, }, #endif /* CPU is fully up and running. */ [CPUHP_ONLINE] = { .name = "online", - .startup = NULL, - .teardown = NULL, + .startup.single = NULL, + .teardown.single = NULL, }, }; @@ -1356,54 +1432,42 @@ static int cpuhp_cb_check(enum cpuhp_state state) return 0; } -static bool cpuhp_is_ap_state(enum cpuhp_state state) -{ - /* - * The extra check for CPUHP_TEARDOWN_CPU is only for documentation - * purposes as that state is handled explicitely in cpu_down. - */ - return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; -} - -static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) -{ - struct cpuhp_step *sp; - - sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; - return sp + state; -} - static void cpuhp_store_callbacks(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu)) + int (*teardown)(unsigned int cpu), + bool multi_instance) { /* (Un)Install the callbacks for further cpu hotplug operations */ struct cpuhp_step *sp; mutex_lock(&cpuhp_state_mutex); sp = cpuhp_get_step(state); - sp->startup = startup; - sp->teardown = teardown; + sp->startup.single = startup; + sp->teardown.single = teardown; sp->name = name; + sp->multi_instance = multi_instance; + INIT_HLIST_HEAD(&sp->list); mutex_unlock(&cpuhp_state_mutex); } static void *cpuhp_get_teardown_cb(enum cpuhp_state state) { - return cpuhp_get_step(state)->teardown; + return cpuhp_get_step(state)->teardown.single; } /* * Call the startup/teardown function for a step either on the AP or * on the current CPU. 
*/ -static int cpuhp_issue_call(int cpu, enum cpuhp_state state, - int (*cb)(unsigned int), bool bringup) +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup, + struct hlist_node *node) { + struct cpuhp_step *sp = cpuhp_get_step(state); int ret; - if (!cb) + if ((bringup && !sp->startup.single) || + (!bringup && !sp->teardown.single)) return 0; /* * The non AP bound callbacks can fail on bringup. On teardown @@ -1411,11 +1475,11 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, */ #ifdef CONFIG_SMP if (cpuhp_is_ap_state(state)) - ret = cpuhp_invoke_ap_callback(cpu, state, cb); + ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node); else - ret = cpuhp_invoke_callback(cpu, state, cb); + ret = cpuhp_invoke_callback(cpu, state, bringup, node); #else - ret = cpuhp_invoke_callback(cpu, state, cb); + ret = cpuhp_invoke_callback(cpu, state, bringup, node); #endif BUG_ON(ret && !bringup); return ret; @@ -1427,13 +1491,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, * Note: The teardown callbacks for rollback are not allowed to fail! */ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, - int (*teardown)(unsigned int cpu)) + struct hlist_node *node) { int cpu; - if (!teardown) - return; - /* Roll back the already executed steps on the other cpus */ for_each_present_cpu(cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); @@ -1444,7 +1505,7 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, /* Did we invoke the startup call on that cpu ? */ if (cpustate >= state) - cpuhp_issue_call(cpu, state, teardown, false); + cpuhp_issue_call(cpu, state, false, node); } } @@ -1471,6 +1532,52 @@ static int cpuhp_reserve_state(enum cpuhp_state state) return -ENOSPC; } +int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, + bool invoke) +{ + struct cpuhp_step *sp; + int cpu; + int ret; + + sp = cpuhp_get_step(state); + if (sp->multi_instance == false) + return -EINVAL; + + get_online_cpus(); + + if (!invoke || !sp->startup.multi) + goto add_node; + + /* + * Try to call the startup callback for each present cpu + * depending on the hotplug state of the cpu. 
+ */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate < state) + continue; + + ret = cpuhp_issue_call(cpu, state, true, node); + if (ret) { + if (sp->teardown.multi) + cpuhp_rollback_install(cpu, state, node); + goto err; + } + } +add_node: + ret = 0; + mutex_lock(&cpuhp_state_mutex); + hlist_add_head(node, &sp->list); + mutex_unlock(&cpuhp_state_mutex); + +err: + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); + /** * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state * @state: The state to setup @@ -1484,7 +1591,8 @@ static int cpuhp_reserve_state(enum cpuhp_state state) int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu)) + int (*teardown)(unsigned int cpu), + bool multi_instance) { int cpu, ret = 0; int dyn_state = 0; @@ -1503,7 +1611,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, state = ret; } - cpuhp_store_callbacks(state, name, startup, teardown); + cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); if (!invoke || !startup) goto out; @@ -1519,10 +1627,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, if (cpustate < state) continue; - ret = cpuhp_issue_call(cpu, state, startup, true); + ret = cpuhp_issue_call(cpu, state, true, NULL); if (ret) { - cpuhp_rollback_install(cpu, state, teardown); - cpuhp_store_callbacks(state, NULL, NULL, NULL); + if (teardown) + cpuhp_rollback_install(cpu, state, NULL); + cpuhp_store_callbacks(state, NULL, NULL, NULL, false); goto out; } } @@ -1534,6 +1643,42 @@ out: } EXPORT_SYMBOL(__cpuhp_setup_state); +int __cpuhp_state_remove_instance(enum cpuhp_state state, + struct hlist_node *node, bool invoke) +{ + struct cpuhp_step *sp = cpuhp_get_step(state); + int cpu; + + BUG_ON(cpuhp_cb_check(state)); + + if (!sp->multi_instance) + return -EINVAL; + + get_online_cpus(); + if (!invoke || !cpuhp_get_teardown_cb(state)) + goto remove; + /* + * Call the teardown callback for each present cpu depending + * on the hotplug state of the cpu. This function is not + * allowed to fail currently! 
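Both __cpuhp_state_add_instance() and __cpuhp_state_remove_instance() above share one pattern: walk every present CPU and invoke the callback only on CPUs whose current hotplug state has already reached the state being modified. A compact userspace model of that filter, with made-up per-CPU state numbers rather than real CPUHP_* constants, is given below.

#include <stdio.h>

#define NR_CPUS 4

/* Pretend hotplug progress per CPU; higher means "further online". */
static int cpu_state[NR_CPUS] = { 60, 60, 30, 0 };

/* Call cb() on every present CPU that has already passed the given
 * state - the same filter the instance add/remove paths use. */
static void for_each_cpu_at_or_above(int state, void (*cb)(int cpu))
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu_state[cpu] < state)
			continue;	/* this CPU never reached the state */
		cb(cpu);
	}
}

static void teardown_instance(int cpu)
{
	printf("teardown instance on cpu%d\n", cpu);
}

int main(void)
{
	/* Only cpu0 and cpu1 (state 60 >= 50) see the callback. */
	for_each_cpu_at_or_above(50, teardown_instance);
	return 0;
}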
+ */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate >= state) + cpuhp_issue_call(cpu, state, false, node); + } + +remove: + mutex_lock(&cpuhp_state_mutex); + hlist_del(node); + mutex_unlock(&cpuhp_state_mutex); + put_online_cpus(); + + return 0; +} +EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state * @state: The state to remove @@ -1545,14 +1690,21 @@ EXPORT_SYMBOL(__cpuhp_setup_state); */ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) { - int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); + struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); get_online_cpus(); - if (!invoke || !teardown) + if (sp->multi_instance) { + WARN(!hlist_empty(&sp->list), + "Error: Removing state %d which has instances left.\n", + state); + goto remove; + } + + if (!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* @@ -1565,10 +1717,10 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) int cpustate = st->state; if (cpustate >= state) - cpuhp_issue_call(cpu, state, teardown, false); + cpuhp_issue_call(cpu, state, false, NULL); } remove: - cpuhp_store_callbacks(state, NULL, NULL, NULL); + cpuhp_store_callbacks(state, NULL, NULL, NULL, false); put_online_cpus(); } EXPORT_SYMBOL(__cpuhp_remove_state); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c27e53326bef..2b4c20ab5bbe 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -325,8 +325,7 @@ static struct file_system_type cpuset_fs_type = { /* * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. The top - * cpuset always has some cpus online. + * until we find one that does have some online cpus. * * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. @@ -335,8 +334,20 @@ static struct file_system_type cpuset_fs_type = { */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { - while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { cs = parent_cs(cs); + if (unlikely(!cs)) { + /* + * The top cpuset doesn't have any online cpu as a + * consequence of a race between cpuset_hotplug_work + * and cpu hotplug notifier. But we know the top + * cpuset's effective_cpus is on its way to to be + * identical to cpu_online_mask. + */ + cpumask_copy(pmask, cpu_online_mask); + return; + } + } cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); } @@ -2074,7 +2085,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) * which could have been changed by cpuset just after it inherits the * state from the parent and before it sits on the cgroup's task list. 
*/ -void cpuset_fork(struct task_struct *task) +static void cpuset_fork(struct task_struct *task) { if (task_css_is_root(task, cpuset_cgrp_id)) return; diff --git a/kernel/events/core.c b/kernel/events/core.c index 3cfabdf7b942..c6e47e97b33f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1475,8 +1475,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) { struct list_head *list; - if (is_software_event(event)) - event->group_flags |= PERF_GROUP_SOFTWARE; + event->group_caps = event->event_caps; list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); @@ -1630,9 +1629,7 @@ static void perf_group_attach(struct perf_event *event) WARN_ON_ONCE(group_leader->ctx != event->ctx); - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + group_leader->group_caps &= event->event_caps; list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; @@ -1723,7 +1720,7 @@ static void perf_group_detach(struct perf_event *event) sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ - sibling->group_flags = event->group_flags; + sibling->group_caps = event->group_caps; WARN_ON_ONCE(sibling->ctx != event->ctx); } @@ -1832,6 +1829,8 @@ group_sched_out(struct perf_event *group_event, struct perf_event *event; int state = group_event->state; + perf_pmu_disable(ctx->pmu); + event_sched_out(group_event, cpuctx, ctx); /* @@ -1840,6 +1839,8 @@ group_sched_out(struct perf_event *group_event, list_for_each_entry(event, &group_event->sibling_list, group_entry) event_sched_out(event, cpuctx, ctx); + perf_pmu_enable(ctx->pmu); + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) cpuctx->exclusive = 0; } @@ -2145,7 +2146,7 @@ static int group_can_go_on(struct perf_event *event, /* * Groups consisting entirely of software events can always go on. */ - if (event->group_flags & PERF_GROUP_SOFTWARE) + if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware @@ -2491,16 +2492,16 @@ static int __perf_event_stop(void *info) * while restarting. */ if (sd->restart) - event->pmu->start(event, PERF_EF_START); + event->pmu->start(event, 0); return 0; } -static int perf_event_restart(struct perf_event *event) +static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, - .restart = 1, + .restart = restart, }; int ret = 0; @@ -2837,19 +2838,36 @@ unlock: } } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); + void perf_sched_cb_dec(struct pmu *pmu) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + this_cpu_dec(perf_sched_cb_usages); + + if (!--cpuctx->sched_cb_usage) + list_del(&cpuctx->sched_cb_entry); } + void perf_sched_cb_inc(struct pmu *pmu) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + if (!cpuctx->sched_cb_usage++) + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + this_cpu_inc(perf_sched_cb_usages); } /* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. + * + * This callback is relevant even to per-cpu events; for example multi event + * PEBS requires this to provide PID/TID information. This requires we flush + * all queued PEBS records before we context switch to a new task. 
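The perf_sched_cb_inc()/perf_sched_cb_dec() rework above keeps a per-CPU list of only those contexts that asked for the context-switch callback, so perf_pmu_sched_task() no longer walks every PMU under RCU. The userspace sketch below models that reference-counted registration; the fake_ctx type and the single global list (standing in for the per-CPU one) are assumptions for illustration only.

#include <stdio.h>

#define MAX_CTX 8

/* Stand-in for a perf cpu context that may want sched callbacks. */
struct fake_ctx {
	const char *name;
	int sched_cb_usage;	/* how many users asked for the callback */
};

static struct fake_ctx *cb_list[MAX_CTX];
static int cb_list_len;

/* First user puts the context on the list, last user removes it. */
static void sched_cb_inc(struct fake_ctx *ctx)
{
	if (!ctx->sched_cb_usage++)
		cb_list[cb_list_len++] = ctx;
}

static void sched_cb_dec(struct fake_ctx *ctx)
{
	if (!--ctx->sched_cb_usage) {
		/* compact the array; the kernel unlinks a list node instead */
		for (int i = 0; i < cb_list_len; i++)
			if (cb_list[i] == ctx)
				cb_list[i] = cb_list[--cb_list_len];
	}
}

/* Context switch: only registered contexts get the callback. */
static void sched_task(void)
{
	for (int i = 0; i < cb_list_len; i++)
		printf("sched_task callback for %s\n", cb_list[i]->name);
}

int main(void)
{
	struct fake_ctx a = { "pmu-A", 0 }, b = { "pmu-B", 0 };

	sched_cb_inc(&a);
	sched_cb_inc(&a);	/* second user, stays on the list once */
	sched_cb_inc(&b);
	sched_task();		/* both A and B */

	sched_cb_dec(&b);
	sched_task();		/* only A */
	sched_cb_dec(&a);
	sched_cb_dec(&a);
	return 0;
}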
*/ static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, @@ -2857,34 +2875,24 @@ static void perf_pmu_sched_task(struct task_struct *prev, { struct perf_cpu_context *cpuctx; struct pmu *pmu; - unsigned long flags; if (prev == next) return; - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - if (pmu->sched_task) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { + pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ - perf_pmu_disable(pmu); + if (WARN_ON_ONCE(!pmu->sched_task)) + continue; - pmu->sched_task(cpuctx->task_ctx, sched_in); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(pmu); - perf_pmu_enable(pmu); + pmu->sched_task(cpuctx->task_ctx, sched_in); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + perf_pmu_enable(pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } - - rcu_read_unlock(); - - local_irq_restore(flags); } static void perf_event_switch(struct task_struct *task, @@ -3416,6 +3424,22 @@ struct perf_read_data { int ret; }; +static int find_cpu_to_read(struct perf_event *event, int local_cpu) +{ + int event_cpu = event->oncpu; + u16 local_pkg, event_pkg; + + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { + event_pkg = topology_physical_package_id(event_cpu); + local_pkg = topology_physical_package_id(local_cpu); + + if (event_pkg == local_pkg) + return local_cpu; + } + + return event_cpu; +} + /* * Cross CPU call to read the hardware event */ @@ -3537,7 +3561,7 @@ u64 perf_event_read_local(struct perf_event *event) static int perf_event_read(struct perf_event *event, bool group) { - int ret = 0; + int ret = 0, cpu_to_read, local_cpu; /* * If event is enabled and currently active on a CPU, update the @@ -3549,10 +3573,23 @@ static int perf_event_read(struct perf_event *event, bool group) .group = group, .ret = 0, }; - ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); - /* The event must have been read from an online CPU: */ - WARN_ON_ONCE(ret); - ret = ret ? : data.ret; + + local_cpu = get_cpu(); + cpu_to_read = find_cpu_to_read(event, local_cpu); + put_cpu(); + + /* + * Purposely ignore the smp_call_function_single() return + * value. + * + * If event->oncpu isn't a valid CPU it means the event got + * scheduled out and that will have updated the event count. + * + * Therefore, either way, we'll have an up-to-date event count + * after this. + */ + (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1); + ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; @@ -3921,7 +3958,7 @@ static void exclusive_event_destroy(struct perf_event *event) static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { - if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && + if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) @@ -4837,6 +4874,19 @@ static void ring_buffer_attach(struct perf_event *event, spin_unlock_irqrestore(&rb->event_lock, flags); } + /* + * Avoid racing with perf_mmap_close(AUX): stop the event + * before swizzling the event::rb pointer; if it's getting + * unmapped, its aux_mmap_count will be 0 and it won't + * restart. See the comment in __perf_pmu_output_stop(). 
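find_cpu_to_read() above exploits PERF_EV_CAP_READ_ACTIVE_PKG: when an event may be read from any CPU in its package, the cross-call target becomes the local CPU whenever it shares a package with the CPU the event runs on, avoiding a cross-package IPI. A userspace model with an invented CPU-to-package map is sketched below.

#include <stdio.h>

/* Invented CPU -> physical package map: cpus 0-1 on package 0,
 * cpus 2-3 on package 1. */
static const int cpu_package[] = { 0, 0, 1, 1 };

/* Model of the target selection: prefer the local CPU when the event
 * can be read from any CPU of the package it is running on. */
static int pick_read_cpu(int event_cpu, int local_cpu, int read_active_pkg)
{
	if (read_active_pkg &&
	    cpu_package[event_cpu] == cpu_package[local_cpu])
		return local_cpu;	/* no cross-package IPI needed */

	return event_cpu;
}

int main(void)
{
	printf("%d\n", pick_read_cpu(1, 0, 1));	/* 0: same package */
	printf("%d\n", pick_read_cpu(3, 0, 1));	/* 3: different package */
	printf("%d\n", pick_read_cpu(1, 0, 0));	/* 1: capability not set */
	return 0;
}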
+ * + * Data will inevitably be lost when set_output is done in + * mid-air, but then again, whoever does it like this is + * not in for the data anyway. + */ + if (has_aux(event)) + perf_event_stop(event, 0); + rcu_assign_pointer(event->rb, rb); if (old_rb) { @@ -5329,9 +5379,10 @@ perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) { int bit; + DECLARE_BITMAP(_mask, 64); - for_each_set_bit(bit, (const unsigned long *) &mask, - sizeof(mask) * BITS_PER_BYTE) { + bitmap_from_u64(_mask, mask); + for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { u64 val; val = perf_reg_value(regs, bit); @@ -6112,7 +6163,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } void perf_event_exec(void) @@ -6156,7 +6207,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) /* * In case of inheritance, it will be the parent that links to the - * ring-buffer, but it will be the child that's actually using it: + * ring-buffer, but it will be the child that's actually using it. + * + * We are using event::rb to determine if the event should be stopped, + * however this may race with ring_buffer_attach() (through set_output), + * which will make us skip the event that actually needs to be stopped. + * So ring_buffer_attach() has to stop an aux event before re-assigning + * its rb pointer. */ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); @@ -6670,7 +6727,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } /* @@ -7022,7 +7079,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - event->overflow_handler(event, data, regs); + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -7637,11 +7694,83 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +#ifdef CONFIG_BPF_SYSCALL +static void bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct bpf_perf_event_data_kern ctx = { + .data = data, + .regs = regs, + }; + int ret = 0; + + preempt_disable(); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) + goto out; + rcu_read_lock(); + ret = BPF_PROG_RUN(event->prog, (void *)&ctx); + rcu_read_unlock(); +out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + if (!ret) + return; + + event->orig_overflow_handler(event, data, regs); +} + +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->overflow_handler_context) + /* hw breakpoint or kernel counter */ + return -EINVAL; + + if (event->prog) + return -EEXIST; + + prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + event->prog = prog; + event->orig_overflow_handler = READ_ONCE(event->overflow_handler); + WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); + return 0; +} + +static void perf_event_free_bpf_handler(struct perf_event *event) +{ + struct bpf_prog *prog = event->prog; + + if (!prog) + return; + + WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); + event->prog = NULL; + 
bpf_prog_put(prog); +} +#else +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + return -EOPNOTSUPP; +} +static void perf_event_free_bpf_handler(struct perf_event *event) +{ +} +#endif + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint; struct bpf_prog *prog; + if (event->attr.type == PERF_TYPE_HARDWARE || + event->attr.type == PERF_TYPE_SOFTWARE) + return perf_event_set_bpf_handler(event, prog_fd); + if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; @@ -7682,6 +7811,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event) { struct bpf_prog *prog; + perf_event_free_bpf_handler(event); + if (!event->tp_event) return; @@ -7859,7 +7990,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) mmput(mm); restart: - perf_event_restart(event); + perf_event_stop(event, 1); } /* @@ -8998,6 +9129,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) + if (overflow_handler == bpf_overflow_handler) { + struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto err_ns; + } + event->prog = prog; + event->orig_overflow_handler = + parent_event->orig_overflow_handler; + } +#endif } if (overflow_handler) { @@ -9478,6 +9622,9 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if (pmu->task_ctx_nr == perf_sw_context) + event->event_caps |= PERF_EV_CAP_SOFTWARE; + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -9491,7 +9638,7 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = group_leader->pmu; } else if (is_software_event(group_leader) && - (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to @@ -10426,6 +10573,8 @@ static void __init perf_event_init_all_cpus(void) INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); + + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ae9b90dc9a5a..257fa460b846 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!rb) return NULL; - if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + if (!rb_has_aux(rb)) goto err; /* - * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), - * the aux buffer is in perf_mmap_close(), about to get freed. + * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), + * about to get freed, so we leave immediately. + * + * Checking rb::aux_mmap_count and rb::refcount has to be done in + * the same order, see perf_mmap_close. Otherwise we end up freeing + * aux pages in this path, which is a bug, because in_atomic(). 
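The perf_aux_output_begin() reordering above is about never dropping the last AUX reference in atomic context: the output path checks aux_mmap_count before it tries to take aux_refcount, mirroring the order used by perf_mmap_close(), so the final free stays on the munmap side. The C11 sketch below shows just that acquire pattern; the two counters are plain stand-ins for the ring-buffer fields, not the perf API.

#include <stdio.h>
#include <stdatomic.h>

/* Stand-ins for rb->aux_mmap_count and rb->aux_refcount. */
static atomic_int aux_mmap_count = 1;
static atomic_int aux_refcount   = 1;

/* Output-path acquire: bail out if the area is being unmapped, and only
 * then try to take a reference (inc-not-zero). */
static int aux_output_get(void)
{
	if (atomic_load(&aux_mmap_count) == 0)
		return 0;		/* the unmap path owns the final free */

	int old = atomic_load(&aux_refcount);
	do {
		if (old == 0)
			return 0;	/* last reference already gone */
	} while (!atomic_compare_exchange_weak(&aux_refcount, &old, old + 1));

	return 1;
}

static void aux_output_put(void)
{
	if (atomic_fetch_sub(&aux_refcount, 1) == 1)
		printf("last reference dropped, AUX pages freed here\n");
}

int main(void)
{
	if (aux_output_get()) {
		printf("writing AUX data\n");
		aux_output_put();
	}
	return 0;
}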
*/ if (!atomic_read(&rb->aux_mmap_count)) - goto err_put; + goto err; + + if (!atomic_inc_not_zero(&rb->aux_refcount)) + goto err; /* * Nesting is not supported for AUX area, make sure nested diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8c50276b60d1..d4129bb05e5d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -150,7 +150,7 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) * Returns 0 on success, -EFAULT on failure. */ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - struct page *page, struct page *kpage) + struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; @@ -161,49 +161,49 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; - err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, false); if (err) return err; /* For try_to_free_swap() and munlock_vma_page() below */ - lock_page(page); + lock_page(old_page); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; - ptep = page_check_address(page, mm, addr, &ptl, 0); + ptep = page_check_address(old_page, mm, addr, &ptl, 0); if (!ptep) { - mem_cgroup_cancel_charge(kpage, memcg, false); + mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } - get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr, false); - mem_cgroup_commit_charge(kpage, memcg, false, false); - lru_cache_add_active_or_unevictable(kpage, vma); + get_page(new_page); + page_add_new_anon_rmap(new_page, vma, addr, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); + lru_cache_add_active_or_unevictable(new_page, vma); - if (!PageAnon(page)) { - dec_mm_counter(mm, mm_counter_file(page)); + if (!PageAnon(old_page)) { + dec_mm_counter(mm, mm_counter_file(old_page)); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); + set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(page, false); - if (!page_mapped(page)) - try_to_free_swap(page); + page_remove_rmap(old_page, false); + if (!page_mapped(old_page)) + try_to_free_swap(old_page); pte_unmap_unlock(ptep, ptl); if (vma->vm_flags & VM_LOCKED) - munlock_vma_page(page); - put_page(page); + munlock_vma_page(old_page); + put_page(old_page); err = 0; unlock: mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - unlock_page(page); + unlock_page(old_page); return err; } diff --git a/kernel/exit.c b/kernel/exit.c index 091a78be3b09..9d68c45ebbe3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -511,7 +511,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(tsk); + exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) @@ -725,7 +725,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -void do_exit(long code) +void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -882,29 +882,7 @@ void do_exit(long code) exit_rcu(); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two 
conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - smp_mb(); - raw_spin_unlock_wait(&tsk->pi_lock); - - /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = TASK_DEAD; - tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) - cpu_relax(); /* For when BUG is null */ + do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/fork.c b/kernel/fork.c index beb31725f7e2..6d42242485cb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack) * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ -# if THREAD_SIZE >= PAGE_SIZE -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, - int node) +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) + +#ifdef CONFIG_VMAP_STACK +/* + * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB + * flush. Try to minimize the number of calls by caching stacks. + */ +#define NR_CACHED_STACKS 2 +static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +#endif + +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { +#ifdef CONFIG_VMAP_STACK + void *stack; + int i; + + local_irq_disable(); + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *s = this_cpu_read(cached_stacks[i]); + + if (!s) + continue; + this_cpu_write(cached_stacks[i], NULL); + + tsk->stack_vm_area = s; + local_irq_enable(); + return s->addr; + } + local_irq_enable(); + + stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP | __GFP_HIGHMEM, + PAGE_KERNEL, + 0, node, __builtin_return_address(0)); + + /* + * We can't call find_vm_area() in interrupt context, and + * free_thread_stack() can be called in interrupt context, + * so cache the vm_struct. + */ + if (stack) + tsk->stack_vm_area = find_vm_area(stack); + return stack; +#else struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); return page ? 
page_address(page) : NULL; +#endif } -static inline void free_thread_stack(unsigned long *stack) +static inline void free_thread_stack(struct task_struct *tsk) { - __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); +#ifdef CONFIG_VMAP_STACK + if (task_stack_vm_area(tsk)) { + unsigned long flags; + int i; + + local_irq_save(flags); + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_read(cached_stacks[i])) + continue; + + this_cpu_write(cached_stacks[i], tsk->stack_vm_area); + local_irq_restore(flags); + return; + } + local_irq_restore(flags); + + vfree(tsk->stack); + return; + } +#endif + + __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; @@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_stack(unsigned long *stack) +static void free_thread_stack(struct task_struct *tsk) { - kmem_cache_free(thread_stack_cache, stack); + kmem_cache_free(thread_stack_cache, tsk->stack); } void thread_stack_cache_init(void) @@ -213,24 +277,76 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(unsigned long *stack, int account) +static void account_kernel_stack(struct task_struct *tsk, int account) { - /* All stack pages are in the same zone and belong to the same memcg. */ - struct page *first_page = virt_to_page(stack); + void *stack = task_stack_page(tsk); + struct vm_struct *vm = task_stack_vm_area(tsk); + + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + + if (vm) { + int i; + + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_zone_page_state(page_zone(vm->pages[i]), + NR_KERNEL_STACK_KB, + PAGE_SIZE / 1024 * account); + } + + /* All stack pages belong to the same memcg. */ + memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } else { + /* + * All stack pages are in the same zone and belong to the + * same memcg. + */ + struct page *first_page = virt_to_page(stack); - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, - THREAD_SIZE / 1024 * account); + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat( - first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } } -void free_task(struct task_struct *tsk) +static void release_task_stack(struct task_struct *tsk) { - account_kernel_stack(tsk->stack, -1); + account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); - free_thread_stack(tsk->stack); + free_thread_stack(tsk); + tsk->stack = NULL; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = NULL; +#endif +} + +#ifdef CONFIG_THREAD_INFO_IN_TASK +void put_task_stack(struct task_struct *tsk) +{ + if (atomic_dec_and_test(&tsk->stack_refcount)) + release_task_stack(tsk); +} +#endif + +void free_task(struct task_struct *tsk) +{ +#ifndef CONFIG_THREAD_INFO_IN_TASK + /* + * The task is finally done with both the stack and thread_info, + * so free both. + */ + release_task_stack(tsk); +#else + /* + * If the task had a separate stack allocation, it should be gone + * by now. 
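/*
 * A minimal userspace sketch of the stack-cache idea above: freed stacks
 * are parked in a small per-CPU array (NR_CACHED_STACKS slots) so that a
 * later fork can reuse them instead of going back to vmalloc()/vfree().
 * The toy_* names and the malloc()-backed "stacks" are illustrative only,
 * not kernel API; the real code also disables interrupts around the
 * per-CPU accesses.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_CACHED 2
#define STACK_SIZE (16 * 1024)

static void *cached[NR_CACHED];         /* per-CPU in the kernel; global here */

static void *toy_alloc_stack(void)
{
        for (int i = 0; i < NR_CACHED; i++) {
                if (cached[i]) {
                        void *s = cached[i];    /* fast path: reuse a cached stack */
                        cached[i] = NULL;
                        return s;
                }
        }
        return malloc(STACK_SIZE);              /* slow path: fresh allocation */
}

static void toy_free_stack(void *stack)
{
        for (int i = 0; i < NR_CACHED; i++) {
                if (!cached[i]) {
                        cached[i] = stack;      /* park it for the next fork */
                        return;
                }
        }
        free(stack);                            /* cache full: really free it */
}

int main(void)
{
        void *a = toy_alloc_stack();
        void *b = toy_alloc_stack();

        toy_free_stack(a);
        void *c = toy_alloc_stack();            /* comes back out of the cache */
        printf("reused: %s\n", c == a ? "yes" : "no");
        toy_free_stack(b);
        toy_free_stack(c);
        return 0;
}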
+ */ + WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); +#endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -243,6 +359,12 @@ static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); + /* + * __mmdrop is not safe to call from softirq context on x86 due to + * pgd_dtor so postpone it to the async context + */ + if (sig->oom_mm) + mmdrop_async(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } @@ -302,6 +424,7 @@ int arch_task_struct_size __read_mostly; void __init fork_init(void) { + int i; #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -321,6 +444,10 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + for (i = 0; i < UCOUNT_COUNTS; i++) { + init_user_ns.ucount_max[i] = max_threads/2; + } } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -342,6 +469,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; unsigned long *stack; + struct vm_struct *stack_vm_area; int err; if (node == NUMA_NO_NODE) @@ -354,11 +482,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + stack_vm_area = task_stack_vm_area(tsk); + err = arch_dup_task_struct(tsk, orig); + + /* + * arch_dup_task_struct() clobbers the stack-related fields. Make + * sure they're properly initialized before using any stack-related + * functions again. + */ + tsk->stack = stack; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = stack_vm_area; +#endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + atomic_set(&tsk->stack_refcount, 1); +#endif + if (err) goto free_stack; - tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -390,14 +533,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(stack, 1); + account_kernel_stack(tsk, 1); kcov_task_init(tsk); return tsk; free_stack: - free_thread_stack(stack); + free_thread_stack(tsk); free_tsk: free_task_struct(tsk); return NULL; @@ -711,6 +854,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); @@ -719,6 +863,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + set_bit(MMF_OOM_SKIP, &mm->flags); mmdrop(mm); } @@ -1715,6 +1860,7 @@ bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: + put_task_stack(p); free_task(p); fork_out: return ERR_PTR(retval); diff --git a/kernel/futex.c b/kernel/futex.c index 46cb3a301bc1..2c4be467fecd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -381,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) #endif } -/* - * We hash on the keys returned from get_futex_key (see below). +/** + * hash_futex - Return the hash bucket in the global hash + * @key: Pointer to the futex key for which the hash is calculated + * + * We hash on the keys returned from get_futex_key (see below) and return the + * corresponding hash bucket in the global hash. 
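/*
 * Illustration of the bucket lookup documented above and implemented just
 * below: with futex_hashsize a power of two, "hash & (size - 1)" picks a
 * bucket without a division.  The mixer below is a stand-in for the
 * Jenkins hash the kernel uses over the futex key; all toy_* names are
 * illustrative, not kernel API.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_HASH_SIZE 256                       /* must stay a power of two */

struct toy_bucket { int waiters; };
static struct toy_bucket toy_queues[TOY_HASH_SIZE];

static uint32_t toy_hash(uint64_t key)
{
        key ^= key >> 33;                       /* cheap 64->32 bit mixer */
        key *= 0xff51afd7ed558ccdULL;
        key ^= key >> 33;
        return (uint32_t)key;
}

static struct toy_bucket *toy_hash_futex(uint64_t key)
{
        return &toy_queues[toy_hash(key) & (TOY_HASH_SIZE - 1)];
}

int main(void)
{
        printf("key 0x1000 -> bucket %td\n", toy_hash_futex(0x1000) - toy_queues);
        printf("key 0x2000 -> bucket %td\n", toy_hash_futex(0x2000) - toy_queues);
        return 0;
}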
*/ static struct futex_hash_bucket *hash_futex(union futex_key *key) { @@ -392,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) return &futex_queues[hash & (futex_hashsize - 1)]; } -/* + +/** + * match_futex - Check whether two futex keys are equal + * @key1: Pointer to key1 + * @key2: Pointer to key2 + * * Return 1 if two futex_keys are equal, 0 otherwise. */ static inline int match_futex(union futex_key *key1, union futex_key *key2) diff --git a/kernel/groups.c b/kernel/groups.c index 74d431d25251..2fcadd66a8fd 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -7,55 +7,31 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/user_namespace.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> struct group_info *groups_alloc(int gidsetsize) { - struct group_info *group_info; - int nblocks; - int i; - - nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; - /* Make sure we always allocate at least one indirect block pointer */ - nblocks = nblocks ? : 1; - group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); - if (!group_info) + struct group_info *gi; + unsigned int len; + + len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; + gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); + if (!gi) + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); + if (!gi) return NULL; - group_info->ngroups = gidsetsize; - group_info->nblocks = nblocks; - atomic_set(&group_info->usage, 1); - - if (gidsetsize <= NGROUPS_SMALL) - group_info->blocks[0] = group_info->small_block; - else { - for (i = 0; i < nblocks; i++) { - kgid_t *b; - b = (void *)__get_free_page(GFP_USER); - if (!b) - goto out_undo_partial_alloc; - group_info->blocks[i] = b; - } - } - return group_info; -out_undo_partial_alloc: - while (--i >= 0) { - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); - return NULL; + atomic_set(&gi->usage, 1); + gi->ngroups = gidsetsize; + return gi; } EXPORT_SYMBOL(groups_alloc); void groups_free(struct group_info *group_info) { - if (group_info->blocks[0] != group_info->small_block) { - int i; - for (i = 0; i < group_info->nblocks; i++) - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); + kvfree(group_info); } EXPORT_SYMBOL(groups_free); @@ -70,7 +46,7 @@ static int groups_to_user(gid_t __user *grouplist, for (i = 0; i < count; i++) { gid_t gid; - gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); + gid = from_kgid_munged(user_ns, group_info->gid[i]); if (put_user(gid, grouplist+i)) return -EFAULT; } @@ -95,7 +71,7 @@ static int groups_from_user(struct group_info *group_info, if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; } @@ -115,15 +91,14 @@ static void groups_sort(struct group_info *group_info) for (base = 0; base < max; base++) { int left = base; int right = left + stride; - kgid_t tmp = GROUP_AT(group_info, right); + kgid_t tmp = group_info->gid[right]; - while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { - GROUP_AT(group_info, right) = - GROUP_AT(group_info, left); + while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { + group_info->gid[right] = group_info->gid[left]; right = left; left -= stride; } - GROUP_AT(group_info, right) = tmp; + group_info->gid[right] = tmp; } stride /= 3; } @@ -141,9 +116,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp) right = group_info->ngroups; while (left < right) { unsigned 
int mid = (left+right)/2; - if (gid_gt(grp, GROUP_AT(group_info, mid))) + if (gid_gt(grp, group_info->gid[mid])) left = mid + 1; - else if (gid_lt(grp, GROUP_AT(group_info, mid))) + else if (gid_lt(grp, group_info->gid[mid])) right = mid; else return 1; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d234022805dc..432c3d71d195 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -117,7 +117,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_held_locks(t); + debug_show_all_locks(); touch_nmi_watchdog(); diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 32f6cfcff212..17f51d63da56 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -4,60 +4,151 @@ #include <linux/slab.h> #include <linux/cpu.h> -static int get_first_sibling(unsigned int cpu) +static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, + int cpus_per_vec) { - unsigned int ret; + const struct cpumask *siblmsk; + int cpu, sibl; - ret = cpumask_first(topology_sibling_cpumask(cpu)); - if (ret < nr_cpu_ids) - return ret; - return cpu; + for ( ; cpus_per_vec > 0; ) { + cpu = cpumask_first(nmsk); + + /* Should not happen, but I'm too lazy to think about it */ + if (cpu >= nr_cpu_ids) + return; + + cpumask_clear_cpu(cpu, nmsk); + cpumask_set_cpu(cpu, irqmsk); + cpus_per_vec--; + + /* If the cpu has siblings, use them first */ + siblmsk = topology_sibling_cpumask(cpu); + for (sibl = -1; cpus_per_vec > 0; ) { + sibl = cpumask_next(sibl, siblmsk); + if (sibl >= nr_cpu_ids) + break; + if (!cpumask_test_and_clear_cpu(sibl, nmsk)) + continue; + cpumask_set_cpu(sibl, irqmsk); + cpus_per_vec--; + } + } +} + +static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) +{ + int n, nodes; + + /* Calculate the number of nodes in the supplied affinity mask */ + for (n = 0, nodes = 0; n < num_online_nodes(); n++) { + if (cpumask_intersects(mask, cpumask_of_node(n))) { + node_set(n, *nodemsk); + nodes++; + } + } + return nodes; } -/* - * Take a map of online CPUs and the number of available interrupt vectors - * and generate an output cpumask suitable for spreading MSI/MSI-X vectors - * so that they are distributed as good as possible around the CPUs. If - * more vectors than CPUs are available we'll map one to each CPU, - * otherwise we map one to the first sibling of each socket. +/** + * irq_create_affinity_masks - Create affinity masks for multiqueue spreading + * @affinity: The affinity mask to spread. If NULL cpu_online_mask + * is used + * @nvecs: The number of vectors * - * If there are more vectors than CPUs we will still only have one bit - * set per CPU, but interrupt code will keep on assigning the vectors from - * the start of the bitmap until we run out of vectors. + * Returns the masks pointer or NULL if allocation failed. 
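/*
 * The spreading below boils down to dividing N things as evenly as
 * possible into K groups: integer division gives the base share and the
 * remainder is absorbed by handing one extra to the first few groups (in
 * the kernel loop that shows up as the extra_vecs/cpus_per_vec
 * adjustments).  A self-contained illustration of that split; plain
 * arithmetic only, no kernel API:
 */
#include <stdio.h>

static void toy_spread(const char *what, int nitems, int ngroups)
{
        int base = nitems / ngroups;
        int extra = nitems % ngroups;   /* same as nitems - ngroups * base */

        for (int g = 0; g < ngroups; g++)
                printf("%s group %d gets %d\n", what, g,
                       base + (g < extra ? 1 : 0));
}

int main(void)
{
        toy_spread("node", 16, 3);      /* 16 vectors over 3 nodes: 6, 5, 5  */
        toy_spread("vector", 7, 4);     /* 7 CPUs over 4 vectors: 2, 2, 2, 1 */
        return 0;
}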
*/ -struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) +struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, + int nvec) { - struct cpumask *affinity_mask; - unsigned int max_vecs = *nr_vecs; + int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0; + nodemask_t nodemsk = NODE_MASK_NONE; + struct cpumask *masks; + cpumask_var_t nmsk; - if (max_vecs == 1) + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!affinity_mask) { - *nr_vecs = 1; - return NULL; - } + masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL); + if (!masks) + goto out; + /* Stabilize the cpumasks */ get_online_cpus(); - if (max_vecs >= num_online_cpus()) { - cpumask_copy(affinity_mask, cpu_online_mask); - *nr_vecs = num_online_cpus(); - } else { - unsigned int vecs = 0, cpu; - - for_each_online_cpu(cpu) { - if (cpu == get_first_sibling(cpu)) { - cpumask_set_cpu(cpu, affinity_mask); - vecs++; - } + /* If the supplied affinity mask is NULL, use cpu online mask */ + if (!affinity) + affinity = cpu_online_mask; + + nodes = get_nodes_in_cpumask(affinity, &nodemsk); - if (--max_vecs == 0) + /* + * If the number of nodes in the mask is less than or equal the + * number of vectors we just spread the vectors across the nodes. + */ + if (nvec <= nodes) { + for_each_node_mask(n, nodemsk) { + cpumask_copy(masks + curvec, cpumask_of_node(n)); + if (++curvec == nvec) break; } - *nr_vecs = vecs; + goto outonl; } + + /* Spread the vectors per node */ + vecs_per_node = nvec / nodes; + /* Account for rounding errors */ + extra_vecs = nvec - (nodes * vecs_per_node); + + for_each_node_mask(n, nodemsk) { + int ncpus, v, vecs_to_assign = vecs_per_node; + + /* Get the cpus on this node which are in the mask */ + cpumask_and(nmsk, affinity, cpumask_of_node(n)); + + /* Calculate the number of cpus per vector */ + ncpus = cpumask_weight(nmsk); + + for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) { + cpus_per_vec = ncpus / vecs_to_assign; + + /* Account for extra vectors to compensate rounding errors */ + if (extra_vecs) { + cpus_per_vec++; + if (!--extra_vecs) + vecs_per_node++; + } + irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); + } + + if (curvec >= nvec) + break; + } + +outonl: put_online_cpus(); +out: + free_cpumask_var(nmsk); + return masks; +} + +/** + * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask + * @affinity: The affinity mask to spread. If NULL cpu_online_mask + * is used + * @maxvec: The maximum number of vectors available + */ +int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) +{ + int cpus, ret; - return affinity_mask; + /* Stabilize the cpumasks */ + get_online_cpus(); + /* If the supplied affinity mask is NULL, use cpu online mask */ + if (!affinity) + affinity = cpu_online_mask; + + cpus = cpumask_weight(affinity); + ret = (cpus < maxvec) ? 
cpus : maxvec; + + put_online_cpus(); + return ret; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 637389088b3f..be3c34e4f2ac 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -76,7 +76,6 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) if (!desc) return -EINVAL; - type &= IRQ_TYPE_SENSE_MASK; ret = __irq_set_trigger(desc, type); irq_put_desc_busunlock(desc, flags); return ret; @@ -756,7 +755,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; - void *dev_id = raw_cpu_ptr(action->percpu_dev_id); unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; @@ -765,15 +763,26 @@ void handle_percpu_devid_irq(struct irq_desc *desc) if (chip->irq_ack) chip->irq_ack(&desc->irq_data); - trace_irq_handler_entry(irq, action); - res = action->handler(irq, dev_id); - trace_irq_handler_exit(irq, action, res); + if (likely(action)) { + trace_irq_handler_entry(irq, action); + res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); + trace_irq_handler_exit(irq, action, res); + } else { + unsigned int cpu = smp_processor_id(); + bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); + + if (enabled) + irq_percpu_disable(desc, cpu); + + pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n", + enabled ? " and unmasked" : "", irq, cpu); + } if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); } -void +static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) { @@ -820,6 +829,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, desc->name = name; if (handle != handle_bad_irq && is_chained) { + unsigned int type = irqd_get_trigger_type(&desc->irq_data); + /* * We're about to start this interrupt immediately, * hence the need to set the trigger configuration. @@ -828,8 +839,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, * chained interrupt. Reset it immediately because we * do know better. 
*/ - __irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data)); - desc->handle_irq = handle; + if (type != IRQ_TYPE_NONE) { + __irq_set_trigger(desc, type); + desc->handle_irq = handle; + } irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index abd286afbd27..ee32870079c9 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -260,9 +260,9 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } /** - * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain + * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain * @d: irq domain for which to allocate chips - * @irqs_per_chip: Number of interrupts each chip handles + * @irqs_per_chip: Number of interrupts each chip handles (max 32) * @num_ct: Number of irq_chip_type instances associated with this * @name: Name of the irq chip * @handler: Default flow handler associated with these chips @@ -270,11 +270,11 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) * @set: IRQ_* bits to set in the mapping function * @gcflags: Generic chip specific setup flags */ -int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, - int num_ct, const char *name, - irq_flow_handler_t handler, - unsigned int clr, unsigned int set, - enum irq_gc_flags gcflags) +int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, + int num_ct, const char *name, + irq_flow_handler_t handler, + unsigned int clr, unsigned int set, + enum irq_gc_flags gcflags) { struct irq_domain_chip_generic *dgc; struct irq_chip_generic *gc; @@ -326,7 +326,21 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, d->name = name; return 0; } -EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); +EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); + +static struct irq_chip_generic * +__irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) +{ + struct irq_domain_chip_generic *dgc = d->gc; + int idx; + + if (!dgc) + return ERR_PTR(-ENODEV); + idx = hw_irq / dgc->irqs_per_chip; + if (idx >= dgc->num_chips) + return ERR_PTR(-EINVAL); + return dgc->gc[idx]; +} /** * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq @@ -336,15 +350,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); struct irq_chip_generic * irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) { - struct irq_domain_chip_generic *dgc = d->gc; - int idx; + struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq); - if (!dgc) - return NULL; - idx = hw_irq / dgc->irqs_per_chip; - if (idx >= dgc->num_chips) - return NULL; - return dgc->gc[idx]; + return !IS_ERR(gc) ? 
gc : NULL; } EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); @@ -368,13 +376,9 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, unsigned long flags; int idx; - if (!d->gc) - return -ENODEV; - - idx = hw_irq / dgc->irqs_per_chip; - if (idx >= dgc->num_chips) - return -EINVAL; - gc = dgc->gc[idx]; + gc = __irq_get_domain_generic_chip(d, hw_irq); + if (IS_ERR(gc)) + return PTR_ERR(gc); idx = hw_irq % dgc->irqs_per_chip; @@ -409,10 +413,30 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); return 0; } -EXPORT_SYMBOL_GPL(irq_map_generic_chip); + +static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) +{ + struct irq_data *data = irq_domain_get_irq_data(d, virq); + struct irq_domain_chip_generic *dgc = d->gc; + unsigned int hw_irq = data->hwirq; + struct irq_chip_generic *gc; + int irq_idx; + + gc = irq_get_domain_generic_chip(d, hw_irq); + if (!gc) + return; + + irq_idx = hw_irq % dgc->irqs_per_chip; + + clear_bit(irq_idx, &gc->installed); + irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL, + NULL); + +} struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, + .unmap = irq_unmap_generic_chip, .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_generic_chip_ops); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a623b44f2d4b..00bb0aeea1d0 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -15,6 +15,7 @@ #include <linux/radix-tree.h> #include <linux/bitmap.h> #include <linux/irqdomain.h> +#include <linux/sysfs.h> #include "internals.h" @@ -123,6 +124,181 @@ static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); #ifdef CONFIG_SPARSE_IRQ +static void irq_kobj_release(struct kobject *kobj); + +#ifdef CONFIG_SYSFS +static struct kobject *irq_kobj_base; + +#define IRQ_ATTR_RO(_name) \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +static ssize_t per_cpu_count_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + int cpu, irq = desc->irq_data.irq; + ssize_t ret = 0; + char *p = ""; + + for_each_possible_cpu(cpu) { + unsigned int c = kstat_irqs_cpu(irq, cpu); + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); + p = ","; + } + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; +} +IRQ_ATTR_RO(per_cpu_count); + +static ssize_t chip_name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->irq_data.chip && desc->irq_data.chip->name) { + ret = scnprintf(buf, PAGE_SIZE, "%s\n", + desc->irq_data.chip->name); + } + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(chip_name); + +static ssize_t hwirq_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->irq_data.domain) + ret = sprintf(buf, "%d\n", (int)desc->irq_data.hwirq); + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(hwirq); + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + ret = sprintf(buf, "%s\n", + 
irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); + raw_spin_unlock_irq(&desc->lock); + + return ret; + +} +IRQ_ATTR_RO(type); + +static ssize_t name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + if (desc->name) + ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); + raw_spin_unlock_irq(&desc->lock); + + return ret; +} +IRQ_ATTR_RO(name); + +static ssize_t actions_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + struct irqaction *action; + ssize_t ret = 0; + char *p = ""; + + raw_spin_lock_irq(&desc->lock); + for (action = desc->action; action != NULL; action = action->next) { + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + p, action->name); + p = ","; + } + raw_spin_unlock_irq(&desc->lock); + + if (ret) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + + return ret; +} +IRQ_ATTR_RO(actions); + +static struct attribute *irq_attrs[] = { + &per_cpu_count_attr.attr, + &chip_name_attr.attr, + &hwirq_attr.attr, + &type_attr.attr, + &name_attr.attr, + &actions_attr.attr, + NULL +}; + +static struct kobj_type irq_kobj_type = { + .release = irq_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = irq_attrs, +}; + +static void irq_sysfs_add(int irq, struct irq_desc *desc) +{ + if (irq_kobj_base) { + /* + * Continue even in case of failure as this is nothing + * crucial. + */ + if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) + pr_warn("Failed to add kobject for irq %d\n", irq); + } +} + +static int __init irq_sysfs_init(void) +{ + struct irq_desc *desc; + int irq; + + /* Prevent concurrent irq alloc/free */ + irq_lock_sparse(); + + irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); + if (!irq_kobj_base) { + irq_unlock_sparse(); + return -ENOMEM; + } + + /* Add the already allocated interrupts */ + for_each_irq_desc(irq, desc) + irq_sysfs_add(irq, desc); + irq_unlock_sparse(); + + return 0; +} +postcore_initcall(irq_sysfs_init); + +#else /* !CONFIG_SYSFS */ + +static struct kobj_type irq_kobj_type = { + .release = irq_kobj_release, +}; + +static void irq_sysfs_add(int irq, struct irq_desc *desc) {} + +#endif /* CONFIG_SYSFS */ + static RADIX_TREE(irq_desc_tree, GFP_KERNEL); static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) @@ -187,6 +363,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); + kobject_init(&desc->kobj, &irq_kobj_type); return desc; @@ -197,15 +374,22 @@ err_desc: return NULL; } -static void delayed_free_desc(struct rcu_head *rhp) +static void irq_kobj_release(struct kobject *kobj) { - struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); free_masks(desc); free_percpu(desc->kstat_irqs); kfree(desc); } +static void delayed_free_desc(struct rcu_head *rhp) +{ + struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + + kobject_put(&desc->kobj); +} + static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -217,8 +401,12 @@ static void free_desc(unsigned int irq) * kstat_irq_usr(). Once we deleted the descriptor from the * sparse tree we can free it. Access in proc will fail to * lookup the descriptor. 
+ * + * The sysfs entry must be serialized against a concurrent + * irq_sysfs_init() as well. */ mutex_lock(&sparse_irq_lock); + kobject_del(&desc->kobj); delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); @@ -236,31 +424,31 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct cpumask *mask = NULL; struct irq_desc *desc; unsigned int flags; - int i, cpu = -1; + int i; - if (affinity && cpumask_empty(affinity)) - return -EINVAL; + /* Validate affinity mask(s) */ + if (affinity) { + for (i = 0, mask = affinity; i < cnt; i++, mask++) { + if (cpumask_empty(mask)) + return -EINVAL; + } + } flags = affinity ? IRQD_AFFINITY_MANAGED : 0; + mask = NULL; for (i = 0; i < cnt; i++) { if (affinity) { - cpu = cpumask_next(cpu, affinity); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(affinity); - node = cpu_to_node(cpu); - - /* - * For single allocations we use the caller provided - * mask otherwise we use the mask of the target cpu - */ - mask = cnt == 1 ? affinity : cpumask_of(cpu); + node = cpu_to_node(cpumask_first(affinity)); + mask = affinity; + affinity++; } desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; mutex_lock(&sparse_irq_lock); irq_insert_desc(start + i, desc); + irq_sysfs_add(start + i, desc); mutex_unlock(&sparse_irq_lock); } return start; @@ -481,9 +669,9 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) - * @affinity: Optional pointer to an affinity mask which hints where the - * irq descriptors should be allocated and which default - * affinities to use + * @affinity: Optional pointer to an affinity mask array of size @cnt which + * hints where the irq descriptors should be allocated and which + * default affinities to use * * Returns the first irq number or error code */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4752b43662e0..8c0a0ae43521 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -80,7 +80,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); /** * __irq_domain_add() - Allocate a new irq_domain data structure - * @of_node: optional device-tree node of the interrupt controller + * @fwnode: firmware node for the interrupt controller * @size: Size of linear map; 0 for radix mapping only * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no @@ -96,10 +96,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, const struct irq_domain_ops *ops, void *host_data) { + struct device_node *of_node = to_of_node(fwnode); struct irq_domain *domain; - struct device_node *of_node; - - of_node = to_of_node(fwnode); domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); @@ -868,7 +866,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; - *out_type = (intsize > 1) ? 
intspec[1] : IRQ_TYPE_NONE; + if (intsize > 1) + *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; + else + *out_type = IRQ_TYPE_NONE; return 0; } EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9530fcd27704..0c5f1a5db654 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -669,8 +669,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) return 0; } - flags &= IRQ_TYPE_SENSE_MASK; - if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); @@ -678,7 +676,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) unmask = 1; } - /* caller masked out all except trigger mode flags */ + /* Mask all flags except trigger mode */ + flags &= IRQ_TYPE_SENSE_MASK; ret = chip->irq_set_type(&desc->irq_data, flags); switch (ret) { diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 19e9dfbe97fa..8a3e872798f3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -18,20 +18,42 @@ /* Temparory solution for building, will be removed later */ #include <linux/pci.h> -struct msi_desc *alloc_msi_entry(struct device *dev) +/** + * alloc_msi_entry - Allocate an initialize msi_entry + * @dev: Pointer to the device for which this is allocated + * @nvec: The number of vectors used in this entry + * @affinity: Optional pointer to an affinity mask array size of @nvec + * + * If @affinity is not NULL then a an affinity array[@nvec] is allocated + * and the affinity masks from @affinity are copied. + */ +struct msi_desc * +alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity) { - struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); + struct msi_desc *desc; + + desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) return NULL; INIT_LIST_HEAD(&desc->list); desc->dev = dev; + desc->nvec_used = nvec; + if (affinity) { + desc->affinity = kmemdup(affinity, + nvec * sizeof(*desc->affinity), GFP_KERNEL); + if (!desc->affinity) { + kfree(desc); + return NULL; + } + } return desc; } void free_msi_entry(struct msi_desc *entry) { + kfree(entry->affinity); kfree(entry); } diff --git a/kernel/kthread.c b/kernel/kthread.c index 9ff173dca1ae..4ab4c3766a80 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -64,7 +64,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) static struct kthread *to_live_kthread(struct task_struct *k) { struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork)) + if (likely(vfork) && try_get_task_stack(k)) return __to_kthread(vfork); return NULL; } @@ -425,8 +425,10 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_live_kthread(k); - if (kthread) + if (kthread) { __kthread_unpark(k, kthread); + put_task_stack(k); + } } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -455,6 +457,7 @@ int kthread_park(struct task_struct *k) wait_for_completion(&kthread->parked); } } + put_task_stack(k); ret = 0; } return ret; @@ -490,6 +493,7 @@ int kthread_stop(struct task_struct *k) __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); + put_task_stack(k); } ret = k->exit_code; put_task_struct(k); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 8bbe50704621..af4643873e71 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -274,7 +274,6 @@ static int klp_write_object_relocations(struct module *pmod, objname = klp_is_module(obj) ? 
obj->name : "vmlinux"; - module_disable_ro(pmod); /* For each klp relocation section */ for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { sec = pmod->klp_info->sechdrs + i; @@ -309,7 +308,6 @@ static int klp_write_object_relocations(struct module *pmod, break; } - module_enable_ro(pmod, true); return ret; } @@ -547,9 +545,6 @@ static int __klp_enable_patch(struct klp_patch *patch) list_prev_entry(patch, list)->state == KLP_DISABLED) return -EBUSY; - pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); - add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); - pr_notice("enabling patch '%s'\n", patch->mod->name); klp_for_each_object(patch, obj) { @@ -763,6 +758,12 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->old_sympos ? func->old_sympos : 1); } +/* Arches may override this to finish any remaining arch-specific tasks */ +void __weak arch_klp_init_object_loaded(struct klp_patch *patch, + struct klp_object *obj) +{ +} + /* parts of the initialization that is done only when the object is loaded */ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_object *obj) @@ -770,9 +771,15 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; + module_disable_ro(patch->mod); ret = klp_write_object_relocations(patch->mod, obj); - if (ret) + if (ret) { + module_enable_ro(patch->mod, true); return ret; + } + + arch_klp_init_object_loaded(patch, obj); + module_enable_ro(patch->mod, true); klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 31322a4275cd..6f88e352cd4f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o -obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c deleted file mode 100644 index 951cfcd10b4a..000000000000 --- a/kernel/locking/lglock.c +++ /dev/null @@ -1,111 +0,0 @@ -/* See include/linux/lglock.h for description */ -#include <linux/module.h> -#include <linux/lglock.h> -#include <linux/cpu.h> -#include <linux/string.h> - -/* - * Note there is no uninit, so lglocks cannot be defined in - * modules (but it's fine to use them from there) - * Could be added though, just undo lg_lock_init - */ - -void lg_lock_init(struct lglock *lg, char *name) -{ - LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); -} -EXPORT_SYMBOL(lg_lock_init); - -void lg_local_lock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock); - -void lg_local_unlock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock); - -void lg_local_lock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock_cpu); - -void lg_local_unlock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - 
lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock_cpu); - -void lg_double_lock(struct lglock *lg, int cpu1, int cpu2) -{ - BUG_ON(cpu1 == cpu2); - - /* lock in cpu order, just like lg_global_lock */ - if (cpu2 < cpu1) - swap(cpu1, cpu2); - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu1)); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu2)); -} - -void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2) -{ - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1)); - arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2)); - preempt_enable(); -} - -void lg_global_lock(struct lglock *lg) -{ - int i; - - preempt_disable(); - lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_lock(lock); - } -} -EXPORT_SYMBOL(lg_global_lock); - -void lg_global_unlock(struct lglock *lg) -{ - int i; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_unlock(lock); - } - preempt_enable(); -} -EXPORT_SYMBOL(lg_global_unlock); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index bec0b647f9cc..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -8,152 +8,186 @@ #include <linux/sched.h> #include <linux/errno.h> -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, +int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) + sem->read_count = alloc_percpu(int); + if (unlikely(!sem->read_count)) return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); + rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + __init_rwsem(&sem->rw_sem, name, rwsem_key); + init_waitqueue_head(&sem->writer); + sem->readers_block = 0; return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +void percpu_free_rwsem(struct percpu_rw_semaphore *sem) { /* * XXX: temporary kludge. The error path in alloc_super() * assumes that percpu_free_rwsem() is safe after kzalloc(). */ - if (!brw->fast_read_ctr) + if (!sem->read_count) return; - rcu_sync_dtor(&brw->rss); - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ + rcu_sync_dtor(&sem->rss); + free_percpu(sem->read_count); + sem->read_count = NULL; /* catch use after free bugs */ } EXPORT_SYMBOL_GPL(percpu_free_rwsem); -/* - * This is the fast-path for down_read/up_read. If it succeeds we rely - * on the barriers provided by rcu_sync_enter/exit; see the comments in - * percpu_down_write() and percpu_up_write(). - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. 
- */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) { - bool success; + /* + * Due to having preemption disabled the decrement happens on + * the same CPU as the increment, avoiding the + * increment-on-one-CPU-and-decrement-on-another problem. + * + * If the reader misses the writer's assignment of readers_block, then + * the writer is guaranteed to see the reader's increment. + * + * Conversely, any readers that increment their sem->read_count after + * the writer looks are guaranteed to see the readers_block value, + * which in turn means that they are guaranteed to immediately + * decrement their sem->read_count, so that it doesn't matter that the + * writer missed them. + */ - preempt_disable(); - success = rcu_sync_is_idle(&brw->rss); - if (likely(success)) - __this_cpu_add(*brw->fast_read_ctr, val); - preempt_enable(); + smp_mb(); /* A matches D */ - return success; -} + /* + * If !readers_block the critical section starts here, matched by the + * release in percpu_up_write(). + */ + if (likely(!smp_load_acquire(&sem->readers_block))) + return 1; -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). - */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + /* + * Per the above comment; we still have preemption disabled and + * will thus decrement on the same CPU as we incremented. + */ + __percpu_up_read(sem); - if (likely(update_fast_ctr(brw, +1))) - return; + if (try) + return 0; - /* Avoid rwsem_acquire_read() and rwsem_release() */ - __down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); -} -EXPORT_SYMBOL_GPL(percpu_down_read); + /* + * We either call schedule() in the wait, or we'll fall through + * and reschedule on the preempt_enable() in percpu_down_read(). + */ + preempt_enable_no_resched(); -int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) -{ - if (unlikely(!update_fast_ctr(brw, +1))) { - if (!__down_read_trylock(&brw->rw_sem)) - return 0; - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); - } - - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + /* + * Avoid lockdep for the down/up_read() we already have them. + */ + __down_read(&sem->rw_sem); + this_cpu_inc(*sem->read_count); + __up_read(&sem->rw_sem); + + preempt_disable(); return 1; } +EXPORT_SYMBOL_GPL(__percpu_down_read); -void percpu_up_read(struct percpu_rw_semaphore *brw) +void __percpu_up_read(struct percpu_rw_semaphore *sem) { - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; + smp_mb(); /* B matches C */ + /* + * In other words, if they see our decrement (presumably to aggregate + * zero, as that is the only time it matters) they will also see our + * critical section. 
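/*
 * The smp_mb() pairing above ("A matches D", "B matches C") is the classic
 * store-buffering pattern: the reader publishes its read_count increment
 * and then checks readers_block, the writer sets readers_block and then
 * sums the counters, and at least one side must observe the other.  A
 * userspace model of that guarantee using C11 seq_cst atomics; the names
 * are illustrative and a single counter stands in for the per-CPU ones.
 */
#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int read_count;           /* stands in for sem->read_count    */
static atomic_int readers_block;        /* stands in for sem->readers_block */
static int reader_saw_block, writer_saw_reader;

static void *reader(void *arg)
{
        (void)arg;
        atomic_fetch_add(&read_count, 1);               /* like __this_cpu_inc()  */
        reader_saw_block = atomic_load(&readers_block); /* like the acquire check */
        return NULL;
}

static void *writer(void *arg)
{
        (void)arg;
        atomic_store(&readers_block, 1);                /* like WRITE_ONCE()      */
        writer_saw_reader = atomic_load(&read_count);   /* like per_cpu_sum()     */
        return NULL;
}

int main(void)
{
        for (int i = 0; i < 100000; i++) {
                pthread_t r, w;

                atomic_store(&read_count, 0);
                atomic_store(&readers_block, 0);
                reader_saw_block = writer_saw_reader = 0;
                pthread_create(&r, NULL, reader, NULL);
                pthread_create(&w, NULL, writer, NULL);
                pthread_join(r, NULL);
                pthread_join(w, NULL);
                /* Both sides missing each other would break the scheme. */
                assert(reader_saw_block || writer_saw_reader);
        }
        puts("reader and writer never missed each other");
        return 0;
}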
+ */ + __this_cpu_dec(*sem->read_count); - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); + /* Prod writer to recheck readers_active */ + wake_up(&sem->writer); } -EXPORT_SYMBOL_GPL(percpu_up_read); +EXPORT_SYMBOL_GPL(__percpu_up_read); + +#define per_cpu_sum(var) \ +({ \ + typeof(var) __sum = 0; \ + int cpu; \ + compiletime_assert_atomic_type(__sum); \ + for_each_possible_cpu(cpu) \ + __sum += per_cpu(var, cpu); \ + __sum; \ +}) -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) +/* + * Return true if the modular sum of the sem->read_count per-CPU variable is + * zero. If this sum is zero, then it is stable due to the fact that if any + * newly arriving readers increment a given counter, they will immediately + * decrement that same counter. + */ +static bool readers_active_check(struct percpu_rw_semaphore *sem) { - unsigned int sum = 0; - int cpu; + if (per_cpu_sum(*sem->read_count) != 0) + return false; + + /* + * If we observed the decrement; ensure we see the entire critical + * section. + */ - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } + smp_mb(); /* C matches B */ - return sum; + return true; } -void percpu_down_write(struct percpu_rw_semaphore *brw) +void percpu_down_write(struct percpu_rw_semaphore *sem) { + /* Notify readers to take the slow path. */ + rcu_sync_enter(&sem->rss); + + down_write(&sem->rw_sem); + /* - * Make rcu_sync_is_idle() == F and thus disable the fast-path in - * percpu_down_read() and percpu_up_read(), and wait for gp pass. - * - * The latter synchronises us with the preceding readers which used - * the fast-past, so we can not miss the result of __this_cpu_add() - * or anything else inside their criticial sections. + * Notify new readers to block; up until now, and thus throughout the + * longish rcu_sync_enter() above, new readers could still come in. */ - rcu_sync_enter(&brw->rss); + WRITE_ONCE(sem->readers_block, 1); - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); + smp_mb(); /* D matches A */ - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + /* + * If they don't see our writer of readers_block, then we are + * guaranteed to see their sem->read_count increment, and therefore + * will wait for them. + */ - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); + /* Wait for all now active readers to complete. */ + wait_event(sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); -void percpu_up_write(struct percpu_rw_semaphore *brw) +void percpu_up_write(struct percpu_rw_semaphore *sem) { - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); /* - * Enable the fast-path in percpu_down_read() and percpu_up_read() - * but only after another gp pass; this adds the necessary barrier - * to ensure the reader can't miss the changes done by us. + * Signal the writer is done, no fast path yet. + * + * One reason that we cannot just immediately flip to readers_fast is + * that new readers might fail to see the results of this writer's + * critical section. + * + * Therefore we force it through the slow path which guarantees an + * acquire and thereby guarantees the critical section's consistency. 
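/*
 * readers_active_check() above only cares about the *modular* sum of the
 * per-CPU counters: a reader that migrates during its critical section
 * may increment on one CPU and decrement on another, leaving individual
 * slots non-zero even though no reader is active.  A small illustration
 * of why summing all slots still works (plain C, illustrative values):
 */
#include <stdio.h>

#define NCPUS 4

int main(void)
{
        int read_count[NCPUS] = { 0 };
        int sum = 0;

        read_count[0]++;        /* reader A enters on CPU0             */
        read_count[1]++;        /* reader B enters on CPU1             */
        read_count[3]--;        /* reader A migrated, leaves from CPU3 */
        read_count[1]--;        /* reader B leaves from CPU1           */

        for (int cpu = 0; cpu < NCPUS; cpu++)
                sum += read_count[cpu];

        /* slots are {1, 0, 0, -1}, but the total is 0: no active readers */
        printf("sum = %d\n", sum);
        return 0;
}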
+ */ + smp_store_release(&sem->readers_block, 0); + + /* + * Release the write lock, this will allow readers back in the game. + */ + up_write(&sem->rw_sem); + + /* + * Once this completes (at least one RCU-sched grace period hence) the + * reader fast path will be available again. Safe to use outside the + * exclusive write lock because its counting. */ - rcu_sync_exit(&brw->rss); + rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8a99abf58080..e3b5520005db 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -70,11 +70,14 @@ struct pv_node { static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; - int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); - qstat_inc(qstat_pv_lock_stealing, ret); - return ret; + if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + qstat_inc(qstat_pv_lock_stealing, true); + return true; + } + + return false; } /* @@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) static inline bool pv_wait_early(struct pv_node *prev, int loop) { - if ((loop & PV_PREV_CHECK_MASK) != 0) return false; @@ -286,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; struct pv_node *pp = (struct pv_node *)prev; - int waitcnt = 0; int loop; bool wait_early; - /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ - for (;; waitcnt++) { + for (;;) { for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; @@ -315,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) if (!READ_ONCE(node->locked)) { qstat_inc(qstat_pv_wait_node, true); - qstat_inc(qstat_pv_wait_again, waitcnt); qstat_inc(qstat_pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -456,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) pv_wait(&l->locked, _Q_SLOW_VAL); /* - * The unlocker should have freed the lock before kicking the - * CPU. So if the lock is still not free, it is a spurious - * wakeup or another vCPU has stolen the lock. The current - * vCPU should spin again. + * Because of lock stealing, the queue head vCPU may not be + * able to acquire the lock before it has to wait again. */ - qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* @@ -544,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. 
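/*
 * The hunk just below relaxes the unlock-side cmpxchg() to
 * cmpxchg_release(): releasing a lock only needs release ordering, so
 * that everything written inside the critical section is visible before
 * the lock word changes hands.  A minimal C11 analogue of that pattern;
 * the toy_* names are illustrative, not the kernel's paravirt unlock path.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int locked;               /* 0 = free, 1 = held */
static int protected_data;

static void toy_lock(void)
{
        int expected;

        do {
                expected = 0;
        } while (!atomic_compare_exchange_weak_explicit(&locked, &expected, 1,
                        memory_order_acquire, memory_order_relaxed));
}

static void toy_unlock(void)
{
        int expected = 1;

        /* release: prior writes (protected_data) are published first */
        atomic_compare_exchange_strong_explicit(&locked, &expected, 0,
                        memory_order_release, memory_order_relaxed);
}

int main(void)
{
        toy_lock();
        protected_data = 42;
        toy_unlock();
        printf("%d\n", protected_data);
        return 0;
}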
*/ - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); if (likely(locked == _Q_LOCKED_VAL)) return; diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index b9d031516254..eb0a599fcf58 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -24,8 +24,8 @@ * pv_latency_wake - average latency (ns) from vCPU kick to wakeup * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups - * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs + * pv_wait_again - # of wait's after a queue head vCPU kick * pv_wait_early - # of early vCPU wait's * pv_wait_head - # of vCPU wait's at the queue head * pv_wait_node - # of vCPU wait's at a non-head queue node diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 447e08de1fab..2337b4bb2366 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -121,16 +121,19 @@ enum rwsem_wake_type { * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false */ -static struct rw_semaphore * -__rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) +static void __rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) { - struct rwsem_waiter *waiter; - struct task_struct *tsk; - struct list_head *next; - long oldcount, woken, loop, adjustment; + struct rwsem_waiter *waiter, *tmp; + long oldcount, woken = 0, adjustment = 0; + + /* + * Take a peek at the queue head waiter such that we can determine + * the wakeup(s) to perform. + */ + waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { /* @@ -142,19 +145,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem, */ wake_q_add(wake_q, waiter->task); } - goto out; + + return; } - /* Writers might steal the lock before we grant it to the next reader. + /* + * Writers might steal the lock before we grant it to the next reader. * We prefer to do the first reader grant before counting readers * so we can bail out early if a writer stole the lock. */ - adjustment = 0; if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { /* * If the count is still less than RWSEM_WAITING_BIAS @@ -164,7 +167,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem, */ if (atomic_long_add_return(-adjustment, &sem->count) < RWSEM_WAITING_BIAS) - goto out; + return; + /* Last active locker left. Retry waking readers. */ goto try_reader_grant; } @@ -176,38 +180,23 @@ __rwsem_mark_wake(struct rw_semaphore *sem, rwsem_set_reader_owned(sem); } - /* Grant an infinite number of read locks to the readers at the front - * of the queue. Note we increment the 'active part' of the count by - * the number of readers before waking any processes up. + /* + * Grant an infinite number of read locks to the readers at the front + * of the queue. We know that woken will be at least 1 as we accounted + * for above. 
Note we increment the 'active part' of the count by the + * number of readers before waking any processes up. */ - woken = 0; - do { - woken++; + list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + struct task_struct *tsk; - if (waiter->list.next == &sem->wait_list) + if (waiter->type == RWSEM_WAITING_FOR_WRITE) break; - waiter = list_entry(waiter->list.next, - struct rwsem_waiter, list); - - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - - if (adjustment) - atomic_long_add(adjustment, &sem->count); - - next = sem->wait_list.next; - loop = woken; - do { - waiter = list_entry(next, struct rwsem_waiter, list); - next = waiter->list.next; + woken++; tsk = waiter->task; wake_q_add(wake_q, tsk); + list_del(&waiter->list); /* * Ensure that the last operation is setting the reader * waiter to nil such that rwsem_down_read_failed() cannot @@ -215,13 +204,16 @@ __rwsem_mark_wake(struct rw_semaphore *sem, * to the task to wakeup. */ smp_store_release(&waiter->task, NULL); - } while (--loop); + } - sem->wait_list.next = next; - next->prev = &sem->wait_list; + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + } - out: - return sem; + if (adjustment) + atomic_long_add(adjustment, &sem->count); } /* @@ -235,7 +227,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) struct task_struct *tsk = current; WAKE_Q(wake_q); - /* set up my own style of waitqueue */ waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; @@ -247,7 +238,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); - /* If there are no active locks, wake the front queued process(es). + /* + * If there are no active locks, wake the front queued process(es). * * If there are no writers and we are first in the queue, * wake our own waiter to join the existing active readers ! 
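[Editor's aside] These rwsem paths now defer the actual wakeups to a wake_q, so the tasks are only woken once wait_lock has been dropped, mirroring how __rwsem_mark_wake() queues its readers above. A minimal sketch of that idiom; my_sem and my_waiter are hypothetical stand-ins, not the real rwsem structures:

#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/list.h>

struct my_waiter {                      /* hypothetical */
        struct list_head list;
        struct task_struct *task;
};

struct my_sem {                         /* hypothetical */
        raw_spinlock_t wait_lock;
        struct list_head wait_list;
};

static void wake_all_waiters(struct my_sem *sem)
{
        struct my_waiter *w, *tmp;
        WAKE_Q(wake_q);                 /* on-stack wake queue */

        raw_spin_lock_irq(&sem->wait_lock);
        list_for_each_entry_safe(w, tmp, &sem->wait_list, list) {
                list_del(&w->list);
                wake_q_add(&wake_q, w->task);   /* only queues a task reference */
        }
        raw_spin_unlock_irq(&sem->wait_lock);

        wake_up_q(&wake_q);             /* the real wakeups, lock no longer held */
}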
@@ -255,7 +247,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) if (count == RWSEM_WAITING_BIAS || (count > RWSEM_WAITING_BIAS && adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); @@ -505,7 +497,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) if (count > RWSEM_WAITING_BIAS) { WAKE_Q(wake_q); - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock * is released, but given that we are proactively waking @@ -614,9 +606,8 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); locked: - /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); @@ -638,9 +629,8 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); diff --git a/kernel/memremap.c b/kernel/memremap.c index 251d16b4cb41..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); @@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { resource_size_t key, align_start, align_size, align_end; + pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; @@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, + align_size); + if (error) + goto err_pfn_remap; + error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; @@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return __va(res->start); err_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + err_pfn_remap: err_radix: pgmap_radix_release(res); devres_free(page_map); diff --git a/kernel/module.c b/kernel/module.c index 529efae9f481..f57dd63186e6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1149,6 +1149,8 @@ static size_t module_flags_taint(struct module *mod, char *buf) buf[l++] = 'C'; if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) buf[l++] = 'E'; + if (mod->taints & (1 << TAINT_LIVEPATCH)) + buf[l++] = 'K'; /* * TAINT_FORCED_RMMOD: could be added. 
* TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't @@ -2792,14 +2794,17 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l } #ifdef CONFIG_LIVEPATCH -static int find_livepatch_modinfo(struct module *mod, struct load_info *info) +static int check_modinfo_livepatch(struct module *mod, struct load_info *info) { - mod->klp = get_modinfo(info, "livepatch") ? true : false; + if (get_modinfo(info, "livepatch")) { + mod->klp = true; + add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + } return 0; } #else /* !CONFIG_LIVEPATCH */ -static int find_livepatch_modinfo(struct module *mod, struct load_info *info) +static int check_modinfo_livepatch(struct module *mod, struct load_info *info) { if (get_modinfo(info, "livepatch")) { pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", @@ -2969,7 +2974,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) "is unknown, you have been warned.\n", mod->name); } - err = find_livepatch_modinfo(mod, info); + err = check_modinfo_livepatch(mod, info); if (err) return err; diff --git a/kernel/padata.c b/kernel/padata.c index 993278895ccc..7848f0566403 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -30,6 +30,7 @@ #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/rcupdate.h> +#include <linux/module.h> #define MAX_OBJ_NUM 1000 @@ -769,52 +770,43 @@ static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); } - -static int padata_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) { - int err; struct padata_instance *pinst; - int cpu = (unsigned long)hcpu; + int ret; - pinst = container_of(nfb, struct padata_instance, cpu_notifier); + pinst = hlist_entry_safe(node, struct padata_instance, node); + if (!pinst_has_cpu(pinst, cpu)) + return 0; - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - err = __padata_add_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - if (err) - return notifier_from_errno(err); - break; + mutex_lock(&pinst->lock); + ret = __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + return ret; +} - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - err = __padata_remove_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - if (err) - return notifier_from_errno(err); - break; - } +static int padata_cpu_prep_down(unsigned int cpu, struct hlist_node *node) +{ + struct padata_instance *pinst; + int ret; + + pinst = hlist_entry_safe(node, struct padata_instance, node); + if (!pinst_has_cpu(pinst, cpu)) + return 0; - return NOTIFY_OK; + mutex_lock(&pinst->lock); + ret = __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + return ret; } + +static enum cpuhp_state hp_online; #endif static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU - unregister_hotcpu_notifier(&pinst->cpu_notifier); + cpuhp_state_remove_instance_nocalls(hp_online, &pinst->node); #endif padata_stop(pinst); @@ -1012,11 +1004,8 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - pinst->cpu_notifier.notifier_call = 
padata_cpu_callback; - pinst->cpu_notifier.priority = 0; - register_hotcpu_notifier(&pinst->cpu_notifier); + cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); #endif - return pinst; err_free_masks: @@ -1039,3 +1028,26 @@ void padata_free(struct padata_instance *pinst) kobject_put(&pinst->kobj); } EXPORT_SYMBOL(padata_free); + +#ifdef CONFIG_HOTPLUG_CPU + +static __init int padata_driver_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", + padata_cpu_online, + padata_cpu_prep_down); + if (ret < 0) + return ret; + hp_online = ret; + return 0; +} +module_init(padata_driver_init); + +static __exit void padata_driver_exit(void) +{ + cpuhp_remove_multi_state(hp_online); +} +module_exit(padata_driver_exit); +#endif diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba137fd15..df9e8e9e0be7 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work) /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 +static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); +} + +static void dec_pid_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); +} + static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; + struct ucounts *ucounts; int i; int err; - if (level > MAX_PID_NS_LEVEL) { - err = -EINVAL; + err = -ENOSPC; + if (level > MAX_PID_NS_LEVEL) + goto out; + ucounts = inc_pid_namespaces(user_ns); + if (!ucounts) goto out; - } err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) - goto out; + goto out_dec; ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!ns->pidmap[0].page) @@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); + ns->ucounts = ucounts; ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); @@ -129,6 +143,8 @@ out_free_map: kfree(ns->pidmap[0].page); out_free: kmem_cache_free(pid_ns_cachep, ns); +out_dec: + dec_pid_namespaces(ucounts); out: return ERR_PTR(err); } @@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) ns_free_inum(&ns->ns); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); + dec_pid_namespaces(ns->ucounts); put_user_ns(ns->user_ns); call_rcu(&ns->rcu, delayed_free_pidns); } @@ -388,12 +405,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct ns_common *pidns_get_parent(struct ns_common *ns) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *pid_ns, *p; + + /* See if the parent is in the current namespace */ + pid_ns = p = to_pid_ns(ns)->parent; + for (;;) { + if (!p) + return ERR_PTR(-EPERM); + if (p == active) + break; + p = p->parent; + } + + return &get_pid_ns(pid_ns)->ns; +} + +static struct user_namespace *pidns_owner(struct ns_common *ns) +{ + return to_pid_ns(ns)->user_ns; +} + const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, + .owner = pidns_owner, + .get_parent = pidns_get_parent, }; static __init int 
pid_namespaces_init(void) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 68d3ebc12601..e8517b63eb37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -186,7 +186,7 @@ config PM_SLEEP_DEBUG config DPM_WATCHDOG bool "Device suspend/resume watchdog" - depends on PM_DEBUG && PSTORE + depends on PM_DEBUG && PSTORE && EXPERT ---help--- Sets up a watchdog timer to capture drivers that are locked up attempting to suspend/resume a device. @@ -197,7 +197,7 @@ config DPM_WATCHDOG config DPM_WATCHDOG_TIMEOUT int "Watchdog timeout in seconds" range 1 120 - default 60 + default 120 depends on DPM_WATCHDOG config PM_TRACE diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 33c79b6105c5..b26dbc48c75b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -306,8 +306,10 @@ static int create_image(int platform_mode) if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); - if (!in_suspend) + if (!in_suspend) { events_check_enabled = false; + clear_free_pages(); + } platform_leave(platform_mode); @@ -1189,22 +1191,6 @@ static int __init nohibernate_setup(char *str) return 1; } -static int __init page_poison_nohibernate_setup(char *str) -{ -#ifdef CONFIG_PAGE_POISONING_ZERO - /* - * The zeroing option for page poison skips the checks on alloc. - * since hibernation doesn't save free pages there's no way to - * guarantee the pages will still be zeroed. - */ - if (!strcmp(str, "on")) { - pr_info("Disabling hibernation due to page poisoning\n"); - return nohibernate_setup(str); - } -#endif - return 1; -} - __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1212,4 +1198,3 @@ __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); -__setup("page_poison=", page_poison_nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 5ea50b1b7595..281a697fd458 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -644,6 +644,7 @@ static int __init pm_init(void) return error; hibernate_image_size_init(); hibernate_reserved_size_init(); + pm_states_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; diff --git a/kernel/power/power.h b/kernel/power/power.h index 242d8b827dd5..56d1d0dedf76 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -110,6 +110,8 @@ extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); +extern void clear_free_pages(void); + /** * Auxiliary structure used for reading the snapshot image data and * metadata from and writing them to the list of page backup entries diff --git a/kernel/power/process.c b/kernel/power/process.c index 8f27d5a8adf6..2fba066e125f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -144,23 +144,12 @@ int freeze_processes(void) /* * Now that the whole userspace is frozen we need to disbale * the OOM killer to disallow any further interference with - * killable tasks. + * killable tasks. There is no guarantee oom victims will + * ever reach a point they go away we have to wait with a timeout. */ - if (!error && !oom_killer_disable()) + if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) error = -EBUSY; - /* - * There is a hard to fix race between oom_reaper kernel thread - * and oom_killer_disable. 
oom_reaper calls exit_oom_victim - * before the victim reaches exit_mm so try to freeze all the tasks - * again and catch such a left over task. - */ - if (!error) { - pr_info("Double checking all user space processes after OOM killer disable... "); - error = try_to_freeze_tasks(true); - pr_cont("\n"); - } - if (error) thaw_processes(); return error; diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..168ff442ebde 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - cancel_delayed_work_sync(&req->work); + /* + * This function may be called very early during boot, for example, + * from of_clk_init(), where irq needs to stay disabled. + * cancel_delayed_work_sync() assumes that irq is enabled on + * invocation and re-enables it on return. Avoid calling it until + * workqueue is initialized. + */ + if (keventd_up()) + cancel_delayed_work_sync(&req->work); + __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b02228411d57..4f0f0604f1c4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1132,6 +1132,28 @@ void free_basic_memory_bitmaps(void) pr_debug("PM: Basic memory bitmaps freed\n"); } +void clear_free_pages(void) +{ +#ifdef CONFIG_PAGE_POISONING_ZERO + struct memory_bitmap *bm = free_pages_map; + unsigned long pfn; + + if (WARN_ON(!(free_pages_map))) + return; + + memory_bm_position_reset(bm); + pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (pfn_valid(pfn)) + clear_highpage(pfn_to_page(pfn)); + + pfn = memory_bm_next_pfn(bm); + } + memory_bm_position_reset(bm); + pr_info("PM: free pages cleared after restore\n"); +#endif /* PAGE_POISONING_ZERO */ +} + /** * snapshot_additional_pages - Estimate the number of extra pages needed. * @zone: Memory zone to carry out the computation for. diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0acab9d7f96f..1e7f5da648d9 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -118,10 +118,18 @@ static bool valid_state(suspend_state_t state) */ static bool relative_states; +void __init pm_states_init(void) +{ + /* + * freeze state should be supported even without any suspend_ops, + * initialize pm_states accordingly here + */ + pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; +} + static int __init sleep_states_setup(char *str) { relative_states = !strncmp(str, "1", 1); - pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 
0 : 2]; return 1; } @@ -211,7 +219,7 @@ static int platform_suspend_begin(suspend_state_t state) { if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) return freeze_ops->begin(); - else if (suspend_ops->begin) + else if (suspend_ops && suspend_ops->begin) return suspend_ops->begin(state); else return 0; @@ -221,7 +229,7 @@ static void platform_resume_end(suspend_state_t state) { if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) freeze_ops->end(); - else if (suspend_ops->end) + else if (suspend_ops && suspend_ops->end) suspend_ops->end(); } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index eea6dbc2d8cf..d5e397315473 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -253,6 +253,17 @@ static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); +#ifdef CONFIG_OF +static bool of_specified_console; + +void console_set_by_of(void) +{ + of_specified_console = true; +} +#else +# define of_specified_console false +#endif + /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -655,11 +666,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, * better readable output. 'c' in the record flags mark the first * fragment of a line, '+' the following. */ - if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT)) - cont = 'c'; - else if ((msg->flags & LOG_CONT) || - ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX))) - cont = '+'; + if (msg->flags & LOG_CONT) + cont = (prev_flags & LOG_CONT) ? '+' : 'c'; return scnprintf(buf, size, "%u,%llu,%llu,%c;", (msg->facility << 3) | msg->level, seq, ts_usec, cont); @@ -786,6 +794,8 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) return ret; } +static void cont_flush(void); + static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -801,6 +811,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; raw_spin_lock_irq(&logbuf_lock); + cont_flush(); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; @@ -863,6 +874,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) return -ESPIPE; raw_spin_lock_irq(&logbuf_lock); + cont_flush(); switch (whence) { case SEEK_SET: /* the first record */ @@ -901,6 +913,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); raw_spin_lock_irq(&logbuf_lock); + cont_flush(); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) @@ -1287,6 +1300,7 @@ static int syslog_print(char __user *buf, int size) size_t skip; raw_spin_lock_irq(&logbuf_lock); + cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1346,6 +1360,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return -ENOMEM; raw_spin_lock_irq(&logbuf_lock); + cont_flush(); if (buf) { u64 next_seq; u64 seq; @@ -1507,6 +1522,7 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: raw_spin_lock_irq(&logbuf_lock); + cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1643,35 +1659,33 @@ static struct cont { bool flushed:1; /* buffer sealed and committed */ } cont; -static void cont_flush(enum log_flags flags) +static void 
cont_flush(void) { if (cont.flushed) return; if (cont.len == 0) return; - if (cont.cons) { /* * If a fragment of this line was directly flushed to the * console; wait for the console to pick up the rest of the * line. LOG_NOCONS suppresses a duplicated output. */ - log_store(cont.facility, cont.level, flags | LOG_NOCONS, + log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS, cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.flags = flags; cont.flushed = true; } else { /* * If no fragment of this line ever reached the console, * just submit it to the store and free the buffer. */ - log_store(cont.facility, cont.level, flags, 0, + log_store(cont.facility, cont.level, cont.flags, 0, NULL, 0, cont.buf, cont.len); cont.len = 0; } } -static bool cont_add(int facility, int level, const char *text, size_t len) +static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) { if (cont.len && cont.flushed) return false; @@ -1682,7 +1696,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) * the line gets too long, split it up in separate records. */ if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { - cont_flush(LOG_CONT); + cont_flush(); return false; } @@ -1691,7 +1705,7 @@ static bool cont_add(int facility, int level, const char *text, size_t len) cont.level = level; cont.owner = current; cont.ts_nsec = local_clock(); - cont.flags = 0; + cont.flags = flags; cont.cons = 0; cont.flushed = false; } @@ -1699,8 +1713,15 @@ static bool cont_add(int facility, int level, const char *text, size_t len) memcpy(cont.buf + cont.len, text, len); cont.len += len; + // The original flags come from the first line, + // but later continuations can add a newline. + if (flags & LOG_NEWLINE) { + cont.flags |= LOG_NEWLINE; + cont_flush(); + } + if (cont.len > (sizeof(cont.buf) * 80) / 100) - cont_flush(LOG_CONT); + cont_flush(); return true; } @@ -1733,6 +1754,31 @@ static size_t cont_print_text(char *text, size_t size) return textlen; } +static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) +{ + /* + * If an earlier line was buffered, and we're a continuation + * write from the same process, try to add it to the buffer. + */ + if (cont.len) { + if (cont.owner == current && (lflags & LOG_CONT)) { + if (cont_add(facility, level, lflags, text, text_len)) + return text_len; + } + /* Otherwise, make sure it's flushed */ + cont_flush(); + } + + /* If it doesn't end in a newline, try to buffer the current line */ + if (!(lflags & LOG_NEWLINE)) { + if (cont_add(facility, level, lflags, text, text_len)) + return text_len; + } + + /* Store it in the record log */ + return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); +} + asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) @@ -1819,10 +1865,9 @@ asmlinkage int vprintk_emit(int facility, int level, /* strip kernel syslog prefix and extract log level or control flags */ if (facility == 0) { - int kern_level = printk_get_level(text); + int kern_level; - if (kern_level) { - const char *end_of_header = printk_skip_level(text); + while ((kern_level = printk_get_level(text)) != 0) { switch (kern_level) { case '0' ... 
'7': if (level == LOGLEVEL_DEFAULT) @@ -1830,14 +1875,13 @@ asmlinkage int vprintk_emit(int facility, int level, /* fallthrough */ case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; + break; + case 'c': /* KERN_CONT */ + lflags |= LOG_CONT; } - /* - * No need to check length here because vscnprintf - * put '\0' at the end of the string. Only valid and - * newly printed level is detected. - */ - text_len -= end_of_header - text; - text = (char *)end_of_header; + + text_len -= 2; + text += 2; } } @@ -1847,45 +1891,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - if (!(lflags & LOG_NEWLINE)) { - /* - * Flush the conflicting buffer. An earlier newline was missing, - * or another task also prints continuation lines. - */ - if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) - cont_flush(LOG_NEWLINE); - - /* buffer line if possible, otherwise store it right away */ - if (cont_add(facility, level, text, text_len)) - printed_len += text_len; - else - printed_len += log_store(facility, level, - lflags | LOG_CONT, 0, - dict, dictlen, text, text_len); - } else { - bool stored = false; - - /* - * If an earlier newline was missing and it was the same task, - * either merge it with the current buffer and flush, or if - * there was a race with interrupts (prefix == true) then just - * flush it out and store this line separately. - * If the preceding printk was from a different task and missed - * a newline, flush and append the newline. - */ - if (cont.len) { - if (cont.owner == current && !(lflags & LOG_PREFIX)) - stored = cont_add(facility, level, text, - text_len); - cont_flush(LOG_NEWLINE); - } - - if (stored) - printed_len += text_len; - else - printed_len += log_store(facility, level, lflags, 0, - dict, dictlen, text, text_len); - } + printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len); logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); @@ -2647,7 +2653,7 @@ void register_console(struct console *newcon) * didn't select a console we take the first one * that registers here. */ - if (preferred_console < 0) { + if (preferred_console < 0 && !of_specified_console) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || @@ -3029,6 +3035,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) dumper->active = true; raw_spin_lock_irqsave(&logbuf_lock, flags); + cont_flush(); dumper->cur_seq = clear_seq; dumper->cur_idx = clear_idx; dumper->next_seq = log_next_seq; @@ -3119,6 +3126,7 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, bool ret; raw_spin_lock_irqsave(&logbuf_lock, flags); + cont_flush(); ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); raw_spin_unlock_irqrestore(&logbuf_lock, flags); @@ -3161,6 +3169,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, goto out; raw_spin_lock_irqsave(&logbuf_lock, flags); + cont_flush(); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d38ab08a3fe7..123ccbd22449 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -52,7 +52,7 @@ MODULE_AUTHOR("Paul E. 
McKenney <paulmck@linux.vnet.ibm.com>"); #define PERF_FLAG "-perf:" #define PERFOUT_STRING(s) \ - pr_alert("%s" PERF_FLAG s "\n", perf_type) + pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) #define VERBOSE_PERFOUT_STRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) #define VERBOSE_PERFOUT_ERRSTRING(s) \ @@ -400,9 +400,8 @@ rcu_perf_writer(void *arg) sp.sched_priority = 0; sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); - pr_alert("%s" PERF_FLAG - "rcu_perf_writer %ld has %d measurements\n", - perf_type, me, MIN_MEAS); + pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", + perf_type, PERF_FLAG, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= nrealwriters) { schedule_timeout_interruptible(10); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 971e2b138063..bf08fee53dc7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1238,6 +1238,7 @@ rcu_torture_stats_print(void) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; static unsigned long rtcv_snap = ULONG_MAX; + struct task_struct *wtp; for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { @@ -1258,8 +1259,9 @@ rcu_torture_stats_print(void) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_barrier_error, n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror); pr_cont("rtbf: %ld rtb: %ld nt: %ld ", @@ -1312,10 +1314,12 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", + wtp = READ_ONCE(writer_task); + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", rcu_torture_writer_state_getname(), rcu_torture_writer_state, - gpnum, completed, flags); + gpnum, completed, flags, + wtp == NULL ? ~0UL : wtp->state); show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } @@ -1362,12 +1366,12 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) onoff_interval, onoff_holdoff); } -static void rcutorture_booster_cleanup(int cpu) +static int rcutorture_booster_cleanup(unsigned int cpu) { struct task_struct *t; if (boost_tasks[cpu] == NULL) - return; + return 0; mutex_lock(&boost_mutex); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; @@ -1375,9 +1379,10 @@ static void rcutorture_booster_cleanup(int cpu) /* This must be outside of the mutex, otherwise deadlock! 
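[Editor's aside] rcutorture is one of several users in this series (padata and relay are the others) converted from CPU notifiers to the hotplug state machine: the callbacks take an unsigned int cpu and return 0 or -errno, and registration happens once. A minimal sketch of the dynamically allocated state used here, with hypothetical callback bodies:

#include <linux/cpuhotplug.h>
#include <linux/init.h>

static enum cpuhp_state my_hp_state;    /* hypothetical */

static int my_cpu_online(unsigned int cpu)
{
        /* set up per-CPU resources for @cpu; also invoked for CPUs that
         * are already online when the state is installed */
        return 0;
}

static int my_cpu_prep_down(unsigned int cpu)
{
        /* tear down per-CPU resources before @cpu goes offline */
        return 0;
}

static int __init my_hp_init(void)
{
        int ret;

        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mysubsys:online",
                                my_cpu_online, my_cpu_prep_down);
        if (ret < 0)
                return ret;
        my_hp_state = ret;      /* dynamic states hand back the allocated state */
        return 0;
}

static void my_hp_exit(void)
{
        cpuhp_remove_state(my_hp_state);
}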
*/ torture_stop_kthread(rcu_torture_boost, t); + return 0; } -static int rcutorture_booster_init(int cpu) +static int rcutorture_booster_init(unsigned int cpu) { int retval; @@ -1577,28 +1582,7 @@ static void rcu_torture_barrier_cleanup(void) } } -static int rcutorture_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - (void)rcutorture_booster_init(cpu); - break; - case CPU_DOWN_PREPARE: - rcutorture_booster_cleanup(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block rcutorture_cpu_nb = { - .notifier_call = rcutorture_cpu_notify, -}; +static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) @@ -1638,11 +1622,8 @@ rcu_torture_cleanup(void) for (i = 0; i < ncbflooders; i++) torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - unregister_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) - rcutorture_booster_cleanup(i); - } + test_boost == 2) + cpuhp_remove_state(rcutor_hp); /* * Wait for all RCU callbacks to fire, then do flavor-specific @@ -1869,14 +1850,13 @@ rcu_torture_init(void) test_boost == 2) { boost_starttime = jiffies + test_boost_interval * HZ; - register_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) { - if (cpu_is_offline(i)) - continue; /* Heuristic: CPU can go offline. */ - firsterr = rcutorture_booster_init(i); - if (firsterr) - goto unwind; - } + + firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE", + rcutorture_booster_init, + rcutorture_booster_cleanup); + if (firsterr < 0) + goto unwind; + rcutor_hp = firsterr; } firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be922c9f3d37..50d1861f7759 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp) RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), "suspicious rcu_sync_is_idle() usage"); } + +EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); #endif /** @@ -83,6 +85,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) } /** + * Must be called after rcu_sync_init() and before first use. + * + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() + * pairs turn into NO-OPs. + */ +void rcu_sync_enter_start(struct rcu_sync *rsp) +{ + rsp->gp_count++; + rsp->gp_state = GP_PASSED; +} + +/** * rcu_sync_enter() - Force readers onto slowpath * @rsp: Pointer to rcu_sync structure to use for synchronization * diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..7e2e03879c2e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -41,7 +41,6 @@ #include <linux/export.h> #include <linux/completion.h> #include <linux/moduleparam.h> -#include <linux/module.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -60,7 +59,6 @@ #include "tree.h" #include "rcu.h" -MODULE_ALIAS("rcutree"); #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif @@ -1848,6 +1846,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { bool ret; + bool need_gp; /* Handle the ends of any preceding grace periods first. 
*/ if (rdp->completed == rnp->completed && @@ -1874,9 +1873,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->cpu_no_qs.b.norm = true; + need_gp = !!(rnp->qsmask & rdp->grpmask); + rdp->cpu_no_qs.b.norm = need_gp; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); + rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); } @@ -2344,7 +2344,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* @@ -2970,7 +2970,7 @@ static void force_quiescent_state(struct rcu_state *rsp) } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); - swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* @@ -3792,8 +3792,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rnp->qsmaskinitnext |= mask; - rnp->expmaskinitnext |= mask; if (!rdp->beenonline) WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); rdp->beenonline = true; /* We have now been online. */ @@ -3860,6 +3858,32 @@ int rcutree_dead_cpu(unsigned int cpu) return 0; } +/* + * Mark the specified CPU as being online so that subsequent grace periods + * (both expedited and normal) will wait on it. Note that this means that + * incoming CPUs are not allowed to use RCU read-side critical sections + * until this function is called. Failing to observe this restriction + * will result in lockdep splats. + */ +void rcu_cpu_starting(unsigned int cpu) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->qsmaskinitnext |= mask; + rnp->expmaskinitnext |= mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + #ifdef CONFIG_HOTPLUG_CPU /* * The CPU is exiting the idle loop into the arch_cpu_idle_dead() @@ -4209,8 +4233,10 @@ void __init rcu_init(void) * or the scheduler are operational. */ pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); + rcu_cpu_starting(cpu); + } } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f714f873bf9d..e99a5234d9ed 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -400,6 +400,7 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + atomic_long_t exp_workdone0; /* # done by workqueue. */ atomic_long_t exp_workdone1; /* # done by others #1. */ atomic_long_t exp_workdone2; /* # done by others #2. */ atomic_long_t exp_workdone3; /* # done by others #3. 
*/ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6d86ab6ec2c9..24343eb87b58 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -359,7 +359,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + !(atomic_add_return(0, &rdtp->dynticks) & 0x1) || + !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; @@ -384,17 +385,16 @@ retry_ipi: mask_ofl_ipi &= ~mask; continue; } - /* Failed, raced with offline. */ + /* Failed, raced with CPU hotplug operation. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && + if ((rnp->qsmaskinitnext & mask) && (rnp->expmask & mask)) { + /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + goto retry_ipi; } + /* CPU really is offline, so we can ignore it. */ if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -427,12 +427,10 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_stall); if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; - if (ret < 0) { - /* Hit a signal, disable CPU stall warnings. */ - swait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root)); - return; - } + WARN_ON(ret < 0); /* workqueues should not be signaled. */ + if (rcu_cpu_stall_suppress) + continue; + panic_on_rcu_stall(); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); ndetected = 0; @@ -500,7 +498,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) * next GP, to proceed. */ mutex_lock(&rsp->exp_wake_mutex); - mutex_unlock(&rsp->exp_mutex); rcu_for_each_node_breadth_first(rsp, rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { @@ -516,6 +513,70 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_wake_mutex); } +/* Let the workqueue handler know what it is supposed to do. */ +struct rcu_exp_work { + smp_call_func_t rew_func; + struct rcu_state *rew_rsp; + unsigned long rew_s; + struct work_struct rew_work; +}; + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct work_struct *wp) +{ + struct rcu_exp_work *rewp; + + /* Initialize the rcu_node tree in preparation for the wait. */ + rewp = container_of(wp, struct rcu_exp_work, rew_work); + sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s); +} + +/* + * Given an rcu_state pointer and a smp_call_function() handler, kick + * off the specified flavor of expedited grace period. + */ +static void _synchronize_rcu_expedited(struct rcu_state *rsp, + smp_call_func_t func) +{ + struct rcu_data *rdp; + struct rcu_exp_work rew; + struct rcu_node *rnp; + unsigned long s; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(rsp->call); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. 
*/ + + /* Marshall arguments and schedule the expedited grace period. */ + rew.rew_func = func; + rew.rew_rsp = rsp; + rew.rew_s = s; + INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); + schedule_work(&rew.rew_work); + + /* Wait for expedited grace period to complete. */ + rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + rnp = rcu_get_root(rsp); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone0, s)); + + /* Let the next expedited grace period start. */ + mutex_unlock(&rsp->exp_mutex); +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -534,29 +595,13 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) */ void synchronize_sched_expedited(void) { - unsigned long s; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ if (rcu_blocking_is_gp()) return; - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_sched); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); + _synchronize_rcu_expedited(rsp, sync_sched_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -620,23 +665,8 @@ static void sync_rcu_exp_handler(void *info) void synchronize_rcu_expedited(void) { struct rcu_state *rsp = rcu_state_p; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - /* Wait for ->blkd_tasks lists to drain, then wake everyone up. 
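[Editor's aside] With this rework the expedited grace period itself runs from a workqueue: the caller fills an on-stack rcu_exp_work, schedules it, then sleeps on the flavor's wait queue until its snapshotted sequence number is done. A generic sketch of the on-stack work idiom this leans on, with a hypothetical payload (the RCU code waits on rnp->exp_wq rather than flush_work()):

#include <linux/workqueue.h>
#include <linux/kernel.h>

struct my_request {                     /* hypothetical */
        int arg;
        int result;
        struct work_struct work;
};

static void my_worker(struct work_struct *wp)
{
        struct my_request *req = container_of(wp, struct my_request, work);

        req->result = req->arg * 2;     /* stand-in for the real work */
}

static int run_in_workqueue(int arg)
{
        struct my_request req = { .arg = arg };

        INIT_WORK_ONSTACK(&req.work, my_worker);
        schedule_work(&req.work);
        flush_work(&req.work);          /* wait for my_worker() to finish */
        destroy_work_on_stack(&req.work);
        return req.result;
}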
*/ - rcu_exp_wait_wake(rsp, s); + _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0082fce402a0..85c5a883c6e3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2173,6 +2173,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); + cond_resched_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 86782f9a4604..b1f28972872c 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,16 +185,17 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s1 = 0, s2 = 0, s3 = 0; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->exp_workdone0); s1 += atomic_long_read(&rdp->exp_workdone1); s2 += atomic_long_read(&rdp->exp_workdone2); s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s1, s2, s3, + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f0d8322bc3ec..f19271dce0a9 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -46,7 +46,7 @@ #include <linux/export.h> #include <linux/hardirq.h> #include <linux/delay.h> -#include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/kthread.h> #include <linux/tick.h> @@ -54,7 +54,6 @@ #include "rcu.h" -MODULE_ALIAS("rcupdate"); #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif diff --git a/kernel/relay.c b/kernel/relay.c index d797502140b9..9988f5cc2d46 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -214,7 +214,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) __free_page(buf->page_array[i]); relay_free_page_array(buf->page_array); } - chan->buf[buf->cpu] = NULL; + *per_cpu_ptr(chan->buf, buf->cpu) = NULL; kfree(buf->padding); kfree(buf); kref_put(&chan->kref, relay_destroy_channel); @@ -382,20 +382,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) */ void relay_reset(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; - if (chan->is_global && chan->buf[0]) { - __relay_reset(chan->buf[0], 0); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { + __relay_reset(buf, 0); return; } mutex_lock(&relay_channels_mutex); for_each_possible_cpu(i) - if (chan->buf[i]) - __relay_reset(chan->buf[i], 0); + if ((buf = *per_cpu_ptr(chan->buf, i))) + __relay_reset(buf, 0); mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_reset); @@ -440,7 +441,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) struct dentry *dentry; if (chan->is_global) - return chan->buf[0]; + return *per_cpu_ptr(chan->buf, 0); buf = relay_create_buf(chan); if (!buf) @@ -464,7 +465,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) __relay_reset(buf, 1); if(chan->is_global) { - chan->buf[0] = buf; + *per_cpu_ptr(chan->buf, 0) = buf; buf->cpu = 0; } @@ -512,46 +513,25 @@ static void setup_callbacks(struct rchan *chan, chan->cb = 
cb; } -/** - * relay_hotcpu_callback - CPU hotplug callback - * @nb: notifier block - * @action: hotplug action to take - * @hcpu: CPU number - * - * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) - */ -static int relay_hotcpu_callback(struct notifier_block *nb, - unsigned long action, - void *hcpu) +int relay_prepare_cpu(unsigned int cpu) { - unsigned int hotcpu = (unsigned long)hcpu; struct rchan *chan; + struct rchan_buf *buf; - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - mutex_lock(&relay_channels_mutex); - list_for_each_entry(chan, &relay_channels, list) { - if (chan->buf[hotcpu]) - continue; - chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); - if(!chan->buf[hotcpu]) { - printk(KERN_ERR - "relay_hotcpu_callback: cpu %d buffer " - "creation failed\n", hotcpu); - mutex_unlock(&relay_channels_mutex); - return notifier_from_errno(-ENOMEM); - } + mutex_lock(&relay_channels_mutex); + list_for_each_entry(chan, &relay_channels, list) { + if ((buf = *per_cpu_ptr(chan->buf, cpu))) + continue; + buf = relay_open_buf(chan, cpu); + if (!buf) { + pr_err("relay: cpu %d buffer creation failed\n", cpu); + mutex_unlock(&relay_channels_mutex); + return -ENOMEM; } - mutex_unlock(&relay_channels_mutex); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* No need to flush the cpu : will be flushed upon - * final relay_flush() call. */ - break; + *per_cpu_ptr(chan->buf, cpu) = buf; } - return NOTIFY_OK; + mutex_unlock(&relay_channels_mutex); + return 0; } /** @@ -583,6 +563,7 @@ struct rchan *relay_open(const char *base_filename, { unsigned int i; struct rchan *chan; + struct rchan_buf *buf; if (!(subbuf_size && n_subbufs)) return NULL; @@ -593,6 +574,7 @@ struct rchan *relay_open(const char *base_filename, if (!chan) return NULL; + chan->buf = alloc_percpu(struct rchan_buf *); chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; @@ -608,9 +590,10 @@ struct rchan *relay_open(const char *base_filename, mutex_lock(&relay_channels_mutex); for_each_online_cpu(i) { - chan->buf[i] = relay_open_buf(chan, i); - if (!chan->buf[i]) + buf = relay_open_buf(chan, i); + if (!buf) goto free_bufs; + *per_cpu_ptr(chan->buf, i) = buf; } list_add(&chan->list, &relay_channels); mutex_unlock(&relay_channels_mutex); @@ -619,8 +602,8 @@ struct rchan *relay_open(const char *base_filename, free_bufs: for_each_possible_cpu(i) { - if (chan->buf[i]) - relay_close_buf(chan->buf[i]); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_close_buf(buf); } kref_put(&chan->kref, relay_destroy_channel); @@ -666,6 +649,7 @@ int relay_late_setup_files(struct rchan *chan, unsigned int i, curr_cpu; unsigned long flags; struct dentry *dentry; + struct rchan_buf *buf; struct rchan_percpu_buf_dispatcher disp; if (!chan || !base_filename) @@ -684,10 +668,11 @@ int relay_late_setup_files(struct rchan *chan, if (chan->is_global) { err = -EINVAL; - if (!WARN_ON_ONCE(!chan->buf[0])) { - dentry = relay_create_buf_file(chan, chan->buf[0], 0); + buf = *per_cpu_ptr(chan->buf, 0); + if (!WARN_ON_ONCE(!buf)) { + dentry = relay_create_buf_file(chan, buf, 0); if (dentry && !WARN_ON_ONCE(!chan->is_global)) { - relay_set_buf_dentry(chan->buf[0], dentry); + relay_set_buf_dentry(buf, dentry); err = 0; } } @@ -702,13 +687,14 @@ int relay_late_setup_files(struct rchan *chan, * on all currently online CPUs. 
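[Editor's aside] Throughout relay.c the fixed chan->buf[NR_CPUS] array becomes a per-CPU pointer allocated with alloc_percpu() and dereferenced via *per_cpu_ptr(chan->buf, cpu). A minimal sketch of that idiom with a hypothetical object type:

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct my_buf { int count; };           /* hypothetical per-CPU object */

static struct my_buf * __percpu *bufs;  /* one pointer slot per CPU */

static int my_bufs_alloc(void)
{
        bufs = alloc_percpu(struct my_buf *);
        return bufs ? 0 : -ENOMEM;
}

static int my_buf_attach(unsigned int cpu)
{
        struct my_buf *b = kzalloc(sizeof(*b), GFP_KERNEL);

        if (!b)
                return -ENOMEM;
        *per_cpu_ptr(bufs, cpu) = b;    /* this CPU's slot now points at b */
        return 0;
}

static void my_bufs_free(void)
{
        unsigned int cpu;

        for_each_possible_cpu(cpu)
                kfree(*per_cpu_ptr(bufs, cpu));
        free_percpu(bufs);
}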
*/ for_each_online_cpu(i) { - if (unlikely(!chan->buf[i])) { + buf = *per_cpu_ptr(chan->buf, i); + if (unlikely(!buf)) { WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); err = -EINVAL; break; } - dentry = relay_create_buf_file(chan, chan->buf[i], i); + dentry = relay_create_buf_file(chan, buf, i); if (unlikely(!dentry)) { err = -EINVAL; break; @@ -716,10 +702,10 @@ int relay_late_setup_files(struct rchan *chan, if (curr_cpu == i) { local_irq_save(flags); - relay_set_buf_dentry(chan->buf[i], dentry); + relay_set_buf_dentry(buf, dentry); local_irq_restore(flags); } else { - disp.buf = chan->buf[i]; + disp.buf = buf; disp.dentry = dentry; smp_mb(); /* relay_channels_mutex must be held, so wait. */ @@ -822,11 +808,10 @@ void relay_subbufs_consumed(struct rchan *chan, if (!chan) return; - if (cpu >= NR_CPUS || !chan->buf[cpu] || - subbufs_consumed > chan->n_subbufs) + buf = *per_cpu_ptr(chan->buf, cpu); + if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) return; - buf = chan->buf[cpu]; if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) buf->subbufs_consumed = buf->subbufs_produced; else @@ -842,18 +827,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed); */ void relay_close(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; mutex_lock(&relay_channels_mutex); - if (chan->is_global && chan->buf[0]) - relay_close_buf(chan->buf[0]); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) + relay_close_buf(buf); else for_each_possible_cpu(i) - if (chan->buf[i]) - relay_close_buf(chan->buf[i]); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_close_buf(buf); if (chan->last_toobig) printk(KERN_WARNING "relay: one or more items not logged " @@ -874,20 +860,21 @@ EXPORT_SYMBOL_GPL(relay_close); */ void relay_flush(struct rchan *chan) { + struct rchan_buf *buf; unsigned int i; if (!chan) return; - if (chan->is_global && chan->buf[0]) { - relay_switch_subbuf(chan->buf[0], 0); + if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) { + relay_switch_subbuf(buf, 0); return; } mutex_lock(&relay_channels_mutex); for_each_possible_cpu(i) - if (chan->buf[i]) - relay_switch_subbuf(chan->buf[i], 0); + if ((buf = *per_cpu_ptr(chan->buf, i))) + relay_switch_subbuf(buf, 0); mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_flush); @@ -1121,51 +1108,23 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, return end_pos; } -/* - * subbuf_read_actor - read up to one subbuf's worth of data - */ -static int subbuf_read_actor(size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc) -{ - void *from; - int ret = 0; - - from = buf->start + read_start; - ret = avail; - if (copy_to_user(desc->arg.buf, from, avail)) { - desc->error = -EFAULT; - ret = 0; - } - desc->arg.data += ret; - desc->written += ret; - desc->count -= ret; - - return ret; -} - -typedef int (*subbuf_actor_t) (size_t read_start, - struct rchan_buf *buf, - size_t avail, - read_descriptor_t *desc); - -/* - * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries - */ -static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, - subbuf_actor_t subbuf_actor, - read_descriptor_t *desc) +static ssize_t relay_file_read(struct file *filp, + char __user *buffer, + size_t count, + loff_t *ppos) { struct rchan_buf *buf = filp->private_data; size_t read_start, avail; + size_t written = 0; int ret; - if (!desc->count) + if (!count) return 0; inode_lock(file_inode(filp)); do { + void *from; + if 
(!relay_file_read_avail(buf, *ppos)) break; @@ -1174,32 +1133,22 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!avail) break; - avail = min(desc->count, avail); - ret = subbuf_actor(read_start, buf, avail, desc); - if (desc->error < 0) + avail = min(count, avail); + from = buf->start + read_start; + ret = avail; + if (copy_to_user(buffer, from, avail)) break; - if (ret) { - relay_file_read_consume(buf, read_start, ret); - *ppos = relay_file_read_end_pos(buf, read_start, ret); - } - } while (desc->count && ret); - inode_unlock(file_inode(filp)); + buffer += ret; + written += ret; + count -= ret; - return desc->written; -} + relay_file_read_consume(buf, read_start, ret); + *ppos = relay_file_read_end_pos(buf, read_start, ret); + } while (count); + inode_unlock(file_inode(filp)); -static ssize_t relay_file_read(struct file *filp, - char __user *buffer, - size_t count, - loff_t *ppos) -{ - read_descriptor_t desc; - desc.written = 0; - desc.count = count; - desc.arg.buf = buffer; - desc.error = 0; - return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc); + return written; } static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) @@ -1377,12 +1326,3 @@ const struct file_operations relay_file_operations = { .splice_read = relay_file_splice_read, }; EXPORT_SYMBOL_GPL(relay_file_operations); - -static __init int relay_init(void) -{ - - hotcpu_notifier(relay_hotcpu_callback, 0); - return 0; -} - -early_initcall(relay_init); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a906f20fba7..94732d1ab00a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -581,6 +581,8 @@ static bool wake_up_full_nohz_cpu(int cpu) * If needed we can still optimize that later with an * empty IRQ. */ + if (cpu_is_offline(cpu)) + return true; /* Don't try to wake offline CPUs. */ if (tick_nohz_full_cpu(cpu)) { if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) @@ -591,6 +593,11 @@ static bool wake_up_full_nohz_cpu(int cpu) return false; } +/* + * Wake up the specified CPU. If the CPU is going offline, it is the + * caller's responsibility to deal with the lost wakeup, for example, + * by hooking into the CPU_DEAD notifier like timers and hrtimers do. + */ void wake_up_nohz_cpu(int cpu) { if (!wake_up_full_nohz_cpu(cpu)) @@ -1063,8 +1070,12 @@ static int migration_cpu_stop(void *data) * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ - if (task_rq(p) == rq && task_on_rq_queued(p)) - rq = __migrate_task(rq, p, arg->dest_cpu); + if (task_rq(p) == rq) { + if (task_on_rq_queued(p)) + rq = __migrate_task(rq, p, arg->dest_cpu); + else + p->wake_cpu = arg->dest_cpu; + } raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); @@ -1105,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); + if (running) + set_curr_task(rq, p); } /* @@ -1265,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) /* * Task isn't running anymore; make it appear like we migrated * it before it went to sleep. This means on wakeup we make the - * previous cpu our targer instead of where it really is. + * previous cpu our target instead of where it really is. 
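[Editor's aside] The do_set_cpus_allowed() hunk a few hunks above reorders the tail of the usual "take the task out, change it, put it back" sequence so the task is re-enqueued before set_curr_task() runs. The dequeue half is not visible in this diff; reconstructed from the common pattern, a hedged sketch looks roughly like this (schematic only, it assumes the scheduler-internal helpers and locking from kernel/sched/sched.h):

/* Caller holds p->pi_lock and rq->lock. */
static void change_task_attr(struct rq *rq, struct task_struct *p)
{
        bool queued  = task_on_rq_queued(p);
        bool running = task_current(rq, p);

        if (queued)
                dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);

        /* ... modify p->cpus_allowed, priority, class, etc. ... */

        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE);
        if (running)
                set_curr_task(rq, p);
}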
*/ p->wake_cpu = cpu; } @@ -1629,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { -#ifdef CONFIG_SCHEDSTATS - struct rq *rq = this_rq(); + struct rq *rq; -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); + if (!schedstat_enabled()) + return; - if (cpu == this_cpu) { - schedstat_inc(rq, ttwu_local); - schedstat_inc(p, se.statistics.nr_wakeups_local); + rq = this_rq(); + +#ifdef CONFIG_SMP + if (cpu == rq->cpu) { + schedstat_inc(rq->ttwu_local); + schedstat_inc(p->se.statistics.nr_wakeups_local); } else { struct sched_domain *sd; - schedstat_inc(p, se.statistics.nr_wakeups_remote); + schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); - for_each_domain(this_cpu, sd) { + for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); + schedstat_inc(sd->ttwu_wake_remote); break; } } @@ -1653,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - + schedstat_inc(p->se.statistics.nr_wakeups_migrate); #endif /* CONFIG_SMP */ - schedstat_inc(rq, ttwu_count); - schedstat_inc(p, se.statistics.nr_wakeups); + schedstat_inc(rq->ttwu_count); + schedstat_inc(p->se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); - -#endif /* CONFIG_SCHEDSTATS */ + schedstat_inc(p->se.statistics.nr_wakeups_sync); } static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) @@ -2016,6 +2026,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p); + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] P->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. + */ + smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; @@ -2062,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu, wake_flags); stat: - if (schedstat_enabled()) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2073,6 +2104,7 @@ out: /** * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened + * @cookie: context's cookie for pinning * * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not @@ -2111,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0, cookie); - if (schedstat_enabled()) - ttwu_stat(p, smp_processor_id(), 0); + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -2750,6 +2781,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) * task and put them back on the free list. */ kprobe_flush_task(prev); + + /* Task is done with its stack. 
*/ + put_task_stack(prev); + put_task_struct(prev); } @@ -3170,6 +3205,9 @@ static inline void preempt_latency_stop(int val) { } */ static noinline void __schedule_bug(struct task_struct *prev) { + /* Save this before calling printk(), since that will clobber it */ + unsigned long preempt_disable_ip = get_preempt_disable_ip(current); + if (oops_in_progress) return; @@ -3180,13 +3218,12 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif if (panic_on_warn) panic("scheduling while atomic\n"); @@ -3212,7 +3249,7 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_count); + schedstat_inc(this_rq()->sched_count); } /* @@ -3305,17 +3342,6 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - /* - * do_exit() calls schedule() with preemption disabled as an exception; - * however we must fix that up, otherwise the next task will see an - * inconsistent (higher) preempt count. - * - * It also avoids the below schedule_debug() test from complaining - * about this. - */ - if (unlikely(prev->state == TASK_DEAD)) - preempt_enable_no_resched_notrace(); - schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3381,7 +3407,33 @@ static void __sched notrace __schedule(bool preempt) balance_callback(rq); } -STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ + +void __noreturn do_task_dead(void) +{ + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNNING. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(&current->pi_lock); + + /* causes final put_task_struct in finish_task_switch(). */ + __set_current_state(TASK_DEAD); + current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ + __schedule(false); + BUG(); + /* Avoid "noreturn function does return". 
*/ + for (;;) + cpu_relax(); /* For when BUG is null */ +} static inline void sched_submit_work(struct task_struct *tsk) { @@ -3665,10 +3717,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, queue_flag); + if (running) + set_curr_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -3682,7 +3734,8 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, queued; + bool queued, running; + int old_prio, delta; struct rq_flags rf; struct rq *rq; @@ -3704,8 +3757,11 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } queued = task_on_rq_queued(p); + running = task_current(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); + if (running) + put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -3722,6 +3778,8 @@ void set_user_nice(struct task_struct *p, long nice) if (delta < 0 || (delta > 0 && task_running(rq, p))) resched_curr(rq); } + if (running) + set_curr_task(rq, p); out_unlock: task_rq_unlock(rq, p, &rf); } @@ -4221,8 +4279,6 @@ change: prev_class = p->sched_class; __setscheduler(rq, p, attr, pi); - if (running) - p->sched_class->set_curr_task(rq); if (queued) { /* * We enqueue to tail when the priority of a task is @@ -4233,6 +4289,8 @@ change: enqueue_task(rq, p, queue_flags); } + if (running) + set_curr_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ @@ -4824,7 +4882,7 @@ SYSCALL_DEFINE0(sched_yield) { struct rq *rq = this_rq_lock(); - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); /* @@ -4841,6 +4899,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +#ifndef CONFIG_PREEMPT int __sched _cond_resched(void) { if (should_resched(0)) { @@ -4850,6 +4909,7 @@ int __sched _cond_resched(void) return 0; } EXPORT_SYMBOL(_cond_resched); +#endif /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -4975,7 +5035,7 @@ again: yielded = curr->sched_class->yield_to_task(rq, p, preempt); if (yielded) { - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); /* * Make p's CPU reschedule; pick_next_entity takes care of * fairness. 
@@ -5395,10 +5455,10 @@ void sched_setnuma(struct task_struct *p, int nid) p->numa_preferred_nid = nid; - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); + if (running) + set_curr_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -5695,6 +5755,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } } #else /* !CONFIG_SCHED_DEBUG */ + +# define sched_debug_enabled 0 # define sched_domain_debug(sd, cpu) do { } while (0) static inline bool sched_debug(void) { @@ -5713,6 +5775,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) @@ -5743,6 +5806,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | @@ -5887,10 +5951,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) } while (sg != first); } -static void free_sched_domain(struct rcu_head *rcu) +static void destroy_sched_domain(struct sched_domain *sd) { - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - /* * If its an overlapping domain it has private groups, iterate and * nuke them all. @@ -5901,18 +5963,26 @@ static void free_sched_domain(struct rcu_head *rcu) kfree(sd->groups->sgc); kfree(sd->groups); } + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) + kfree(sd->shared); kfree(sd); } -static void destroy_sched_domain(struct sched_domain *sd, int cpu) +static void destroy_sched_domains_rcu(struct rcu_head *rcu) { - call_rcu(&sd->rcu, free_sched_domain); + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + + while (sd) { + struct sched_domain *parent = sd->parent; + destroy_sched_domain(sd); + sd = parent; + } } -static void destroy_sched_domains(struct sched_domain *sd, int cpu) +static void destroy_sched_domains(struct sched_domain *sd) { - for (; sd; sd = sd->parent) - destroy_sched_domain(sd, cpu); + if (sd) + call_rcu(&sd->rcu, destroy_sched_domains_rcu); } /* @@ -5927,14 +5997,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { + struct sched_domain_shared *sds = NULL; struct sched_domain *sd; - struct sched_domain *busy_sd = NULL; int id = cpu; int size = 1; @@ -5942,13 +6012,13 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - busy_sd = sd->parent; /* sd_busy */ + sds = sd->shared; } - rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -5984,7 +6054,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) */ if (parent->flags & SD_PREFER_SIBLING) tmp->flags |= 
SD_PREFER_SIBLING; - destroy_sched_domain(parent, cpu); + destroy_sched_domain(parent); } else tmp = tmp->parent; } @@ -5992,7 +6062,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd && sd_degenerate(sd)) { tmp = sd; sd = sd->parent; - destroy_sched_domain(tmp, cpu); + destroy_sched_domain(tmp); if (sd) sd->child = NULL; } @@ -6002,7 +6072,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) rq_attach_root(rq, rd); tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); - destroy_sched_domains(tmp, cpu); + destroy_sched_domains(tmp); update_top_cache_domain(cpu); } @@ -6245,7 +6315,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) return; update_group_capacity(sd, cpu); - atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -6333,6 +6402,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) + *per_cpu_ptr(sdd->sds, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; @@ -6352,26 +6424,37 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain + * These flags are purely descriptive of the topology and do not prescribe + * behaviour. Behaviour is artificial and mapped in the below sd_init() + * function: + * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies * - * Odd one out: - * SD_ASYM_PACKING - describes SMT quirks + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: + * + * SD_ASYM_PACKING - describes SMT quirks */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, + const struct cpumask *cpu_map, + struct sched_domain *child, int cpu) { - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int sd_weight, sd_flags = 0; + struct sd_data *sdd = &tl->data; + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + int sd_id, sd_weight, sd_flags = 0; #ifdef CONFIG_NUMA /* @@ -6420,15 +6503,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, + .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, #endif }; + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + sd_id = cpumask_first(sched_domain_span(sd)); + /* * Convert topological properties into behaviour. 
*/ + if (sd->flags & SD_ASYM_CPUCAPACITY) { + struct sched_domain *t = sd; + + for_each_lower_domain(t) + t->flags |= SD_BALANCE_WAKE; + } + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; @@ -6460,7 +6554,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) sd->idle_idx = 1; } - sd->private = &tl->data; + /* + * For all levels sharing cache; connect a sched_domain_shared + * instance. + */ + if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->shared = *per_cpu_ptr(sdd->sds, sd_id); + atomic_inc(&sd->shared->ref); + atomic_set(&sd->shared->nr_busy_cpus, sd_weight); + } + + sd->private = sdd; return sd; } @@ -6487,6 +6591,9 @@ static struct sched_domain_topology_level *sched_domain_topology = void set_sched_topology(struct sched_domain_topology_level *tl) { + if (WARN_ON_ONCE(sched_smp_initialized)) + return; + sched_domain_topology = tl; } @@ -6767,6 +6874,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sd) return -ENOMEM; + sdd->sds = alloc_percpu(struct sched_domain_shared *); + if (!sdd->sds) + return -ENOMEM; + sdd->sg = alloc_percpu(struct sched_group *); if (!sdd->sg) return -ENOMEM; @@ -6777,6 +6888,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { struct sched_domain *sd; + struct sched_domain_shared *sds; struct sched_group *sg; struct sched_group_capacity *sgc; @@ -6787,6 +6899,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sd, j) = sd; + sds = kzalloc_node(sizeof(struct sched_domain_shared), + GFP_KERNEL, cpu_to_node(j)); + if (!sds) + return -ENOMEM; + + *per_cpu_ptr(sdd->sds, j) = sds; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sg) @@ -6826,6 +6945,8 @@ static void __sdt_free(const struct cpumask *cpu_map) kfree(*per_cpu_ptr(sdd->sd, j)); } + if (sdd->sds) + kfree(*per_cpu_ptr(sdd->sds, j)); if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); if (sdd->sgc) @@ -6833,6 +6954,8 @@ static void __sdt_free(const struct cpumask *cpu_map) } free_percpu(sdd->sd); sdd->sd = NULL; + free_percpu(sdd->sds); + sdd->sds = NULL; free_percpu(sdd->sg); sdd->sg = NULL; free_percpu(sdd->sgc); @@ -6844,16 +6967,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; + struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; - sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { @@ -6884,6 +7003,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; + struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -6934,11 +7054,22 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); + + /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ + if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) + WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); + cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); + if (rq 
&& sched_debug_enabled) { + pr_info("span: %*pbl (max cpu_capacity = %lu)\n", + cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); + } + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); @@ -7297,6 +7428,22 @@ int sched_cpu_dying(unsigned int cpu) } #endif +#ifdef CONFIG_SCHED_SMT +DEFINE_STATIC_KEY_FALSE(sched_smt_present); + +static void sched_init_smt(void) +{ + /* + * We've enumerated all CPUs and will assume that if any CPU + * has SMT siblings, CPU0 will too. + */ + if (cpumask_weight(cpu_smt_mask(0)) > 1) + static_branch_enable(&sched_smt_present); +} +#else +static inline void sched_init_smt(void) { } +#endif + void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; @@ -7326,6 +7473,9 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + + sched_init_smt(); + sched_smp_initialized = true; } @@ -7363,6 +7513,7 @@ static struct kmem_cache *task_group_cache __read_mostly; #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); +DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); void __init sched_init(void) { @@ -7399,6 +7550,8 @@ void __init sched_init(void) for_each_possible_cpu(i) { per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); + per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_CPUMASK_OFFSTACK */ @@ -7501,10 +7654,6 @@ void __init sched_init(void) set_load_weight(&init_task); -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&init_task.preempt_notifiers); -#endif - /* * The boot idle thread does lazy MMU switching as well: */ @@ -7512,11 +7661,6 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current); /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - - /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again @@ -7570,6 +7714,7 @@ EXPORT_SYMBOL(__might_sleep); void ___might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + unsigned long preempt_disable_ip; rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && @@ -7580,6 +7725,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; + /* Save this before calling printk(), since that will clobber it */ + preempt_disable_ip = get_preempt_disable_ip(current); + printk(KERN_ERR "BUG: sleeping function called from invalid context at %s:%d\n", file, line); @@ -7594,14 +7742,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -#ifdef CONFIG_DEBUG_PREEMPT - if (!preempt_count_equals(preempt_offset)) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL(___might_sleep); #endif @@ -7622,12 +7770,10 @@ void normalize_rt_tasks(void) if (p->flags & PF_KTHREAD) continue; - p->se.exec_start = 0; -#ifdef CONFIG_SCHEDSTATS - p->se.statistics.wait_start = 0; - p->se.statistics.sleep_start = 0; - p->se.statistics.block_start = 0; -#endif + p->se.exec_start = 0; + schedstat_set(p->se.statistics.wait_start, 0); + schedstat_set(p->se.statistics.sleep_start, 0); + schedstat_set(p->se.statistics.block_start, 0); if (!dl_task(p) && !rt_task(p)) { /* @@ -7688,7 +7834,7 @@ struct task_struct *curr_task(int cpu) * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! */ -void set_curr_task(int cpu, struct task_struct *p) +void ia64_set_curr_task(int cpu, struct task_struct *p) { cpu_curr(cpu) = p; } @@ -7819,10 +7965,10 @@ void sched_move_task(struct task_struct *tsk) sched_change_group(tsk, TASK_MOVE_GROUP); - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); + if (unlikely(running)) + set_curr_task(rq, tsk); task_rq_unlock(rq, tsk, &rf); } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index d4184498c9f5..e73119013c53 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -31,56 +31,81 @@ static inline int right_child(int i) return (i << 1) + 2; } -static void cpudl_exchange(struct cpudl *cp, int a, int b) +static void cpudl_heapify_down(struct cpudl *cp, int idx) { - int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; + int l, r, largest; - swap(cp->elements[a].cpu, cp->elements[b].cpu); - swap(cp->elements[a].dl , cp->elements[b].dl ); + int orig_cpu = cp->elements[idx].cpu; + u64 orig_dl = cp->elements[idx].dl; - swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); -} - -static void cpudl_heapify(struct cpudl *cp, int idx) -{ - int l, r, largest; + if (left_child(idx) >= cp->size) + return; /* adapted from lib/prio_heap.c */ while(1) { + u64 largest_dl; l = left_child(idx); r = right_child(idx); largest = idx; + largest_dl = orig_dl; - if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, - cp->elements[l].dl)) + if ((l < cp->size) && dl_time_before(orig_dl, + cp->elements[l].dl)) { largest = l; - if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, - cp->elements[r].dl)) + largest_dl = cp->elements[l].dl; + } + if ((r < cp->size) && dl_time_before(largest_dl, + cp->elements[r].dl)) largest = r; + if (largest == idx) break; - /* Push idx down the heap one level and bump one up */ - cpudl_exchange(cp, largest, idx); + /* pull largest child onto idx */ + 
cp->elements[idx].cpu = cp->elements[largest].cpu; + cp->elements[idx].dl = cp->elements[largest].dl; + cp->elements[cp->elements[idx].cpu].idx = idx; idx = largest; } + /* actual push down of saved original values orig_* */ + cp->elements[idx].cpu = orig_cpu; + cp->elements[idx].dl = orig_dl; + cp->elements[cp->elements[idx].cpu].idx = idx; } -static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +static void cpudl_heapify_up(struct cpudl *cp, int idx) { - WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); + int p; - if (dl_time_before(new_dl, cp->elements[idx].dl)) { - cp->elements[idx].dl = new_dl; - cpudl_heapify(cp, idx); - } else { - cp->elements[idx].dl = new_dl; - while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, - cp->elements[idx].dl)) { - cpudl_exchange(cp, idx, parent(idx)); - idx = parent(idx); - } - } + int orig_cpu = cp->elements[idx].cpu; + u64 orig_dl = cp->elements[idx].dl; + + if (idx == 0) + return; + + do { + p = parent(idx); + if (dl_time_before(orig_dl, cp->elements[p].dl)) + break; + /* pull parent onto idx */ + cp->elements[idx].cpu = cp->elements[p].cpu; + cp->elements[idx].dl = cp->elements[p].dl; + cp->elements[cp->elements[idx].cpu].idx = idx; + idx = p; + } while (idx != 0); + /* actual push up of saved original values orig_* */ + cp->elements[idx].cpu = orig_cpu; + cp->elements[idx].dl = orig_dl; + cp->elements[cp->elements[idx].cpu].idx = idx; +} + +static void cpudl_heapify(struct cpudl *cp, int idx) +{ + if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) + cpudl_heapify_up(cp, idx); + else + cpudl_heapify_down(cp, idx); } static inline int cpudl_maximum(struct cpudl *cp) @@ -120,16 +145,15 @@ out: } /* - * cpudl_set - update the cpudl max-heap + * cpudl_clear - remove a cpu from the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target cpu - * @dl: the new earliest deadline for this cpu * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +void cpudl_clear(struct cpudl *cp, int cpu) { int old_idx, new_cpu; unsigned long flags; @@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); + old_idx = cp->elements[cpu].idx; - if (!is_valid) { - /* remove item */ - if (old_idx == IDX_INVALID) { - /* - * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is - * called for a CPU without -dl tasks running. - */ - goto out; - } + if (old_idx == IDX_INVALID) { + /* + * Nothing to remove if old_idx was invalid. + * This could happen if a rq_offline_dl is + * called for a CPU without -dl tasks running. 
+ */ + } else { new_cpu = cp->elements[cp->size - 1].cpu; cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].cpu = new_cpu; cp->size--; cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; - while (old_idx > 0 && dl_time_before( - cp->elements[parent(old_idx)].dl, - cp->elements[old_idx].dl)) { - cpudl_exchange(cp, old_idx, parent(old_idx)); - old_idx = parent(old_idx); - } - cpumask_set_cpu(cpu, cp->free_cpus); - cpudl_heapify(cp, old_idx); + cpudl_heapify(cp, old_idx); - goto out; + cpumask_set_cpu(cpu, cp->free_cpus); } + raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl) +{ + int old_idx; + unsigned long flags; + WARN_ON(!cpu_present(cpu)); + + raw_spin_lock_irqsave(&cp->lock, flags); + + old_idx = cp->elements[cpu].idx; if (old_idx == IDX_INVALID) { - cp->size++; - cp->elements[cp->size - 1].dl = dl; - cp->elements[cp->size - 1].cpu = cpu; - cp->elements[cpu].idx = cp->size - 1; - cpudl_change_key(cp, cp->size - 1, dl); + int new_idx = cp->size++; + cp->elements[new_idx].dl = dl; + cp->elements[new_idx].cpu = cpu; + cp->elements[cpu].idx = new_idx; + cpudl_heapify_up(cp, new_idx); cpumask_clear_cpu(cpu, cp->free_cpus); } else { - cpudl_change_key(cp, old_idx, dl); + cp->elements[old_idx].dl = dl; + cpudl_heapify(cp, old_idx); } -out: raw_spin_unlock_irqrestore(&cp->lock, flags); } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index fcbdf83fed7e..f7da8c55bba0 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -23,7 +23,8 @@ struct cpudl { #ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); -void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl); +void cpudl_clear(struct cpudl *cp, int cpu); int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 1141954e73b4..dbc51442ecbc 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); */ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, void (*func)(struct update_util_data *data, u64 time, - unsigned long util, unsigned long max)) + unsigned int flags)) { if (WARN_ON(!data || !func)) return; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a84641b222c1..69e06898997d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,7 +12,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpufreq.h> -#include <linux/module.h> #include <linux/slab.h> #include <trace/events/power.h> @@ -48,11 +47,14 @@ struct sugov_cpu { struct sugov_policy *sg_policy; unsigned int cached_raw_freq; + unsigned long iowait_boost; + unsigned long iowait_boost_max; + u64 last_update; /* The fields below are only needed when sharing a policy. 
*/ unsigned long util; unsigned long max; - u64 last_update; + unsigned int flags; }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -144,24 +146,75 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, return cpufreq_driver_resolve_freq(policy, freq); } +static void sugov_get_util(unsigned long *util, unsigned long *max) +{ + struct rq *rq = this_rq(); + unsigned long cfs_max; + + cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); + + *util = min(rq->cfs.avg.util_avg, cfs_max); + *max = cfs_max; +} + +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + unsigned int flags) +{ + if (flags & SCHED_CPUFREQ_IOWAIT) { + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else if (sg_cpu->iowait_boost) { + s64 delta_ns = time - sg_cpu->last_update; + + /* Clear iowait_boost if the CPU appears to have been idle. */ + if (delta_ns > TICK_NSEC) + sg_cpu->iowait_boost = 0; + } +} + +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, + unsigned long *max) +{ + unsigned long boost_util = sg_cpu->iowait_boost; + unsigned long boost_max = sg_cpu->iowait_boost_max; + + if (!boost_util) + return; + + if (*util * boost_max < *max * boost_util) { + *util = boost_util; + *max = boost_max; + } + sg_cpu->iowait_boost >>= 1; +} + static void sugov_update_single(struct update_util_data *hook, u64 time, - unsigned long util, unsigned long max) + unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; + unsigned long util, max; unsigned int next_f; + sugov_set_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; + if (!sugov_should_update_freq(sg_policy, time)) return; - next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : - get_next_freq(sg_cpu, util, max); + if (flags & SCHED_CPUFREQ_RT_DL) { + next_f = policy->cpuinfo.max_freq; + } else { + sugov_get_util(&util, &max); + sugov_iowait_boost(sg_cpu, &util, &max); + next_f = get_next_freq(sg_cpu, util, max); + } sugov_update_commit(sg_policy, time, next_f); } static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, - unsigned long util, unsigned long max) + unsigned long util, unsigned long max, + unsigned int flags) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; @@ -169,9 +222,11 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned int j; - if (util == ULONG_MAX) + if (flags & SCHED_CPUFREQ_RT_DL) return max_f; + sugov_iowait_boost(sg_cpu, &util, &max); + for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu; unsigned long j_util, j_max; @@ -186,41 +241,50 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, * frequency update and the time elapsed between the last update * of the CPU utilization and the last frequency update is long * enough, don't take the CPU into account as it probably is - * idle now. + * idle now (and clear iowait_boost for it). 
*/ delta_ns = last_freq_update_time - j_sg_cpu->last_update; - if (delta_ns > TICK_NSEC) + if (delta_ns > TICK_NSEC) { + j_sg_cpu->iowait_boost = 0; continue; - - j_util = j_sg_cpu->util; - if (j_util == ULONG_MAX) + } + if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) return max_f; + j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; if (j_util * max > j_max * util) { util = j_util; max = j_max; } + + sugov_iowait_boost(j_sg_cpu, &util, &max); } return get_next_freq(sg_cpu, util, max); } static void sugov_update_shared(struct update_util_data *hook, u64 time, - unsigned long util, unsigned long max) + unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned long util, max; unsigned int next_f; + sugov_get_util(&util, &max); + raw_spin_lock(&sg_policy->update_lock); sg_cpu->util = util; sg_cpu->max = max; + sg_cpu->flags = flags; + + sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_cpu, util, max); + next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); sugov_update_commit(sg_policy, time, next_f); } @@ -444,10 +508,13 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; if (policy_is_shared(policy)) { - sg_cpu->util = ULONG_MAX; + sg_cpu->util = 0; sg_cpu->max = 0; + sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->last_update = 0; sg_cpu->cached_raw_freq = 0; + sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, sugov_update_shared); } else { @@ -495,28 +562,15 @@ static struct cpufreq_governor schedutil_gov = { .limits = sugov_limits, }; -static int __init sugov_module_init(void) -{ - return cpufreq_register_governor(&schedutil_gov); -} - -static void __exit sugov_module_exit(void) -{ - cpufreq_unregister_governor(&schedutil_gov); -} - -MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>"); -MODULE_DESCRIPTION("Utilization-based CPU frequency selection"); -MODULE_LICENSE("GPL"); - #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL struct cpufreq_governor *cpufreq_default_governor(void) { return &schedutil_gov; } - -fs_initcall(sugov_module_init); -#else -module_init(sugov_module_init); #endif -module_exit(sugov_module_exit); + +static int __init sugov_register(void) +{ + return cpufreq_register_governor(&schedutil_gov); +} +fs_initcall(sugov_register); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a846cf89eb96..5ebee3164e64 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -23,10 +23,8 @@ * task when irq is in progress while we read rq->clock. That is a worthy * compromise in place of having locks on each irq in account_system_time. */ -DEFINE_PER_CPU(u64, cpu_hardirq_time); -DEFINE_PER_CPU(u64, cpu_softirq_time); +DEFINE_PER_CPU(struct irqtime, cpu_irqtime); -static DEFINE_PER_CPU(u64, irq_start_time); static int sched_clock_irqtime; void enable_sched_clock_irqtime(void) @@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void) sched_clock_irqtime = 0; } -#ifndef CONFIG_64BIT -DEFINE_PER_CPU(seqcount_t, irq_time_seq); -#endif /* CONFIG_64BIT */ - /* * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. 
*/ void irqtime_account_irq(struct task_struct *curr) { + struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); s64 delta; int cpu; @@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr) return; cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); + delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; + irqtime->irq_start_time += delta; - irq_time_write_begin(); + u64_stats_update_begin(&irqtime->sync); /* * We do not account for softirq time from ksoftirqd here. * We want to continue accounting softirq time to ksoftirqd thread @@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr) * that do not consume any time, but still wants to run. */ if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); + irqtime->hardirq_time += delta; else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); + irqtime->softirq_time += delta; - irq_time_write_end(); + u64_stats_update_end(&irqtime->sync); } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static cputime_t irqtime_account_hi_update(cputime_t maxtime) +static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; cputime_t irq_cputime; - local_irq_save(flags); - irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - - cpustat[CPUTIME_IRQ]; + irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; irq_cputime = min(irq_cputime, maxtime); - cpustat[CPUTIME_IRQ] += irq_cputime; - local_irq_restore(flags); + cpustat[idx] += irq_cputime; + return irq_cputime; } -static cputime_t irqtime_account_si_update(cputime_t maxtime) +static cputime_t irqtime_account_hi_update(cputime_t maxtime) { - u64 *cpustat = kcpustat_this_cpu->cpustat; - unsigned long flags; - cputime_t softirq_cputime; + return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), + CPUTIME_IRQ, maxtime); +} - local_irq_save(flags); - softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - - cpustat[CPUTIME_SOFTIRQ]; - softirq_cputime = min(softirq_cputime, maxtime); - cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; - local_irq_restore(flags); - return softirq_cputime; +static cputime_t irqtime_account_si_update(cputime_t maxtime) +{ + return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), + CPUTIME_SOFTIRQ, maxtime); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ @@ -295,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max) { cputime_t accounted; + /* Shall be converted to a lockdep-enabled lightweight check */ + WARN_ON_ONCE(!irqs_disabled()); + accounted = steal_account_process_time(max); if (accounted < max) @@ -306,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max) return accounted; } +#ifdef CONFIG_64BIT +static inline u64 read_sum_exec_runtime(struct task_struct *t) +{ + return t->se.sum_exec_runtime; +} +#else +static u64 read_sum_exec_runtime(struct task_struct *t) +{ + u64 ns; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(t, &rf); + ns = t->se.sum_exec_runtime; + task_rq_unlock(rq, t, &rf); + + return ns; +} +#endif + /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live * tasks (sum on group iteration) belonging to @tsk's group. 
@@ -318,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) unsigned int seq, nextseq; unsigned long flags; + /* + * Update current task runtime to account pending time since last + * scheduler action or thread_group_cputime() call. This thread group + * might have other running tasks on different CPUs, but updating + * their runtime can affect syscall performance, so we skip account + * those pending times and rely only on values updated on tick or + * other scheduler action. + */ + if (same_thread_group(current, tsk)) + (void) task_sched_runtime(current); + rcu_read_lock(); /* Attempt a lockless read on the first round. */ nextseq = 0; @@ -332,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); + times->sum_exec_runtime += read_sum_exec_runtime(t); } /* If lockless access failed, take the lock. */ nextseq = 1; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1ce8867283dc..37e2449186c4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) { struct rq *later_rq = NULL; - bool fallback = false; later_rq = find_lock_later_rq(p, rq); - if (!later_rq) { int cpu; @@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * If we cannot preempt any rq, fall back to pick any * online cpu. */ - fallback = true; cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); if (cpu >= nr_cpu_ids) { /* @@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p double_lock_balance(rq, later_rq); } - /* - * By now the task is replenished and enqueued; migrate it. - */ - deactivate_task(rq, p, 0); set_task_cpu(p, later_rq->cpu); - activate_task(later_rq, p, 0); - - if (!fallback) - resched_curr(later_rq); - double_unlock_balance(later_rq, rq); return later_rq; @@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, * one, and to (try to!) reconcile itself with its own scheduling * parameters. */ -static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + WARN_ON(dl_se->dl_boosted); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; + dl_se->runtime = dl_se->dl_runtime; } /* @@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) goto unlock; } - enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_curr(rq); - #ifdef CONFIG_SMP - /* - * Perform balancing operations here; after the replenishments. We - * cannot drop rq->lock before this, otherwise the assertion in - * start_dl_timer() about not missing updates is not true. 
- * - * If we find that the rq the task was on is no longer available, we - * need to select a new rq. - * - * XXX figure out if select_task_rq_dl() deals with offline cpus. - */ if (unlikely(!rq->online)) { + /* + * If the runqueue is no longer available, migrate the + * task elsewhere. This necessarily changes rq. + */ lockdep_unpin_lock(&rq->lock, rf.cookie); rq = dl_task_offline_migration(rq, p); rf.cookie = lockdep_pin_lock(&rq->lock); + + /* + * Now that the task has been migrated to the new RQ and we + * have that locked, proceed as normal and enqueue the task + * there. + */ } +#endif + + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); +#ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, check if we need * to kick someone away. @@ -735,9 +725,8 @@ static void update_curr_dl(struct rq *rq) return; } - /* kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -798,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { dl_rq->earliest_dl.curr = deadline; - cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } } @@ -813,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear(&rq->rd->cpudl, rq->cpu); } else { struct rb_node *leftmost = dl_rq->rb_leftmost; struct sched_dl_entity *entry; entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; - cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); } } @@ -1671,7 +1660,7 @@ static void rq_online_dl(struct rq *rq) cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) - cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); } /* Assumes rq->lock is held */ @@ -1680,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear(&rq->rd->cpudl, rq->cpu); cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } @@ -1723,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + + /* If p is not queued we will update its parameters at next wakeup. */ + if (!task_on_rq_queued(p)) + return; + + /* + * If p is boosted we already updated its params in + * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), + * p's deadline being now already after rq_clock(rq). 
+ */ if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(&p->dl, &p->dl); + setup_new_dl_entity(&p->dl); - if (task_on_rq_queued(p) && rq->curr != p) { + if (rq->curr != p) { #ifdef CONFIG_SMP if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..13935886a471 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #define P(F) \ SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define P_SCHEDSTAT(F) \ + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) #define PN(F) \ SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN_SCHEDSTAT(F) \ + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) if (!se) return; @@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->exec_start); PN(se->vruntime); PN(se->sum_exec_runtime); -#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + PN_SCHEDSTAT(se->statistics.wait_start); + PN_SCHEDSTAT(se->statistics.sleep_start); + PN_SCHEDSTAT(se->statistics.block_start); + PN_SCHEDSTAT(se->statistics.sleep_max); + PN_SCHEDSTAT(se->statistics.block_max); + PN_SCHEDSTAT(se->statistics.exec_max); + PN_SCHEDSTAT(se->statistics.slice_max); + PN_SCHEDSTAT(se->statistics.wait_max); + PN_SCHEDSTAT(se->statistics.wait_sum); + P_SCHEDSTAT(se->statistics.wait_count); } -#endif P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); #endif + +#undef PN_SCHEDSTAT #undef PN +#undef P_SCHEDSTAT #undef P } #endif @@ -429,9 +434,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) p->prio); SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), + SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); @@ -626,9 +631,7 @@ do { \ #undef P64 #endif -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); - +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { P(yld_count); P(sched_count); @@ -636,9 +639,8 @@ do { \ P(ttwu_count); P(ttwu_local); } - #undef P -#endif + spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); @@ -868,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) #define P(F) \ SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define P_SCHEDSTAT(F) \ + SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F)) #define __PN(F) \ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) #define PN(F) \ SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) +#define PN_SCHEDSTAT(F) \ + SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, 
SPLIT_NS((long long)schedstat_val(p->F))) PN(se.exec_start); PN(se.vruntime); @@ -881,37 +887,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.nr_migrations); -#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); + PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); + PN_SCHEDSTAT(se.statistics.wait_start); + PN_SCHEDSTAT(se.statistics.sleep_start); + PN_SCHEDSTAT(se.statistics.block_start); + PN_SCHEDSTAT(se.statistics.sleep_max); + PN_SCHEDSTAT(se.statistics.block_max); + PN_SCHEDSTAT(se.statistics.exec_max); + PN_SCHEDSTAT(se.statistics.slice_max); + PN_SCHEDSTAT(se.statistics.wait_max); + PN_SCHEDSTAT(se.statistics.wait_sum); + P_SCHEDSTAT(se.statistics.wait_count); + PN_SCHEDSTAT(se.statistics.iowait_sum); + P_SCHEDSTAT(se.statistics.iowait_count); + P_SCHEDSTAT(se.statistics.nr_migrations_cold); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); + P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); + P_SCHEDSTAT(se.statistics.nr_forced_migrations); + P_SCHEDSTAT(se.statistics.nr_wakeups); + P_SCHEDSTAT(se.statistics.nr_wakeups_sync); + P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); + P_SCHEDSTAT(se.statistics.nr_wakeups_local); + P_SCHEDSTAT(se.statistics.nr_wakeups_remote); + P_SCHEDSTAT(se.statistics.nr_wakeups_affine); + P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); + P_SCHEDSTAT(se.statistics.nr_wakeups_passive); + P_SCHEDSTAT(se.statistics.nr_wakeups_idle); avg_atom = p->se.sum_exec_runtime; if (nr_switches) @@ -930,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) __PN(avg_atom); __PN(avg_per_cpu); } -#endif + __P(nr_switches); SEQ_printf(m, "%-45s:%21Ld\n", "nr_voluntary_switches", (long long)p->nvcsw); @@ -947,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #endif P(policy); P(prio); +#undef PN_SCHEDSTAT #undef PN #undef __PN +#undef P_SCHEDSTAT #undef P #undef __P diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..502e95a6e927 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * 1024 < capacity * margin + */ +unsigned int capacity_margin = 1280; /* ~20% */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -256,9 
+262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) static inline struct task_struct *task_of(struct sched_entity *se) { -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(!entity_is_task(se)); -#endif + SCHED_WARN_ON(!entity_is_task(se)); return container_of(se, struct task_struct, se); } @@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a, static void update_min_vruntime(struct cfs_rq *cfs_rq) { + struct sched_entity *curr = cfs_rq->curr; + u64 vruntime = cfs_rq->min_vruntime; - if (cfs_rq->curr) - vruntime = cfs_rq->curr->vruntime; + if (curr) { + if (curr->on_rq) + vruntime = curr->vruntime; + else + curr = NULL; + } if (cfs_rq->rb_leftmost) { struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, struct sched_entity, run_node); - if (!cfs_rq->curr) + if (!curr) vruntime = se->vruntime; else vruntime = min_vruntime(vruntime, se->vruntime); @@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); /* @@ -726,7 +736,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -759,10 +768,9 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } #else /* !CONFIG_SMP */ @@ -799,7 +807,7 @@ static void update_curr(struct cfs_rq *cfs_rq) max(delta_exec, curr->statistics.exec_max)); curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq, exec_clock, delta_exec); + schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); update_min_vruntime(cfs_rq); @@ -820,26 +828,34 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } -#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 wait_start = rq_clock(rq_of(cfs_rq)); + u64 wait_start, prev_wait_start; + + if (!schedstat_enabled()) + return; + + wait_start = rq_clock(rq_of(cfs_rq)); + prev_wait_start = schedstat_val(se->statistics.wait_start); if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > se->statistics.wait_start)) - wait_start -= se->statistics.wait_start; + likely(wait_start > prev_wait_start)) + wait_start -= prev_wait_start; - se->statistics.wait_start = wait_start; + schedstat_set(se->statistics.wait_start, wait_start); } -static void +static inline void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *p; u64 delta; - delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + if (!schedstat_enabled()) + return; + + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); if (entity_is_task(se)) { p = task_of(se); @@ -849,35 +865,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * time stamp can be adjusted to accumulate wait time * prior to migration. 
*/ - se->statistics.wait_start = delta; + schedstat_set(se->statistics.wait_start, delta); return; } trace_sched_stat_wait(p, delta); } - se->statistics.wait_max = max(se->statistics.wait_max, delta); - se->statistics.wait_count++; - se->statistics.wait_sum += delta; - se->statistics.wait_start = 0; + schedstat_set(se->statistics.wait_max, + max(schedstat_val(se->statistics.wait_max), delta)); + schedstat_inc(se->statistics.wait_count); + schedstat_add(se->statistics.wait_sum, delta); + schedstat_set(se->statistics.wait_start, 0); +} + +static inline void +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *tsk = NULL; + u64 sleep_start, block_start; + + if (!schedstat_enabled()) + return; + + sleep_start = schedstat_val(se->statistics.sleep_start); + block_start = schedstat_val(se->statistics.block_start); + + if (entity_is_task(se)) + tsk = task_of(se); + + if (sleep_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) + schedstat_set(se->statistics.sleep_max, delta); + + schedstat_set(se->statistics.sleep_start, 0); + schedstat_add(se->statistics.sum_sleep_runtime, delta); + + if (tsk) { + account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } + } + if (block_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(se->statistics.block_max))) + schedstat_set(se->statistics.block_max, delta); + + schedstat_set(se->statistics.block_start, 0); + schedstat_add(se->statistics.sum_sleep_runtime, delta); + + if (tsk) { + if (tsk->in_iowait) { + schedstat_add(se->statistics.iowait_sum, delta); + schedstat_inc(se->statistics.iowait_count); + trace_sched_stat_iowait(tsk, delta); + } + + trace_sched_stat_blocked(tsk, delta); + + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + account_scheduler_latency(tsk, delta >> 10, 0); + } + } } /* * Task is being enqueued - update stats: */ static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + if (!schedstat_enabled()) + return; + /* * Are we enqueueing a waiting task? 
(for current tasks * a dequeue/enqueue event is a NOP) */ if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper(cfs_rq, se); } static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + + if (!schedstat_enabled()) + return; + /* * Mark the end of the wait period if dequeueing a * waiting task: @@ -885,40 +980,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + struct task_struct *tsk = task_of(se); - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } + if (tsk->state & TASK_INTERRUPTIBLE) + schedstat_set(se->statistics.sleep_start, + rq_clock(rq_of(cfs_rq))); + if (tsk->state & TASK_UNINTERRUPTIBLE) + schedstat_set(se->statistics.block_start, + rq_clock(rq_of(cfs_rq))); } - -} -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ } -static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) -{ -} -#endif - /* * We are picking a new current task - update its stats: */ @@ -1513,8 +1586,16 @@ balance: * One idle CPU per node is evaluated for a task numa move. * Call select_idle_sibling to maybe find a better one. */ - if (!cur) - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + if (!cur) { + /* + * select_idle_siblings() uses an per-cpu cpumask that + * can be used from IRQ context. + */ + local_irq_disable(); + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, + env->dst_cpu); + local_irq_enable(); + } assign: task_numa_assign(env, cur, imp); @@ -2292,7 +2373,7 @@ void task_numa_work(struct callback_head *work) unsigned long nr_pte_updates = 0; long pages, virtpages; - WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); work->next = work; /* protect against double add */ /* @@ -2803,9 +2884,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). +/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly). 
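A minimal userspace sketch of the differential update described in the update_tg_load_avg() comment above: each cfs_rq remembers the contribution it last propagated to the shared tg->load_avg and only touches the global value when the difference has grown. The field names mirror the kernel's, but the 1/64 "small change" cut-off is an assumption for this sketch and is not shown in the hunk.

#include <stdlib.h>

struct tg_example {
	long load_avg;			/* shared per-task-group sum */
};

struct cfs_rq_example {
	long avg_load_avg;		/* local average, updated frequently */
	long tg_load_avg_contrib;	/* value we last propagated to the tg */
};

static void update_tg_load_avg_sketch(struct tg_example *tg,
				      struct cfs_rq_example *cfs_rq, int force)
{
	long delta = cfs_rq->avg_load_avg - cfs_rq->tg_load_avg_contrib;

	/* Skip the shared (potentially contended) update while the change
	 * is small; assumed threshold for illustration only. */
	if (!force && labs(delta) <= cfs_rq->tg_load_avg_contrib / 64)
		return;

	tg->load_avg += delta;		/* the kernel adds the delta atomically */
	cfs_rq->tg_load_avg_contrib = cfs_rq->avg_load_avg;
}

Because only deltas are applied, tg->load_avg stays equal to the sum of the propagated contributions without ever iterating over the other cfs_rq's.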
*/ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -2875,12 +2968,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) { - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); - - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { - unsigned long max = rq->cpu_capacity_orig; - + if (&this_rq()->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -2897,8 +2985,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * See cpu_util(). */ - cpufreq_update_util(rq_clock(rq), - min(cfs_rq->avg.util_avg, max), max); + cpufreq_update_util(rq_of(cfs_rq), 0); } } @@ -2931,10 +3018,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed utilization. It is expected - * that one calls update_tg_load_avg() on this condition, but after you've - * modified the cfs_rq avg (attach/detach), such that we propagate the new - * avg up. + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -3159,10 +3246,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) static inline void update_load_avg(struct sched_entity *se, int not_used) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - struct rq *rq = rq_of(cfs_rq); - - cpufreq_trigger_update(rq_clock(rq)); + cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); } static inline void @@ -3183,68 +3267,6 @@ static inline int idle_balance(struct rq *rq) #endif /* CONFIG_SMP */ -static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHEDSTATS - struct task_struct *tsk = NULL; - - if (entity_is_task(se)) - tsk = task_of(se); - - if (se->statistics.sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.sleep_max)) - se->statistics.sleep_max = delta; - - se->statistics.sleep_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (se->statistics.block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > se->statistics.block_max)) - se->statistics.block_max = delta; - - se->statistics.block_start = 0; - se->statistics.sum_sleep_runtime += delta; - - if (tsk) { - if (tsk->in_iowait) { - se->statistics.iowait_sum += delta; - se->statistics.iowait_count++; - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } -#endif -} - static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHED_DEBUG @@ -3254,7 +3276,7 @@ 
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) d = -d; if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq, nr_spread_over); + schedstat_inc(cfs_rq->nr_spread_over); #endif } @@ -3371,17 +3393,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); - if (flags & ENQUEUE_WAKEUP) { + if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); - if (schedstat_enabled()) - enqueue_sleeper(cfs_rq, se); - } check_schedstat_required(); - if (schedstat_enabled()) { - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); - } + update_stats_enqueue(cfs_rq, se, flags); + check_spread(cfs_rq, se); if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3448,8 +3465,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - if (schedstat_enabled()) - update_stats_dequeue(cfs_rq, se, flags); + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3459,9 +3475,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) account_entity_dequeue(cfs_rq, se); /* - * Normalize the entity after updating the min_vruntime because the - * update can refer to the ->curr item and we need to reflect this - * movement in our normalized position. + * Normalize after update_curr(); which will also have moved + * min_vruntime if @se is the one holding it back. But before doing + * update_min_vruntime() again, which will discount @se's position and + * can move min_vruntime forward still more. */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; @@ -3469,8 +3486,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq); + + /* + * Now advance min_vruntime if @se was the entity holding it back, + * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be + * put back on, and if we advance min_vruntime, we'll be placed back + * further than we started -- ie. we'll be penalized. + */ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + update_min_vruntime(cfs_rq); } /* @@ -3523,25 +3548,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - if (schedstat_enabled()) - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; -#ifdef CONFIG_SCHEDSTATS + /* * Track our maximum slice length, if the CPU's load is at * least twice that of our own weight (i.e. 
dont track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->statistics.slice_max = max(se->statistics.slice_max, - se->sum_exec_runtime - se->prev_sum_exec_runtime); + schedstat_set(se->statistics.slice_max, + max((u64)schedstat_val(se->statistics.slice_max), + se->sum_exec_runtime - se->prev_sum_exec_runtime)); } -#endif + se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -3620,13 +3645,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - if (schedstat_enabled()) { - check_spread(cfs_rq, prev); - if (prev->on_rq) - update_stats_wait_start(cfs_rq, prev); - } + check_spread(cfs_rq, prev); if (prev->on_rq) { + update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ @@ -4456,9 +4478,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); - WARN_ON(task_rq(p) != rq); + SCHED_WARN_ON(task_rq(p) != rq); - if (cfs_rq->nr_running > 1) { + if (rq->cfs.h_nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; @@ -4509,6 +4531,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + /* + * If in_iowait is set, the code below may not trigger any cpufreq + * utilization updates, so do it here explicitly with the IOWAIT flag + * passed. + */ + if (p->in_iowait) + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); + for_each_sched_entity(se) { if (se->on_rq) break; @@ -4605,6 +4635,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +/* Working cpumask for: load_balance, load_balance_newidle. */ +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); + #ifdef CONFIG_NO_HZ_COMMON /* * per rq 'load' arrray crap; XXX kill this. 
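The enqueue_task_fair() hunk above sends an explicit SCHED_CPUFREQ_IOWAIT hint because a task waking from IO wait may not move the utilization numbers enough to trigger an update on its own, and cpufreq_update_this_cpu() (defined later in the kernel/sched/sched.h part of this patch) forwards it only when running on the runqueue's own CPU. A userspace sketch of that gating pattern; every name carrying an _example suffix, and the flag value, are stand-ins rather than the kernel's definitions:

#include <stdio.h>

#define SCHED_CPUFREQ_IOWAIT_EX	(1U << 0)	/* illustrative flag value */

struct rq_example { int cpu; };

static int this_cpu_example = 1;	/* stands in for smp_processor_id() */

static void cpufreq_update_util_example(struct rq_example *rq, unsigned int flags)
{
	/* In the kernel this forwards rq_clock(rq) and @flags to the governor hook. */
	printf("cpufreq update for cpu %d, flags %#x\n", rq->cpu, flags);
}

static void cpufreq_update_this_cpu_example(struct rq_example *rq, unsigned int flags)
{
	/* Only notify the governor when we are running on @rq's own CPU. */
	if (rq->cpu == this_cpu_example)
		cpufreq_update_util_example(rq, flags);
}

static void enqueue_iowait_hint_example(struct rq_example *rq, int task_in_iowait)
{
	/* Explicit hint for a task waking from IO wait, as in the hunk above. */
	if (task_in_iowait)
		cpufreq_update_this_cpu_example(rq, SCHED_CPUFREQ_IOWAIT_EX);
}

int main(void)
{
	struct rq_example local = { .cpu = 1 }, remote = { .cpu = 3 };

	enqueue_iowait_hint_example(&local, 1);		/* printed */
	enqueue_iowait_hint_example(&remote, 1);	/* filtered out */
	return 0;
}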
@@ -5006,9 +5041,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) * wl = S * s'_i; see (2) */ if (W > 0 && w < W) - wl = (w * (long)tg->shares) / W; + wl = (w * (long)scale_load_down(tg->shares)) / W; else - wl = tg->shares; + wl = scale_load_down(tg->shares); /* * Per the above, wl is the new se->load.weight value; since @@ -5091,18 +5126,18 @@ static int wake_wide(struct task_struct *p) return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, int sync) { s64 this_load, load; s64 this_eff_load, prev_eff_load; - int idx, this_cpu, prev_cpu; + int idx, this_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); @@ -5146,13 +5181,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) balanced = this_eff_load <= prev_eff_load; - schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); + schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (!balanced) return 0; - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); return 1; } @@ -5228,6 +5263,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { if (idle_cpu(i)) { @@ -5265,64 +5304,237 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } /* - * Try and locate an idle CPU in the sched_domain. + * Implement a for_each_cpu() variant that starts the scan at a given cpu + * (@start), and wraps around. + * + * This is used to scan for idle CPUs; such that not all CPUs looking for an + * idle CPU find the same CPU. The down-side is that tasks tend to cycle + * through the LLC domain. + * + * Especially tbench is found sensitive to this. + */ + +static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) +{ + int next; + +again: + next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); + + if (*wrapped) { + if (next >= start) + return nr_cpumask_bits; + } else { + if (next >= nr_cpumask_bits) { + *wrapped = 1; + n = -1; + goto again; + } + } + + return next; +} + +#define for_each_cpu_wrap(cpu, mask, start, wrap) \ + for ((wrap) = 0, (cpu) = (start)-1; \ + (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ + (cpu) < nr_cpumask_bits; ) + +#ifdef CONFIG_SCHED_SMT + +static inline void set_idle_cores(int cpu, int val) +{ + struct sched_domain_shared *sds; + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + WRITE_ONCE(sds->has_idle_cores, val); +} + +static inline bool test_idle_cores(int cpu, bool def) +{ + struct sched_domain_shared *sds; + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + return READ_ONCE(sds->has_idle_cores); + + return def; +} + +/* + * Scans the local SMT mask to see if the entire core is idle, and records this + * information in sd_llc_shared->has_idle_cores. 
+ * + * Since SMT siblings share all cache levels, inspecting this limited remote + * state should be fairly cheap. + */ +void __update_idle_core(struct rq *rq) +{ + int core = cpu_of(rq); + int cpu; + + rcu_read_lock(); + if (test_idle_cores(core, true)) + goto unlock; + + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (cpu == core) + continue; + + if (!idle_cpu(cpu)) + goto unlock; + } + + set_idle_cores(core, 1); +unlock: + rcu_read_unlock(); +} + +/* + * Scan the entire LLC domain for idle cores; this dynamically switches off if + * there are no idle cores left in the system; tracked through + * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. + */ +static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +{ + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + int core, cpu, wrap; + + if (!static_branch_likely(&sched_smt_present)) + return -1; + + if (!test_idle_cores(target, false)) + return -1; + + cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p)); + + for_each_cpu_wrap(core, cpus, target, wrap) { + bool idle = true; + + for_each_cpu(cpu, cpu_smt_mask(core)) { + cpumask_clear_cpu(cpu, cpus); + if (!idle_cpu(cpu)) + idle = false; + } + + if (idle) + return core; + } + + /* + * Failed to find an idle core; stop looking for one. + */ + set_idle_cores(target, 0); + + return -1; +} + +/* + * Scan the local SMT mask for idle CPUs. */ -static int select_idle_sibling(struct task_struct *p, int target) +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu; + + if (!static_branch_likely(&sched_smt_present)) + return -1; + + for_each_cpu(cpu, cpu_smt_mask(target)) { + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(cpu)) + return cpu; + } + + return -1; +} + +#else /* CONFIG_SCHED_SMT */ + +static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +{ + return -1; +} + +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +{ + return -1; +} + +#endif /* CONFIG_SCHED_SMT */ + +/* + * Scan the LLC domain for idle CPUs; this is dynamically regulated by + * comparing the average scan cost (tracked in sd->avg_scan_cost) against the + * average idle time for this rq (as found in rq->avg_idle). + */ +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) +{ + struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + u64 avg_idle = this_rq()->avg_idle; + u64 avg_cost = this_sd->avg_scan_cost; + u64 time, cost; + s64 delta; + int cpu, wrap; + + /* + * Due to large variance we need a large fuzz factor; hackbench in + * particularly is sensitive here. + */ + if ((avg_idle / 512) < avg_cost) + return -1; + + time = local_clock(); + + for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(cpu)) + break; + } + + time = local_clock() - time; + cost = this_sd->avg_scan_cost; + delta = (s64)(time - cost) / 8; + this_sd->avg_scan_cost += delta; + + return cpu; +} + +/* + * Try and locate an idle core/thread in the LLC cache domain. + */ +static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; - struct sched_group *sg; - int i = task_cpu(p); + int i; if (idle_cpu(target)) return target; /* - * If the prevous cpu is cache affine and idle, don't be stupid. 
+ * If the previous cpu is cache affine and idle, don't be stupid. */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + return prev; - /* - * Otherwise, iterate the domains and find an eligible idle cpu. - * - * A completely idle sched group at higher domains is more - * desirable than an idle group at a lower level, because lower - * domains have smaller groups and usually share hardware - * resources which causes tasks to contend on them, e.g. x86 - * hyperthread siblings in the lowest domain (SMT) can contend - * on the shared cpu pipeline. - * - * However, while we prefer idle groups at higher domains - * finding an idle cpu at the lowest domain is still better than - * returning 'target', which we've already established, isn't - * idle. - */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; - - /* Ensure the entire group is idle */ - for_each_cpu(i, sched_group_cpus(sg)) { - if (i == target || !idle_cpu(i)) - goto next; - } + if (!sd) + return target; + + i = select_idle_core(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + + i = select_idle_cpu(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + + i = select_idle_smt(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; - /* - * It doesn't matter which cpu we pick, the - * whole group is idle. - */ - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); - } -done: return target; } @@ -5360,6 +5572,32 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } +static inline int task_util(struct task_struct *p) +{ + return p->se.avg.util_avg; +} + +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. + */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ + long min_cap, max_cap; + + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; + + /* Minimum capacity is close to max, no need to abort wake_affine */ + if (max_cap - min_cap < max_cap >> 3) + return 0; + + return min_cap * 1024 < task_util(p) * capacity_margin; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5383,7 +5621,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) + && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); } rcu_read_lock(); @@ -5409,13 +5648,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ - new_cpu = select_idle_sibling(p, new_cpu); + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else while (sd) { struct sched_group *group; @@ -5939,7 +6178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * The adjacency matrix of the resulting graph is given by: * - * log_2 n + * log_2 n * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) * k = 0 * @@ -5985,7 +6224,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * * [XXX write more on how we solve this.. _after_ merging pjt's patches that * rewrite all of this once again.] - */ + */ static unsigned long __read_mostly max_load_balance_interval = HZ/10; @@ -6133,7 +6372,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { int cpu; - schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + schedstat_inc(p->se.statistics.nr_failed_migrations_affine); env->flags |= LBF_SOME_PINNED; @@ -6164,7 +6403,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { - schedstat_inc(p, se.statistics.nr_failed_migrations_running); + schedstat_inc(p->se.statistics.nr_failed_migrations_running); return 0; } @@ -6181,13 +6420,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->se.statistics.nr_forced_migrations); } return 1; } - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + schedstat_inc(p->se.statistics.nr_failed_migrations_hot); return 0; } @@ -6227,7 +6466,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) * so we can safely collect stats here rather than * inside detach_tasks(). */ - schedstat_inc(env->sd, lb_gained[env->idle]); + schedstat_inc(env->sd->lb_gained[env->idle]); return p; } return NULL; @@ -6319,7 +6558,7 @@ next: * so we can safely collect detach_one_task() stats here rather * than inside detach_one_task(). */ - schedstat_add(env->sd, lb_gained[env->idle], detached); + schedstat_add(env->sd->lb_gained[env->idle], detached); return detached; } @@ -6647,7 +6886,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) /* * !SD_OVERLAP domains can assume that child groups * span the current group. - */ + */ group = child->groups; do { @@ -7147,7 +7386,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; if (load_above_capacity > busiest->group_capacity) { load_above_capacity -= busiest->group_capacity; - load_above_capacity *= NICE_0_LOAD; + load_above_capacity *= scale_load_down(NICE_0_LOAD); load_above_capacity /= busiest->group_capacity; } else load_above_capacity = ~0UL; @@ -7354,9 +7593,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ #define MAX_PINNED_INTERVAL 512 -/* Working cpumask for load_balance and load_balance_newidle. 
*/ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); - static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; @@ -7460,7 +7696,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - schedstat_inc(sd, lb_count[idle]); + schedstat_inc(sd->lb_count[idle]); redo: if (!should_we_balance(&env)) { @@ -7470,19 +7706,19 @@ redo: group = find_busiest_group(&env); if (!group) { - schedstat_inc(sd, lb_nobusyg[idle]); + schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; } busiest = find_busiest_queue(&env, group); if (!busiest) { - schedstat_inc(sd, lb_nobusyq[idle]); + schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; } BUG_ON(busiest == env.dst_rq); - schedstat_add(sd, lb_imbalance[idle], env.imbalance); + schedstat_add(sd->lb_imbalance[idle], env.imbalance); env.src_cpu = busiest->cpu; env.src_rq = busiest; @@ -7589,7 +7825,7 @@ more_balance: } if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); + schedstat_inc(sd->lb_failed[idle]); /* * Increment the failure counter only on periodic balance. * We do not want newidle balance, which can be very @@ -7672,7 +7908,7 @@ out_all_pinned: * we can't migrate them. Let the imbalance flag set so parent level * can try to migrate them. */ - schedstat_inc(sd, lb_balanced[idle]); + schedstat_inc(sd->lb_balanced[idle]); sd->nr_balance_failed = 0; @@ -7704,11 +7940,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) } static inline void -update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +update_next_balance(struct sched_domain *sd, unsigned long *next_balance) { unsigned long interval, next; - interval = get_sd_balance_interval(sd, cpu_busy); + /* used by idle balance, so cpu_busy = 0 */ + interval = get_sd_balance_interval(sd, 0); next = sd->last_balance + interval; if (time_after(*next_balance, next)) @@ -7738,7 +7975,7 @@ static int idle_balance(struct rq *this_rq) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); rcu_read_unlock(); goto out; @@ -7756,7 +7993,7 @@ static int idle_balance(struct rq *this_rq) continue; if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); break; } @@ -7774,7 +8011,7 @@ static int idle_balance(struct rq *this_rq) curr_cost += domain_cost; } - update_next_balance(sd, 0, &next_balance); + update_next_balance(sd, &next_balance); /* * Stop searching for tasks to pull if there are @@ -7864,15 +8101,15 @@ static int active_load_balance_cpu_stop(void *data) .idle = CPU_IDLE, }; - schedstat_inc(sd, alb_count); + schedstat_inc(sd->alb_count); p = detach_one_task(&env); if (p) { - schedstat_inc(sd, alb_pushed); + schedstat_inc(sd->alb_pushed); /* Active balancing done, reset the failure counter. 
*/ sd->nr_balance_failed = 0; } else { - schedstat_inc(sd, alb_failed); + schedstat_inc(sd->alb_failed); } } rcu_read_unlock(); @@ -7964,13 +8201,13 @@ static inline void set_cpu_sd_state_busy(void) int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd || !sd->nohz_idle) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgc->nr_busy_cpus); + atomic_inc(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7981,13 +8218,13 @@ void set_cpu_sd_state_idle(void) int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd || sd->nohz_idle) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgc->nr_busy_cpus); + atomic_dec(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -8214,8 +8451,8 @@ end: static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; + struct sched_domain_shared *sds; struct sched_domain *sd; - struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; bool kick = false; @@ -8243,11 +8480,13 @@ static inline bool nohz_kick_needed(struct rq *rq) return true; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { - sgc = sd->groups->sgc; - nr_busy = atomic_read(&sgc->nr_busy_cpus); - + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + * XXX: write a coherent comment on why we do this. + * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com + */ + nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { kick = true; goto unlock; @@ -8441,7 +8680,6 @@ static void detach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; if (!vruntime_normalized(p)) { /* @@ -8453,10 +8691,9 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) @@ -8464,7 +8701,6 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8475,10 +8711,9 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); - if (tg_update) - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9fb873cfc75c..1d8718d5300d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -16,6 +16,9 @@ #include "sched.h" +/* Linker adds these: start and end of __cpuidle functions */ +extern char __cpuidle_text_start[], __cpuidle_text_end[]; + /** * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. 
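A toy model of the cpumask_next_wrap()/for_each_cpu_wrap() scan added in the fair.c hunk above, using a plain 8-bit mask instead of a struct cpumask: the scan starts at the caller's target CPU, walks upward, wraps at the end and stops after one lap, so concurrent wakers that start at different CPUs tend to pick different idle CPUs instead of all converging on the lowest-numbered one. Names and the CPU count are illustrative.

#include <stdio.h>

#define NR_CPUS_EXAMPLE 8

static int next_idle_wrap_example(unsigned int idle_mask, int start)
{
	for (int i = 0; i < NR_CPUS_EXAMPLE; i++) {
		int cpu = (start + i) % NR_CPUS_EXAMPLE;

		if (idle_mask & (1u << cpu))
			return cpu;		/* first idle CPU at or after @start */
	}
	return -1;				/* no idle CPU in the mask */
}

int main(void)
{
	unsigned int idle_mask = 0xA4;	/* CPUs 2, 5 and 7 idle */

	printf("waker near cpu 1 -> %d\n", next_idle_wrap_example(idle_mask, 1));	/* 2 */
	printf("waker near cpu 6 -> %d\n", next_idle_wrap_example(idle_mask, 6));	/* 7 */
	return 0;
}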
@@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) __setup("hlt", cpu_idle_nopoll_setup); #endif -static inline int cpu_idle_poll(void) +static noinline int __cpuidle cpu_idle_poll(void) { rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); @@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void) * * To use when the cpuidle framework cannot be used. */ -void default_idle_call(void) +void __cpuidle default_idle_call(void) { if (current_clr_polling_and_test()) { local_irq_enable(); @@ -271,6 +274,12 @@ static void cpu_idle_loop(void) } } +bool cpu_in_idle(unsigned long pc) +{ + return pc >= (unsigned long)__cpuidle_text_start && + pc < (unsigned long)__cpuidle_text_end; +} + void cpu_startup_entry(enum cpuhp_state state) { /* diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 2ce5458bbe1d..5405d3feb112 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -27,8 +27,8 @@ static struct task_struct * pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { put_prev_task(rq, prev); - - schedstat_inc(rq, sched_goidle); + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); return rq->idle; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d5690b722691..2516b8df6dbb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -957,9 +957,8 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc5114004..055f935d4421 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> +#include <linux/u64_stats_sync.h> #include <linux/sched/deadline.h> #include <linux/binfmts.h> #include <linux/mutex.h> @@ -15,6 +16,12 @@ #include "cpudeadline.h" #include "cpuacct.h" +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +#else +#define SCHED_WARN_ON(x) ((void)(x)) +#endif + struct rq; struct cpuidle_state; @@ -565,6 +572,8 @@ struct root_domain { */ cpumask_var_t rto_mask; struct cpupri cpupri; + + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain; @@ -597,7 +606,6 @@ struct rq { #ifdef CONFIG_SMP unsigned long last_load_update_tick; #endif /* CONFIG_SMP */ - u64 nohz_stamp; unsigned long nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL @@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq) #endif } + +#ifdef CONFIG_SCHED_SMT + +extern struct static_key_false sched_smt_present; + +extern void __update_idle_core(struct rq *rq); + +static inline void update_idle_core(struct rq *rq) +{ + if (static_branch_unlikely(&sched_smt_present)) + __update_idle_core(rq); +} + +#else +static inline void update_idle_core(struct rq *rq) { } +#endif + DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) @@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(struct sched_domain_shared *, 
sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { @@ -870,10 +895,6 @@ struct sched_group_capacity { unsigned int capacity; unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ - /* - * Number of busy cpus in this group. - */ - atomic_t nr_busy_cpus; unsigned long cpumask[0]; /* iteration mask */ }; @@ -1000,7 +1021,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. */ smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + p->cpu = cpu; +#else task_thread_info(p)->cpu = cpu; +#endif p->wake_cpu = cpu; #endif } @@ -1260,6 +1285,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) prev->sched_class->put_prev_task(rq, prev); } +static inline void set_curr_task(struct rq *rq, struct task_struct *curr) +{ + curr->sched_class->set_curr_task(rq); +} + #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1290,7 +1320,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - WARN_ON(!rcu_read_lock_held()); + SCHED_WARN_ON(!rcu_read_lock_held()); return rq->idle_state; } #else @@ -1710,52 +1740,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING +struct irqtime { + u64 hardirq_time; + u64 softirq_time; + u64 irq_start_time; + struct u64_stats_sync sync; +}; -DECLARE_PER_CPU(u64, cpu_hardirq_time); -DECLARE_PER_CPU(u64, cpu_softirq_time); - -#ifndef CONFIG_64BIT -DECLARE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} +DECLARE_PER_CPU(struct irqtime, cpu_irqtime); static inline u64 irq_time_read(int cpu) { - u64 irq_time; - unsigned seq; + struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); + unsigned int seq; + u64 total; do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); - - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ -} - -static inline void irq_time_write_end(void) -{ -} + seq = __u64_stats_fetch_begin(&irqtime->sync); + total = irqtime->softirq_time + irqtime->hardirq_time; + } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); + return total; } -#endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ @@ -1763,27 +1769,13 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); /** * cpufreq_update_util - Take a note about CPU utilization changes. - * @time: Current time. - * @util: Current utilization. - * @max: Utilization ceiling. + * @rq: Runqueue to carry out the update for. + * @flags: Update reason flags. * - * This function is called by the scheduler on every invocation of - * update_load_avg() on the CPU whose utilization is being updated. 
+ * This function is called by the scheduler on the CPU whose utilization is + * being updated. * * It can only be called from RCU-sched read-side critical sections. - */ -static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) -{ - struct update_util_data *data; - - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); - if (data) - data->func(data, time, util, max); -} - -/** - * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. - * @time: Current time. * * The way cpufreq is currently arranged requires it to evaluate the CPU * performance state (frequency/voltage) on a regular basis to prevent it from @@ -1797,13 +1789,23 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo * but that really is a band-aid. Going forward it should be replaced with * solutions targeted more specifically at RT and DL tasks. */ -static inline void cpufreq_trigger_update(u64 time) +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, rq_clock(rq), flags); +} + +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) { - cpufreq_update_util(time, ULONG_MAX, 0); + if (cpu_of(rq) == smp_processor_id()) + cpufreq_update_util(rq, flags); } #else -static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} -static inline void cpufreq_trigger_update(u64 time) {} +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 78955cbea31c..34659a853505 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) -# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) -# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) -# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) -# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) +#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +#define schedstat_val(var) (var) +#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) #else /* !CONFIG_SCHEDSTATS */ static inline void @@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} -# define schedstat_enabled() 0 -# define schedstat_inc(rq, field) do { } while (0) -# define schedstat_add(rq, field, amt) do { } while (0) -# define schedstat_set(var, val) do { } while (0) -# define schedstat_val(rq, field) 0 -#endif +#define schedstat_enabled() 0 +#define schedstat_inc(var) do { } while (0) +#define schedstat_add(var, amt) do { } while (0) +#define schedstat_set(var, val) do { } while (0) +#define schedstat_val(var) 0 +#define schedstat_val_or_zero(var) 0 +#endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f15d6b6a538a..4f7053579fe3 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +void init_wait_entry(wait_queue_t *wait, int flags) { - unsigned long flags; - - if (signal_pending_state(state, current)) - return -ERESTARTSYS; - + wait->flags = flags; wait->private = current; wait->func = autoremove_wake_function; + INIT_LIST_HEAD(&wait->task_list); +} +EXPORT_SYMBOL(init_wait_entry); + +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + long ret = 0; spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) { - if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); - else - __add_wait_queue(q, wait); + if (unlikely(signal_pending_state(state, current))) { + /* + * Exclusive waiter must not fail if it was selected by wakeup, + * it should "consume" the condition we were waiting for. + * + * The caller will recheck the condition and return success if + * we were already woken up, we can not miss the event because + * wakeup locks/unlocks the same q->lock. + * + * But we need to ensure that set-condition + wakeup after that + * can't see us, it should wake up another exclusive waiter if + * we fail. + */ + list_del_init(&wait->task_list); + ret = -ERESTARTSYS; + } else { + if (list_empty(&wait->task_list)) { + if (wait->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_tail(q, wait); + else + __add_wait_queue(q, wait); + } + set_current_state(state); } - set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); - return 0; + return ret; } EXPORT_SYMBOL(prepare_to_wait_event); @@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); -/** - * abort_exclusive_wait - abort exclusive waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * @mode: runstate of the waiter to be woken - * @key: key to identify a wait bit queue or %NULL - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - * - * Wakes up the next waiter if the caller is concurrently - * woken up through the queue. - * - * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and no one wakes up - * the next waiter. 
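A userspace approximation of the schedstat_inc()/schedstat_add()/schedstat_set() rework in the kernel/sched/stats.h hunk above: the macros now take a full lvalue, so callers write schedstat_inc(p->se.statistics.nr_wakeups) instead of schedstat_inc(p, se.statistics.nr_wakeups), and every statement compiles away to nothing when CONFIG_SCHEDSTATS is off. A plain bool stands in for the static branch behind schedstat_enabled(); the struct and values are illustrative.

#include <stdbool.h>
#include <stdio.h>

static bool schedstats_on = true;	/* stand-in for the static branch */

#define schedstat_inc(var)	do { if (schedstats_on) { (var)++; } } while (0)
#define schedstat_add(var, amt)	do { if (schedstats_on) { (var) += (amt); } } while (0)
#define schedstat_set(var, val)	do { if (schedstats_on) { (var) = (val); } } while (0)

struct wait_stats_example {
	unsigned long long wait_max;
	unsigned long long wait_sum;
	unsigned long wait_count;
};

int main(void)
{
	struct wait_stats_example ws = { 0, 0, 0 };
	unsigned long long delta = 1500;

	/* Callers pass the full lvalue, mirroring the new call sites above. */
	schedstat_inc(ws.wait_count);
	schedstat_add(ws.wait_sum, delta);
	schedstat_set(ws.wait_max, delta > ws.wait_max ? delta : ws.wait_max);

	printf("count=%lu sum=%llu max=%llu\n", ws.wait_count, ws.wait_sum, ws.wait_max);
	return 0;
}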
- */ -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, - unsigned int mode, void *key) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - spin_lock_irqsave(&q->lock, flags); - if (!list_empty(&wait->task_list)) - list_del_init(&wait->task_list); - else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(abort_exclusive_wait); - int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); @@ -425,20 +413,29 @@ int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, wait_bit_action_f *action, unsigned mode) { - do { - int ret; + int ret = 0; + for (;;) { prepare_to_wait_exclusive(wq, &q->wait, mode); - if (!test_bit(q->key.bit_nr, q->key.flags)) - continue; - ret = action(&q->key, mode); - if (!ret) - continue; - abort_exclusive_wait(wq, &q->wait, mode, &q->key); - return ret; - } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); - finish_wait(wq, &q->wait); - return 0; + if (test_bit(q->key.bit_nr, q->key.flags)) { + ret = action(&q->key, mode); + /* + * See the comment in prepare_to_wait_event(). + * finish_wait() does not necessarily takes wq->lock, + * but test_and_set_bit() implies mb() which pairs with + * smp_mb__after_atomic() before wake_up_page(). + */ + if (ret) + finish_wait(wq, &q->wait); + } + if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { + if (!ret) + finish_wait(wq, &q->wait); + return 0; + } else if (ret) { + return ret; + } + } } EXPORT_SYMBOL(__wait_on_bit_lock); diff --git a/kernel/signal.c b/kernel/signal.c index af21afc00d08..75761acc77cf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3044,6 +3044,11 @@ void kernel_sigaction(int sig, __sighandler_t action) } EXPORT_SYMBOL(kernel_sigaction); +void __weak sigaction_compat_abi(struct k_sigaction *act, + struct k_sigaction *oact) +{ +} + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *p = current, *t; @@ -3059,6 +3064,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (oact) *oact = *k; + sigaction_compat_abi(act, oact); + if (act) { sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); diff --git a/kernel/smp.c b/kernel/smp.c index 3aa642d39c03..bba3b201668d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -14,6 +14,7 @@ #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/hypervisor.h> #include "smpboot.h" @@ -724,3 +725,54 @@ void wake_up_all_idle_cpus(void) preempt_enable(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); + +/** + * smp_call_on_cpu - Call a function on a specific cpu + * + * Used to call a function on a specific cpu and wait for it to return. + * Optionally make sure the call is done on a specified physical cpu via vcpu + * pinning in order to support virtualized environments. 
+ */ +struct smp_call_on_cpu_struct { + struct work_struct work; + struct completion done; + int (*func)(void *); + void *data; + int ret; + int cpu; +}; + +static void smp_call_on_cpu_callback(struct work_struct *work) +{ + struct smp_call_on_cpu_struct *sscs; + + sscs = container_of(work, struct smp_call_on_cpu_struct, work); + if (sscs->cpu >= 0) + hypervisor_pin_vcpu(sscs->cpu); + sscs->ret = sscs->func(sscs->data); + if (sscs->cpu >= 0) + hypervisor_pin_vcpu(-1); + + complete(&sscs->done); +} + +int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) +{ + struct smp_call_on_cpu_struct sscs = { + .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), + .func = func, + .data = par, + .cpu = phys ? cpu : -1, + }; + + INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); + + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; + + queue_work_on(cpu, system_wq, &sscs.work); + wait_for_completion(&sscs.done); + + return sscs.ret; +} +EXPORT_SYMBOL_GPL(smp_call_on_cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..66762645f9e8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -78,6 +78,17 @@ static void wakeup_softirqd(void) } /* + * If ksoftirqd is scheduled, we do not want to process pending softirqs + * right now. Let ksoftirqd handle this at its own rate, to get fairness. + */ +static bool ksoftirqd_running(void) +{ + struct task_struct *tsk = __this_cpu_read(ksoftirqd); + + return tsk && (tsk->state == TASK_RUNNING); +} + +/* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving * softirq processing. @@ -313,7 +324,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending) + if (pending && !ksoftirqd_running()) do_softirq_own_stack(); local_irq_restore(flags); @@ -340,6 +351,9 @@ void irq_enter(void) static inline void invoke_softirq(void) { + if (ksoftirqd_running()) + return; + if (!force_irqthreads) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* @@ -700,7 +714,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) BUG(); } -static void takeover_tasklets(unsigned int cpu) +static int takeover_tasklets(unsigned int cpu) { /* CPU is dead, so no lock needed. 
*/ local_irq_disable(); @@ -723,27 +737,12 @@ static void takeover_tasklets(unsigned int cpu) raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); + return 0; } +#else +#define takeover_tasklets NULL #endif /* CONFIG_HOTPLUG_CPU */ -static int cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) -{ - switch (action) { -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - takeover_tasklets((unsigned long)hcpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_nfb = { - .notifier_call = cpu_callback -}; - static struct smp_hotplug_thread softirq_threads = { .store = &ksoftirqd, .thread_should_run = ksoftirqd_should_run, @@ -753,8 +752,8 @@ static struct smp_hotplug_thread softirq_threads = { static __init int spawn_ksoftirqd(void) { - register_cpu_notifier(&cpu_nfb); - + cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, + takeover_tasklets); BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); return 0; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4a1ca5f6da7e..ec9ab2f01489 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -20,7 +20,6 @@ #include <linux/kallsyms.h> #include <linux/smpboot.h> #include <linux/atomic.h> -#include <linux/lglock.h> #include <linux/nmi.h> /* @@ -47,13 +46,9 @@ struct cpu_stopper { static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static bool stop_machine_initialized = false; -/* - * Avoids a race between stop_two_cpus and global stop_cpus, where - * the stoppers could get queued up in reverse order, leading to - * system deadlock. Using an lglock means stop_two_cpus remains - * relatively cheap. - */ -DEFINE_STATIC_LGLOCK(stop_cpus_lock); +/* static data for stop_cpus */ +static DEFINE_MUTEX(stop_cpus_mutex); +static bool stop_cpus_in_progress; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { @@ -126,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) cpu_stop_init_done(&done, 1); if (!cpu_stop_queue_work(cpu, &work)) return -ENOENT; + /* + * In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup + * cycle by doing a preemption: + */ + cond_resched(); wait_for_completion(&done.completion); return done.ret; } @@ -230,14 +230,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); int err; - - lg_double_lock(&stop_cpus_lock, cpu1, cpu2); +retry: spin_lock_irq(&stopper1->lock); spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); err = -ENOENT; if (!stopper1->enabled || !stopper2->enabled) goto unlock; + /* + * Ensure that if we race with __stop_cpus() the stoppers won't get + * queued up in reverse order leading to system deadlock. + * + * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has + * queued a work on cpu1 but not on cpu2, we hold both locks. + * + * It can be falsely true but it is safe to spin until it is cleared, + * queue_stop_cpus_work() does everything under preempt_disable(). 
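A C11-atomics sketch of the ordering protocol described in the comment above and implemented in the remainder of this cpu_stop_queue_two_works() hunk: the global stop_cpus path publishes stop_cpus_in_progress before queueing any per-CPU work, and the two-CPU path backs off and retries whenever it observes the flag, so the two paths can never queue their works on a pair of CPUs in opposite order. The stopper locks and work structures are elided, and queue_stop_work_on_example() is a hypothetical stub, not part of the patch.

#include <stdatomic.h>
#include <sched.h>		/* sched_yield() stands in for cpu_relax() */

static atomic_bool stop_cpus_in_progress_example;

static void queue_stop_work_on_example(int cpu)
{
	(void)cpu;		/* stub: queue a stopper work on @cpu */
}

/* Global path: set the flag before queueing anything (the kernel does this
 * under preempt_disable()), clear it once every CPU has been queued. */
static void queue_stop_cpus_work_sketch(const int *cpus, int nr)
{
	atomic_store(&stop_cpus_in_progress_example, true);
	for (int i = 0; i < nr; i++)
		queue_stop_work_on_example(cpus[i]);
	atomic_store(&stop_cpus_in_progress_example, false);
}

/* Two-CPU path: if the global path is active, drop everything, spin until
 * the flag clears, then retry, mirroring the -EDEADLK handling below. */
static void stop_two_cpus_sketch(int cpu1, int cpu2)
{
retry:
	/* (both stopper locks are held here in the real code) */
	if (atomic_load(&stop_cpus_in_progress_example)) {
		/* (locks dropped again before spinning) */
		while (atomic_load(&stop_cpus_in_progress_example))
			sched_yield();
		goto retry;
	}
	queue_stop_work_on_example(cpu1);
	queue_stop_work_on_example(cpu2);
}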
+ */ + err = -EDEADLK; + if (unlikely(stop_cpus_in_progress)) + goto unlock; err = 0; __cpu_stop_queue_work(stopper1, work1); @@ -245,8 +257,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, unlock: spin_unlock(&stopper2->lock); spin_unlock_irq(&stopper1->lock); - lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); + if (unlikely(err == -EDEADLK)) { + while (stop_cpus_in_progress) + cpu_relax(); + goto retry; + } return err; } /** @@ -316,9 +332,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, return cpu_stop_queue_work(cpu, work_buf); } -/* static data for stop_cpus */ -static DEFINE_MUTEX(stop_cpus_mutex); - static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) @@ -332,7 +345,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ - lg_global_lock(&stop_cpus_lock); + preempt_disable(); + stop_cpus_in_progress = true; for_each_cpu(cpu, cpumask) { work = &per_cpu(cpu_stopper.stop_work, cpu); work->fn = fn; @@ -341,7 +355,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, if (cpu_stop_queue_work(cpu, work)) queued = true; } - lg_global_unlock(&stop_cpus_lock); + stop_cpus_in_progress = false; + preempt_enable(); return queued; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a13bbdaab47d..a43775c6646c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -65,6 +65,7 @@ #include <linux/sched/sysctl.h> #include <linux/kexec.h> #include <linux/bpf.h> +#include <linux/mount.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -1838,6 +1839,14 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, { } }; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6a5a310a1a53..7e4fad75acaa 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -600,9 +600,18 @@ static void __clocksource_select(bool skipcur) */ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ - pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", - cs->name); - override_name[0] = 0; + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n", + cs->name); + override_name[0] = 0; + } else { + /* + * The override cannot be currently verified. + * Deferring to let the watchdog check. + */ + pr_info("Override clocksource %s is not currently HRT compatible - deferring\n", + cs->name); + } } else /* Override clocksource can be used. 
*/ best = cs; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9ba7c820fc23..bb5ec425dfe0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -307,7 +307,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns); */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { - ktime_t res = ktime_add(lhs, rhs); + ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can @@ -703,7 +703,7 @@ static void clock_was_set_work(struct work_struct *work) static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* - * Called from timekeeping and resume code to reprogramm the hrtimer + * Called from timekeeping and resume code to reprogram the hrtimer * interrupt device on all cpus. */ void clock_was_set_delayed(void) @@ -1241,7 +1241,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, /* * Note: We clear the running state after enqueue_hrtimer and - * we do not reprogramm the event hardware. Happens either in + * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2ec7c00228f3..3bcb61b52f6c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -186,10 +186,13 @@ static bool check_tick_dependency(atomic_t *dep) return false; } -static bool can_stop_full_tick(struct tick_sched *ts) +static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { WARN_ON_ONCE(!irqs_disabled()); + if (unlikely(!cpu_online(cpu))) + return false; + if (check_tick_dependency(&tick_dep_mask)) return false; @@ -843,7 +846,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (can_stop_full_tick(ts)) + if (can_stop_full_tick(cpu, ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get()); diff --git a/kernel/time/time.c b/kernel/time/time.c index 667b9335f5d6..bd62fb8e8e77 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -780,7 +780,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, { struct timespec64 res; - set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, + set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e07fb093f819..37dec7e3db43 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -403,8 +403,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - now += clocksource_delta(tkr->read(tkr->clock), - tkr->cycle_last, tkr->mask); + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tkr->read(tkr->clock), + tkr->cycle_last, + tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); return now; diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 107310a6f36f..ca9fb800336b 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -75,5 +75,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t) int bin = min(fls(t->tv_sec), NUM_BINS-1); sleep_time_bin[bin]++; + pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec, + t->tv_nsec / NSEC_PER_MSEC); } diff --git a/kernel/torture.c b/kernel/torture.c 
index 75961b3decfe..0d887eb62856 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -43,6 +43,7 @@ #include <linux/stat.h> #include <linux/slab.h> #include <linux/trace_clock.h> +#include <linux/ktime.h> #include <asm/byteorder.h> #include <linux/torture.h> @@ -446,9 +447,8 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); * Variables for auto-shutdown. This allows "lights out" torture runs * to be fully scripted. */ -static int shutdown_secs; /* desired test duration in seconds. */ static struct task_struct *shutdown_task; -static unsigned long shutdown_time; /* jiffies to system shutdown. */ +static ktime_t shutdown_time; /* time to system shutdown. */ static void (*torture_shutdown_hook)(void); /* @@ -471,20 +471,20 @@ EXPORT_SYMBOL_GPL(torture_shutdown_absorb); */ static int torture_shutdown(void *arg) { - long delta; - unsigned long jiffies_snap; + ktime_t ktime_snap; VERBOSE_TOROUT_STRING("torture_shutdown task started"); - jiffies_snap = jiffies; - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + ktime_snap = ktime_get(); + while (ktime_before(ktime_snap, shutdown_time) && !torture_must_stop()) { - delta = shutdown_time - jiffies_snap; if (verbose) pr_alert("%s" TORTURE_FLAG - "torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = jiffies; + "torture_shutdown task: %llu ms remaining\n", + torture_type, + ktime_ms_delta(shutdown_time, ktime_snap)); + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&shutdown_time, HRTIMER_MODE_ABS); + ktime_snap = ktime_get(); } if (torture_must_stop()) { torture_kthread_stopping("torture_shutdown"); @@ -511,10 +511,9 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) { int ret = 0; - shutdown_secs = ssecs; torture_shutdown_hook = cleanup; - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; + if (ssecs > 0) { + shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); ret = torture_create_kthread(torture_shutdown, NULL, shutdown_task); } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f4b86e8ca1e7..2a96b063d659 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.txt -config HAVE_FUNCTION_GRAPH_FP_TEST - bool - help - See Documentation/trace/ftrace-design.txt - config HAVE_DYNAMIC_FTRACE bool help @@ -221,6 +216,41 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. +config HWLAT_TRACER + bool "Tracer to detect hardware latencies (like SMIs)" + select GENERIC_TRACER + help + This tracer, when enabled will create one or more kernel threads, + depening on what the cpumask file is set to, which each thread + spinning in a loop looking for interruptions caused by + something other than the kernel. For example, if a + System Management Interrupt (SMI) takes a noticeable amount of + time, this tracer will detect it. This is useful for testing + if a system is reliable for Real Time tasks. + + Some files are created in the tracing directory when this + is enabled: + + hwlat_detector/width - time in usecs for how long to spin for + hwlat_detector/window - time in usecs between the start of each + iteration + + A kernel thread is created that will spin with interrupts disabled + for "width" microseconds in every "widow" cycle. 
It will not spin + for "window - width" microseconds, where the system can + continue to operate. + + The output will appear in the trace and trace_pipe files. + + When the tracer is not running, it has no affect on the system, + but when it is running, it can cause the system to be + periodically non responsive. Do not run this tracer on a + production system. + + To enable this tracer, echo in "hwlat" into the current_tracer + file. Every time a latency is greater than tracing_thresh, it will + be recorded into the ring buffer. + config ENABLE_DEFAULT_TRACERS bool "Trace process context switches and events" depends on !GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d0a1617b52b4..992ab9d99f35 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b20438fdb029..5dcb99281259 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -8,6 +9,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/bpf.h> +#include <linux/bpf_perf_event.h> #include <linux/filter.h> #include <linux/uaccess.h> #include <linux/ctype.h> @@ -59,11 +61,9 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { - void *dst = (void *) (long) r1; - int ret, size = (int) r2; - void *unsafe_ptr = (void *) (long) r3; + int ret; ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) @@ -81,12 +81,9 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, + u32, size) { - void *unsafe_ptr = (void *) (long) r1; - void *src = (void *) (long) r2; - int size = (int) r3; - /* * Ensure we're in user context which is safe for the helper to * run. This helper has no business in a kthread. 
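/*
 * Illustrative user-space sketch of driving the knobs described in the
 * HWLAT_TRACER help text above; not part of the patch.  It sets the active
 * sampling width and the window, then selects the "hwlat" tracer.  The
 * tracefs mount point is an assumption (commonly /sys/kernel/debug/tracing
 * or /sys/kernel/tracing); the values mirror the documented defaults.
 */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	const char *tr = "/sys/kernel/debug/tracing";	/* assumed mount point */
	char path[256];

	snprintf(path, sizeof(path), "%s/hwlat_detector/width", tr);
	write_str(path, "500000");		/* spin for 0.5s ... */
	snprintf(path, sizeof(path), "%s/hwlat_detector/window", tr);
	write_str(path, "1000000");		/* ... out of every 1s window */
	snprintf(path, sizeof(path), "%s/tracing_thresh", tr);
	write_str(path, "10");			/* report samples above this threshold */
	snprintf(path, sizeof(path), "%s/current_tracer", tr);
	return write_str(path, "hwlat") ? 1 : 0;
}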
@@ -128,9 +125,9 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed */ -static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { - char *fmt = (char *) (long) r1; bool str_seen = false; int mod[3] = {}; int fmt_cnt = 0; @@ -176,16 +173,16 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) switch (fmt_cnt) { case 1: - unsafe_addr = r3; - r3 = (long) buf; + unsafe_addr = arg1; + arg1 = (long) buf; break; case 2: - unsafe_addr = r4; - r4 = (long) buf; + unsafe_addr = arg2; + arg2 = (long) buf; break; case 3: - unsafe_addr = r5; - r5 = (long) buf; + unsafe_addr = arg3; + arg3 = (long) buf; break; } buf[0] = 0; @@ -207,9 +204,9 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) } return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, - mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, - mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); + mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, + mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, + mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -231,9 +228,8 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) { - struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; @@ -310,11 +306,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return 0; } -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { - struct pt_regs *regs = (struct pt_regs *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; - void *data = (void *)(long) r4; struct perf_raw_record raw = { .frag = { .size = size, @@ -365,7 +359,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, return __bpf_perf_event_output(regs, map, flags, &raw); } -static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_0(bpf_get_current_task) { return (long) current; } @@ -376,6 +370,31 @@ static const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + + if (unlikely(in_interrupt())) + return -EINVAL; + if (unlikely(idx >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[idx]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return task_under_cgroup_hierarchy(current, cgrp); +} + +static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { + .func = bpf_current_task_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto 
*tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -407,6 +426,10 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_perf_event_read_proto; case BPF_FUNC_probe_write_user: return bpf_get_probe_write_proto(); + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; default: return NULL; } @@ -447,16 +470,17 @@ static struct bpf_prog_type_list kprobe_tl = { .type = BPF_PROG_TYPE_KPROBE, }; -static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags, void *, data, u64, size) { + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + /* * r1 points to perf tracepoint buffer where first 8 bytes are hidden * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it - * from there and call the same bpf_perf_event_output() helper + * from there and call the same bpf_perf_event_output() helper inline. */ - u64 ctx = *(long *)(uintptr_t)r1; - - return bpf_perf_event_output(ctx, r2, index, r4, size); + return ____bpf_perf_event_output(regs, map, flags, data, size); } static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { @@ -470,11 +494,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg5_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags) { - u64 ctx = *(long *)(uintptr_t)r1; + struct pt_regs *regs = *(struct pt_regs **)tp_buff; - return bpf_get_stackid(ctx, r2, r3, r4, r5); + /* + * Same comment as in bpf_perf_event_output_tp(), only that this time + * the other helper's function body cannot be inlined due to being + * external, thus we need to call raw helper function. 
+ */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); } static const struct bpf_func_proto bpf_get_stackid_proto_tp = { @@ -520,10 +551,69 @@ static struct bpf_prog_type_list tracepoint_tl = { .type = BPF_PROG_TYPE_TRACEPOINT, }; +static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + if (off == offsetof(struct bpf_perf_event_data, sample_period)) { + if (size != sizeof(u64)) + return false; + } else { + if (size != sizeof(long)) + return false; + } + return true; +} + +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_perf_event_data, sample_period): + BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, + offsetof(struct perf_sample_data, period)); + break; + default: + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + regs), dst_reg, src_reg, + offsetof(struct bpf_perf_event_data_kern, regs)); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); + break; + } + + return insn - insn_buf; +} + +static const struct bpf_verifier_ops perf_event_prog_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = pe_prog_is_valid_access, + .convert_ctx_access = pe_prog_convert_ctx_access, +}; + +static struct bpf_prog_type_list perf_event_tl = { + .ops = &perf_event_prog_ops, + .type = BPF_PROG_TYPE_PERF_EVENT, +}; + static int __init register_kprobe_prog_ops(void) { bpf_register_prog_type(&kprobe_tl); bpf_register_prog_type(&tracepoint_tl); + bpf_register_prog_type(&perf_event_tl); return 0; } late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84752c8e28b5..2050a7652a86 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -872,7 +872,13 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { + int index = trace->depth; + function_profile_call(trace->func, 0, NULL, NULL); + + if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) + current->ret_stack[index].subtime = 0; + return 1; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dade4c9559cc..8696ce6bf2f6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1047,7 +1047,7 @@ void disable_trace_on_warning(void) * * Shows real state of the ring buffer if it is enabled or not. 
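/*
 * Simplified, fixed-arity illustration of the BPF_CALL_n() conversion used
 * throughout the hunks above; the real macros in <linux/filter.h> are
 * variadic and this sketch is not them.  The idea: the helper body becomes a
 * typed function ____name(), while name() keeps the five-u64 eBPF calling
 * convention and only casts.  That is what lets bpf_perf_event_output_tp()
 * call ____bpf_perf_event_output() with real types instead of re-packing
 * u64s.  The SKETCH_ prefix marks everything here as hypothetical.
 */
typedef unsigned long long u64;

#define SKETCH_BPF_CALL_2(name, t1, a1, t2, a2)				\
	static u64 ____##name(t1 a1, t2 a2);				\
	u64 name(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)		\
	{								\
		(void)r3; (void)r4; (void)r5;				\
		return ____##name((t1)(unsigned long)r1,		\
				  (t2)(unsigned long)r2);		\
	}								\
	static u64 ____##name(t1 a1, t2 a2)

/* Example expansion for a made-up helper: */
SKETCH_BPF_CALL_2(sketch_helper, void *, ptr, unsigned int, len)
{
	/* typed body; other code may call ____sketch_helper() directly */
	return ptr ? len : 0;
}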
*/ -static int tracer_tracing_is_on(struct trace_array *tr) +int tracer_tracing_is_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) return ring_buffer_record_is_on(tr->trace_buffer.buffer); @@ -4123,6 +4123,30 @@ static const char readme_msg[] = "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ +#ifdef CONFIG_KPROBE_EVENT + " kprobe_events\t\t- Add/remove/show the kernel dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#ifdef CONFIG_UPROBE_EVENT + " uprobe_events\t\t- Add/remove/show the userspace dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT) + "\t accepts: event-definitions (one definition per line)\n" + "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n" + "\t -:[<group>/]<event>\n" +#ifdef CONFIG_KPROBE_EVENT + "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" +#endif +#ifdef CONFIG_UPROBE_EVENT + "\t place: <path>:<offset>\n" +#endif + "\t args: <name>=fetcharg[:type]\n" + "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" + "\t $stack<index>, $stack, $retval, $comm\n" + "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string,\n" + "\t b<bit-width>@<bit-offset>/<container-size>\n" +#endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" " events/<system>/\t- Directory containing all trace events for <system>:\n" @@ -4945,7 +4969,7 @@ out: return ret; } -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, @@ -5124,19 +5148,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, struct trace_iterator *iter = filp->private_data; ssize_t sret; - /* return any leftover data */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (sret != -EBUSY) - return sret; - - trace_seq_init(&iter->seq); - /* * Avoid more than one consumer on a single file descriptor * This is just a matter of traces coherency, the ring buffer itself * is protected. */ mutex_lock(&iter->mutex); + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + goto out; + + trace_seq_init(&iter->seq); + if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) @@ -5867,7 +5892,7 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -6163,9 +6188,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -EBUSY; #endif - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - if (*ppos & (PAGE_SIZE - 1)) return -EINVAL; @@ -6175,6 +6197,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + again: trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); @@ -6232,19 +6257,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? 
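/*
 * Illustrative sketch of defining a dynamic event with the kprobe_events
 * syntax documented in the readme text above; not part of the patch.  The
 * probed symbol (do_sys_open), the group/event names and the tracefs path
 * are assumptions for the example; ":x64" is one of the hexadecimal fetch
 * types added by this series.
 */
#include <stdio.h>

int main(void)
{
	const char *tr = "/sys/kernel/debug/tracing";	/* assumed mount point */
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/kprobe_events", tr);
	f = fopen(path, "a");
	if (!f)
		return 1;
	/* 'r' = return probe; record the return value in hex */
	fprintf(f, "r:myprobes/do_sys_open_exit do_sys_open ret=$retval:x64\n");
	fclose(f);

	snprintf(path, sizeof(path),
		 "%s/events/myprobes/do_sys_open_exit/enable", tr);
	f = fopen(path, "w");
	if (!f)
		return 1;
	fprintf(f, "1\n");
	return fclose(f) ? 1 : 0;
}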
*/ if (!spd.nr_pages) { if (ret) - return ret; + goto out; + ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) - return -EAGAIN; + goto out; ret = wait_on_pipe(iter, true); if (ret) - return ret; + goto out; goto again; } ret = splice_to_pipe(pipe, &spd); +out: splice_shrink_spd(&spd); return ret; @@ -7195,7 +7222,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); -#ifdef CONFIG_TRACER_MAX_TRACE +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) trace_create_file("tracing_max_latency", 0644, d_tracer, &tr->max_latency, &tracing_max_lat_fops); #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f783df416726..fd24b1f9ac43 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -38,6 +38,7 @@ enum trace_type { TRACE_USER_STACK, TRACE_BLK, TRACE_BPUTS, + TRACE_HWLAT, __TRACE_LAST_TYPE, }; @@ -213,6 +214,8 @@ struct trace_array { */ struct trace_buffer max_buffer; bool allocated_snapshot; +#endif +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) unsigned long max_latency; #endif struct trace_pid_list __rcu *filtered_pids; @@ -326,6 +329,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ + IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ @@ -571,6 +575,7 @@ void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); +int tracer_tracing_is_on(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 5c30efcda5e6..d1cc37e78f99 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -322,3 +322,30 @@ FTRACE_ENTRY(branch, trace_branch, FILTER_OTHER ); + +FTRACE_ENTRY(hwlat, hwlat_entry, + + TRACE_HWLAT, + + F_STRUCT( + __field( u64, duration ) + __field( u64, outer_duration ) + __field( u64, nmi_total_ts ) + __field_struct( struct timespec, timestamp ) + __field_desc( long, timestamp, tv_sec ) + __field_desc( long, timestamp, tv_nsec ) + __field( unsigned int, nmi_count ) + __field( unsigned int, seqnum ) + ), + + F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n", + __entry->seqnum, + __entry->tv_sec, + __entry->tv_nsec, + __entry->duration, + __entry->outer_duration, + __entry->nmi_total_ts, + __entry->nmi_count), + + FILTER_OTHER +); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a975571cde24..6721a1e89f39 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1028,6 +1028,7 @@ static struct event_command trigger_traceon_cmd = { static struct event_command trigger_traceoff_cmd = { .name = "traceoff", .trigger_type = ETT_TRACE_ONOFF, + .flags = EVENT_CMD_FL_POST_TRIGGER, .func = event_trigger_callback, .reg = register_trigger, .unreg = unregister_trigger, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7363ccf79512..4e480e870474 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -119,7 +119,7 
@@ print_graph_duration(struct trace_array *tr, unsigned long long duration, /* Add a function return address to the trace stack on thread info.*/ int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer) + unsigned long frame_pointer, unsigned long *retp) { unsigned long long calltime; int index; @@ -170,8 +170,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; - current->ret_stack[index].subtime = 0; +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST current->ret_stack[index].fp = frame_pointer; +#endif +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + current->ret_stack[index].retp = retp; +#endif *depth = current->curr_ret_stack; return 0; @@ -204,7 +208,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, return; } -#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST /* * The arch may choose to record the frame pointer used * and check it here to make sure that it is what we expect it @@ -279,6 +283,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/** + * ftrace_graph_ret_addr - convert a potentially modified stack return address + * to its original value + * + * This function can be called by stack unwinding code to convert a found stack + * return address ('ret') to its original value, in case the function graph + * tracer has modified it to be 'return_to_handler'. If the address hasn't + * been modified, the unchanged value of 'ret' is returned. + * + * 'idx' is a state variable which should be initialized by the caller to zero + * before the first call. + * + * 'retp' is a pointer to the return address on the stack. It's ignored if + * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. 
+ */ +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int index = task->curr_ret_stack; + int i; + + if (ret != (unsigned long)return_to_handler) + return ret; + + if (index < -1) + index += FTRACE_NOTRACE_DEPTH; + + if (index < 0) + return ret; + + for (i = 0; i <= index; i++) + if (task->ret_stack[i].retp == retp) + return task->ret_stack[i].ret; + + return ret; +} +#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int task_idx; + + if (ret != (unsigned long)return_to_handler) + return ret; + + task_idx = task->curr_ret_stack; + + if (!task->ret_stack || task_idx < *idx) + return ret; + + task_idx -= *idx; + (*idx)++; + + return task->ret_stack[task_idx].ret; +} +#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ + int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, @@ -1120,6 +1182,11 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, trace_seq_puts(s, "/* "); switch (iter->ent->type) { + case TRACE_BPUTS: + ret = trace_print_bputs_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; case TRACE_BPRINT: ret = trace_print_bprintk_msg_only(iter); if (ret != TRACE_TYPE_HANDLED) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c new file mode 100644 index 000000000000..b97286c48735 --- /dev/null +++ b/kernel/trace/trace_hwlat.c @@ -0,0 +1,633 @@ +/* + * trace_hwlatdetect.c - A simple Hardware Latency detector. + * + * Use this tracer to detect large system latencies induced by the behavior of + * certain underlying system hardware or firmware, independent of Linux itself. + * The code was developed originally to detect the presence of SMIs on Intel + * and AMD systems, although there is no dependency upon x86 herein. + * + * The classical example usage of this tracer is in detecting the presence of + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a + * somewhat special form of hardware interrupt spawned from earlier CPU debug + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge + * LPC (or other device) to generate a special interrupt under certain + * circumstances, for example, upon expiration of a special SMI timer device, + * due to certain external thermal readings, on certain I/O address accesses, + * and other situations. An SMI hits a special CPU pin, triggers a special + * SMI mode (complete with special memory map), and the OS is unaware. + * + * Although certain hardware-inducing latencies are necessary (for example, + * a modern system often requires an SMI handler for correct thermal control + * and remote management) they can wreak havoc upon any OS-level performance + * guarantees toward low-latency, especially when the OS is not even made + * aware of the presence of these interrupts. For this reason, we need a + * somewhat brute force mechanism to detect these interrupts. In this case, + * we do it by hogging all of the CPU(s) for configurable timer intervals, + * sampling the built-in CPU timer, looking for discontiguous readings. + * + * WARNING: This implementation necessarily introduces latencies. Therefore, + * you should NEVER use this tracer while running in a production + * environment requiring any kind of low-latency performance + * guarantee(s). 
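/*
 * Minimal sketch of the unwinder usage described in the
 * ftrace_graph_ret_addr() comment above; not part of the patch.  One index
 * variable is kept across the whole stack walk and every candidate return
 * address is filtered through the helper, so addresses rewritten to
 * 'return_to_handler' are reported as the original caller.  The two-word
 * frame layout (saved fp, then return address) is an assumption made purely
 * for illustration, not any particular architecture's ABI.
 */
struct task_struct;			/* opaque here */
extern unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
					   unsigned long ret,
					   unsigned long *retp);

static void sketch_unwind(struct task_struct *task, unsigned long *fp)
{
	int graph_idx = 0;		/* state for ftrace_graph_ret_addr() */

	while (fp) {
		unsigned long ret = fp[1];	/* assumed return-address slot */

		ret = ftrace_graph_ret_addr(task, &graph_idx, ret, &fp[1]);
		/* ... record 'ret' in the backtrace ... */
		fp = (unsigned long *)fp[0];	/* assumed saved frame pointer */
	}
}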
+ * + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com> + * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com> + * + * Includes useful feedback from Clark Williams <clark@redhat.com> + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ +#include <linux/kthread.h> +#include <linux/tracefs.h> +#include <linux/uaccess.h> +#include <linux/cpumask.h> +#include <linux/delay.h> +#include "trace.h" + +static struct trace_array *hwlat_trace; + +#define U64STR_SIZE 22 /* 20 digits max */ + +#define BANNER "hwlat_detector: " +#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ +#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ +#define DEFAULT_LAT_THRESHOLD 10 /* 10us */ + +/* sampling thread*/ +static struct task_struct *hwlat_kthread; + +static struct dentry *hwlat_sample_width; /* sample width us */ +static struct dentry *hwlat_sample_window; /* sample window us */ + +/* Save the previous tracing_thresh value */ +static unsigned long save_tracing_thresh; + +/* NMI timestamp counters */ +static u64 nmi_ts_start; +static u64 nmi_total_ts; +static int nmi_count; +static int nmi_cpu; + +/* Tells NMIs to call back to the hwlat tracer to record timestamps */ +bool trace_hwlat_callback_enabled; + +/* If the user changed threshold, remember it */ +static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; + +/* Individual latency samples are stored here when detected. */ +struct hwlat_sample { + u64 seqnum; /* unique sequence */ + u64 duration; /* delta */ + u64 outer_duration; /* delta (outer loop) */ + u64 nmi_total_ts; /* Total time spent in NMIs */ + struct timespec timestamp; /* wall time */ + int nmi_count; /* # NMIs during this sample */ +}; + +/* keep the global state somewhere. 
*/ +static struct hwlat_data { + + struct mutex lock; /* protect changes */ + + u64 count; /* total since reset */ + + u64 sample_window; /* total sampling window (on+off) */ + u64 sample_width; /* active sampling portion of window */ + +} hwlat_data = { + .sample_window = DEFAULT_SAMPLE_WINDOW, + .sample_width = DEFAULT_SAMPLE_WIDTH, +}; + +static void trace_hwlat_sample(struct hwlat_sample *sample) +{ + struct trace_array *tr = hwlat_trace; + struct trace_event_call *call = &event_hwlat; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer_event *event; + struct hwlat_entry *entry; + unsigned long flags; + int pc; + + pc = preempt_count(); + local_save_flags(flags); + + event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), + flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->seqnum = sample->seqnum; + entry->duration = sample->duration; + entry->outer_duration = sample->outer_duration; + entry->timestamp = sample->timestamp; + entry->nmi_total_ts = sample->nmi_total_ts; + entry->nmi_count = sample->nmi_count; + + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); +} + +/* Macros to encapsulate the time capturing infrastructure */ +#define time_type u64 +#define time_get() trace_clock_local() +#define time_to_us(x) div_u64(x, 1000) +#define time_sub(a, b) ((a) - (b)) +#define init_time(a, b) (a = b) +#define time_u64(a) a + +void trace_hwlat_callback(bool enter) +{ + if (smp_processor_id() != nmi_cpu) + return; + + /* + * Currently trace_clock_local() calls sched_clock() and the + * generic version is not NMI safe. + */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { + if (enter) + nmi_ts_start = time_get(); + else + nmi_total_ts = time_get() - nmi_ts_start; + } + + if (enter) + nmi_count++; +} + +/** + * get_sample - sample the CPU TSC and look for likely hardware latencies + * + * Used to repeatedly capture the CPU TSC (or similar), looking for potential + * hardware-induced latency. Called with interrupts disabled and with + * hwlat_data.lock held. 
+ */ +static int get_sample(void) +{ + struct trace_array *tr = hwlat_trace; + time_type start, t1, t2, last_t2; + s64 diff, total, last_total = 0; + u64 sample = 0; + u64 thresh = tracing_thresh; + u64 outer_sample = 0; + int ret = -1; + + do_div(thresh, NSEC_PER_USEC); /* modifies interval value */ + + nmi_cpu = smp_processor_id(); + nmi_total_ts = 0; + nmi_count = 0; + /* Make sure NMIs see this first */ + barrier(); + + trace_hwlat_callback_enabled = true; + + init_time(last_t2, 0); + start = time_get(); /* start timestamp */ + + do { + + t1 = time_get(); /* we'll look for a discontinuity */ + t2 = time_get(); + + if (time_u64(last_t2)) { + /* Check the delta from outer loop (t2 to next t1) */ + diff = time_to_us(time_sub(t1, last_t2)); + /* This shouldn't happen */ + if (diff < 0) { + pr_err(BANNER "time running backwards\n"); + goto out; + } + if (diff > outer_sample) + outer_sample = diff; + } + last_t2 = t2; + + total = time_to_us(time_sub(t2, start)); /* sample width */ + + /* Check for possible overflows */ + if (total < last_total) { + pr_err("Time total overflowed\n"); + break; + } + last_total = total; + + /* This checks the inner loop (t1 to t2) */ + diff = time_to_us(time_sub(t2, t1)); /* current diff */ + + /* This shouldn't happen */ + if (diff < 0) { + pr_err(BANNER "time running backwards\n"); + goto out; + } + + if (diff > sample) + sample = diff; /* only want highest value */ + + } while (total <= hwlat_data.sample_width); + + barrier(); /* finish the above in the view for NMIs */ + trace_hwlat_callback_enabled = false; + barrier(); /* Make sure nmi_total_ts is no longer updated */ + + ret = 0; + + /* If we exceed the threshold value, we have found a hardware latency */ + if (sample > thresh || outer_sample > thresh) { + struct hwlat_sample s; + + ret = 1; + + /* We read in microseconds */ + if (nmi_total_ts) + do_div(nmi_total_ts, NSEC_PER_USEC); + + hwlat_data.count++; + s.seqnum = hwlat_data.count; + s.duration = sample; + s.outer_duration = outer_sample; + s.timestamp = CURRENT_TIME; + s.nmi_total_ts = nmi_total_ts; + s.nmi_count = nmi_count; + trace_hwlat_sample(&s); + + /* Keep a running maximum ever recorded hardware latency */ + if (sample > tr->max_latency) + tr->max_latency = sample; + } + +out: + return ret; +} + +static struct cpumask save_cpumask; +static bool disable_migrate; + +static void move_to_next_cpu(void) +{ + static struct cpumask *current_mask; + int next_cpu; + + if (disable_migrate) + return; + + /* Just pick the first CPU on first iteration */ + if (!current_mask) { + current_mask = &save_cpumask; + get_online_cpus(); + cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + put_online_cpus(); + next_cpu = cpumask_first(current_mask); + goto set_affinity; + } + + /* + * If for some reason the user modifies the CPU affinity + * of this thread, than stop migrating for the duration + * of the current test. + */ + if (!cpumask_equal(current_mask, ¤t->cpus_allowed)) + goto disable; + + get_online_cpus(); + cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask); + next_cpu = cpumask_next(smp_processor_id(), current_mask); + put_online_cpus(); + + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(current_mask); + + set_affinity: + if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! 
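/*
 * User-space model of the get_sample() loop above, not part of the patch,
 * using clock_gettime() in place of trace_clock_local(): two back-to-back
 * reads give the "inner" gap (t1..t2), and the distance from the previous
 * iteration's t2 to this iteration's t1 gives the "outer" gap.  In the
 * kernel thread, which runs with interrupts disabled, any gap far larger
 * than the cost of the clock read is time stolen from the CPU (an SMI on
 * bare metal); in user space, preemption and interrupts show up as well.
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t now_us(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000ull + ts.tv_nsec / 1000;
}

int main(void)
{
	uint64_t start = now_us(), last_t2 = 0;
	uint64_t inner_max = 0, outer_max = 0;
	const uint64_t width_us = 500000;	/* mirror the 0.5s default width */

	do {
		uint64_t t1 = now_us();
		uint64_t t2 = now_us();

		if (last_t2 && t1 - last_t2 > outer_max)
			outer_max = t1 - last_t2;	/* outer-loop discontinuity */
		if (t2 - t1 > inner_max)
			inner_max = t2 - t1;		/* inner-loop discontinuity */
		last_t2 = t2;
	} while (now_us() - start <= width_us);

	printf("inner/outer max gap: %llu/%llu us\n",
	       (unsigned long long)inner_max, (unsigned long long)outer_max);
	return 0;
}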
*/ + goto disable; + + cpumask_clear(current_mask); + cpumask_set_cpu(next_cpu, current_mask); + + sched_setaffinity(0, current_mask); + return; + + disable: + disable_migrate = true; +} + +/* + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread + * + * Used to periodically sample the CPU TSC via a call to get_sample. We + * disable interrupts, which does (intentionally) introduce latency since we + * need to ensure nothing else might be running (and thus preempting). + * Obviously this should never be used in production environments. + * + * Currently this runs on which ever CPU it was scheduled on, but most + * real-world hardware latency situations occur across several CPUs, + * but we might later generalize this if we find there are any actualy + * systems with alternate SMI delivery or other hardware latencies. + */ +static int kthread_fn(void *data) +{ + u64 interval; + + while (!kthread_should_stop()) { + + move_to_next_cpu(); + + local_irq_disable(); + get_sample(); + local_irq_enable(); + + mutex_lock(&hwlat_data.lock); + interval = hwlat_data.sample_window - hwlat_data.sample_width; + mutex_unlock(&hwlat_data.lock); + + do_div(interval, USEC_PER_MSEC); /* modifies interval value */ + + /* Always sleep for at least 1ms */ + if (interval < 1) + interval = 1; + + if (msleep_interruptible(interval)) + break; + } + + return 0; +} + +/** + * start_kthread - Kick off the hardware latency sampling/detector kthread + * + * This starts the kernel thread that will sit and sample the CPU timestamp + * counter (TSC or similar) and look for potential hardware latencies. + */ +static int start_kthread(struct trace_array *tr) +{ + struct task_struct *kthread; + + kthread = kthread_create(kthread_fn, NULL, "hwlatd"); + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + return -ENOMEM; + } + hwlat_kthread = kthread; + wake_up_process(kthread); + + return 0; +} + +/** + * stop_kthread - Inform the hardware latency samping/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel thread and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static void stop_kthread(void) +{ + if (!hwlat_kthread) + return; + kthread_stop(hwlat_kthread); + hwlat_kthread = NULL; +} + +/* + * hwlat_read - Wrapper read function for reading both window and width + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function provides a generic read implementation for the global state + * "hwlat_data" structure filesystem entries. + */ +static ssize_t hwlat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[U64STR_SIZE]; + u64 *entry = filp->private_data; + u64 val; + int len; + + if (!entry) + return -EFAULT; + + if (cnt > sizeof(buf)) + cnt = sizeof(buf); + + val = *entry; + + len = snprintf(buf, sizeof(buf), "%llu\n", val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); +} + +/** + * hwlat_width_write - Write function for "width" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "width" interface + * to the hardware latency detector. 
It can be used to configure + * for how many us of the total window us we will actively sample for any + * hardware-induced latency periods. Obviously, it is not possible to + * sample constantly and have the system respond to a sample reader, or, + * worse, without having the system appear to have gone out to lunch. It + * is enforced that width is less that the total window size. + */ +static ssize_t +hwlat_width_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + u64 val; + int err; + + err = kstrtoull_from_user(ubuf, cnt, 10, &val); + if (err) + return err; + + mutex_lock(&hwlat_data.lock); + if (val < hwlat_data.sample_window) + hwlat_data.sample_width = val; + else + err = -EINVAL; + mutex_unlock(&hwlat_data.lock); + + if (err) + return err; + + return cnt; +} + +/** + * hwlat_window_write - Write function for "window" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "window" interface + * to the hardware latency detetector. The window is the total time + * in us that will be considered one sample period. Conceptually, windows + * occur back-to-back and contain a sample width period during which + * actual sampling occurs. Can be used to write a new total window size. It + * is enfoced that any value written must be greater than the sample width + * size, or an error results. + */ +static ssize_t +hwlat_window_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + u64 val; + int err; + + err = kstrtoull_from_user(ubuf, cnt, 10, &val); + if (err) + return err; + + mutex_lock(&hwlat_data.lock); + if (hwlat_data.sample_width < val) + hwlat_data.sample_window = val; + else + err = -EINVAL; + mutex_unlock(&hwlat_data.lock); + + if (err) + return err; + + return cnt; +} + +static const struct file_operations width_fops = { + .open = tracing_open_generic, + .read = hwlat_read, + .write = hwlat_width_write, +}; + +static const struct file_operations window_fops = { + .open = tracing_open_generic, + .read = hwlat_read, + .write = hwlat_window_write, +}; + +/** + * init_tracefs - A function to initialize the tracefs interface files + * + * This function creates entries in tracefs for "hwlat_detector". + * It creates the hwlat_detector directory in the tracing directory, + * and within that directory is the count, width and window files to + * change and view those values. 
+ */ +static int init_tracefs(void) +{ + struct dentry *d_tracer; + struct dentry *top_dir; + + d_tracer = tracing_init_dentry(); + if (IS_ERR(d_tracer)) + return -ENOMEM; + + top_dir = tracefs_create_dir("hwlat_detector", d_tracer); + if (!top_dir) + return -ENOMEM; + + hwlat_sample_window = tracefs_create_file("window", 0640, + top_dir, + &hwlat_data.sample_window, + &window_fops); + if (!hwlat_sample_window) + goto err; + + hwlat_sample_width = tracefs_create_file("width", 0644, + top_dir, + &hwlat_data.sample_width, + &width_fops); + if (!hwlat_sample_width) + goto err; + + return 0; + + err: + tracefs_remove_recursive(top_dir); + return -ENOMEM; +} + +static void hwlat_tracer_start(struct trace_array *tr) +{ + int err; + + err = start_kthread(tr); + if (err) + pr_err(BANNER "Cannot start hwlat kthread\n"); +} + +static void hwlat_tracer_stop(struct trace_array *tr) +{ + stop_kthread(); +} + +static bool hwlat_busy; + +static int hwlat_tracer_init(struct trace_array *tr) +{ + /* Only allow one instance to enable this */ + if (hwlat_busy) + return -EBUSY; + + hwlat_trace = tr; + + disable_migrate = false; + hwlat_data.count = 0; + tr->max_latency = 0; + save_tracing_thresh = tracing_thresh; + + /* tracing_thresh is in nsecs, we speak in usecs */ + if (!tracing_thresh) + tracing_thresh = last_tracing_thresh; + + if (tracer_tracing_is_on(tr)) + hwlat_tracer_start(tr); + + hwlat_busy = true; + + return 0; +} + +static void hwlat_tracer_reset(struct trace_array *tr) +{ + stop_kthread(); + + /* the tracing threshold is static between runs */ + last_tracing_thresh = tracing_thresh; + + tracing_thresh = save_tracing_thresh; + hwlat_busy = false; +} + +static struct tracer hwlat_tracer __read_mostly = +{ + .name = "hwlat", + .init = hwlat_tracer_init, + .reset = hwlat_tracer_reset, + .start = hwlat_tracer_start, + .stop = hwlat_tracer_stop, + .allow_instances = true, +}; + +__init static int init_hwlat_tracer(void) +{ + int ret; + + mutex_init(&hwlat_data.lock); + + ret = register_tracer(&hwlat_tracer); + if (ret) + return ret; + + init_tracefs(); + + return 0; +} +late_initcall(init_hwlat_tracer); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9aedb0b06683..eb6c9f1d3a93 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -253,6 +253,10 @@ static const struct fetch_type kprobes_fetch_type_table[] = { ASSIGN_FETCH_TYPE(s16, u16, 1), ASSIGN_FETCH_TYPE(s32, u32, 1), ASSIGN_FETCH_TYPE(s64, u64, 1), + ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), + ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), + ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), ASSIGN_FETCH_TYPE_END }; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0bb9cf2d53e6..3fc20422c166 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1098,6 +1098,71 @@ static struct trace_event trace_user_stack_event = { .funcs = &trace_user_stack_funcs, }; +/* TRACE_HWLAT */ +static enum print_line_t +trace_hwlat_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct hwlat_entry *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", + field->seqnum, + field->duration, + field->outer_duration, + field->timestamp.tv_sec, + field->timestamp.tv_nsec); + + if (field->nmi_count) { + /* + * The generic sched_clock() is not NMI safe, thus + * we only 
record the count and not the time. + */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) + trace_seq_printf(s, " nmi-total:%llu", + field->nmi_total_ts); + trace_seq_printf(s, " nmi-count:%u", + field->nmi_count); + } + + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + + +static enum print_line_t +trace_hwlat_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct hwlat_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", + field->duration, + field->outer_duration, + field->timestamp.tv_sec, + field->timestamp.tv_nsec, + field->seqnum); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_hwlat_funcs = { + .trace = trace_hwlat_print, + .raw = trace_hwlat_raw, +}; + +static struct trace_event trace_hwlat_event = { + .type = TRACE_HWLAT, + .funcs = &trace_hwlat_funcs, +}; + /* TRACE_BPUTS */ static enum print_line_t trace_bputs_print(struct trace_iterator *iter, int flags, @@ -1233,6 +1298,7 @@ static struct trace_event *events[] __initdata = { &trace_bputs_event, &trace_bprint_event, &trace_print_event, + &trace_hwlat_event, NULL }; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 74e80a582c28..8c0553d9afd3 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -36,24 +36,28 @@ const char *reserved_field_names[] = { }; /* Printing in basic type function template */ -#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ -int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \ +int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, const char *name, \ void *data, void *ent) \ { \ trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ return !trace_seq_has_overflowed(s); \ } \ -const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ -NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); - -DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") -DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") -DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") +const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; \ +NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(tname)); + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu") +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, s8, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld") +DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") /* Print type function for string type */ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 45400ca5ded1..0c0ae54d44c6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -149,6 +149,11 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(s8); DECLARE_BASIC_PRINT_TYPE_FUNC(s16); DECLARE_BASIC_PRINT_TYPE_FUNC(s32); DECLARE_BASIC_PRINT_TYPE_FUNC(s64); +DECLARE_BASIC_PRINT_TYPE_FUNC(x8); +DECLARE_BASIC_PRINT_TYPE_FUNC(x16); +DECLARE_BASIC_PRINT_TYPE_FUNC(x32); 
+DECLARE_BASIC_PRINT_TYPE_FUNC(x64); + DECLARE_BASIC_PRINT_TYPE_FUNC(string); #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type @@ -203,7 +208,7 @@ DEFINE_FETCH_##method(u32) \ DEFINE_FETCH_##method(u64) /* Default (unsigned long) fetch type */ -#define __DEFAULT_FETCH_TYPE(t) u##t +#define __DEFAULT_FETCH_TYPE(t) x##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) @@ -234,6 +239,10 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) +/* If ptype is an alias of atype, use this macro (show atype in format) */ +#define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #atype) + #define ASSIGN_FETCH_TYPE_END {} #define FETCH_TYPE_STRING 0 diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b2b6efc083a4..5e10395da88e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -610,8 +610,7 @@ static int perf_sysenter_enable(struct trace_event_call *call) if (!sys_perf_refcount_enter) ret = register_trace_sys_enter(perf_syscall_enter, NULL); if (ret) { - pr_info("event trace: Could not activate" - "syscall entry trace point"); + pr_info("event trace: Could not activate syscall entry trace point"); } else { set_bit(num, enabled_perf_enter_syscalls); sys_perf_refcount_enter++; @@ -682,8 +681,7 @@ static int perf_sysexit_enable(struct trace_event_call *call) if (!sys_perf_refcount_exit) ret = register_trace_sys_exit(perf_syscall_exit, NULL); if (ret) { - pr_info("event trace: Could not activate" - "syscall exit trace point"); + pr_info("event trace: Could not activate syscall exit trace point"); } else { set_bit(num, enabled_perf_exit_syscalls); sys_perf_refcount_exit++; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c53485441c88..0913693caf6e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -211,6 +211,10 @@ static const struct fetch_type uprobes_fetch_type_table[] = { ASSIGN_FETCH_TYPE(s16, u16, 1), ASSIGN_FETCH_TYPE(s32, u32, 1), ASSIGN_FETCH_TYPE(s64, u64, 1), + ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), + ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), + ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), ASSIGN_FETCH_TYPE_END }; @@ -427,10 +431,6 @@ static int create_trace_uprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - if (isdigit(argv[1][0])) { - pr_info("probe point must be have a filename.\n"); - return -EINVAL; - } arg = strchr(argv[1], ':'); if (!arg) { ret = -EINVAL; diff --git a/kernel/ucount.c b/kernel/ucount.c new file mode 100644 index 000000000000..9d20d5dd298a --- /dev/null +++ b/kernel/ucount.c @@ -0,0 +1,235 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. 
+ */ + +#include <linux/stat.h> +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/hash.h> +#include <linux/user_namespace.h> + +#define UCOUNTS_HASHTABLE_BITS 10 +static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; +static DEFINE_SPINLOCK(ucounts_lock); + +#define ucounts_hashfn(ns, uid) \ + hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \ + UCOUNTS_HASHTABLE_BITS) +#define ucounts_hashentry(ns, uid) \ + (ucounts_hashtable + ucounts_hashfn(ns, uid)) + + +#ifdef CONFIG_SYSCTL +static struct ctl_table_set * +set_lookup(struct ctl_table_root *root) +{ + return ¤t_user_ns()->set; +} + +static int set_is_seen(struct ctl_table_set *set) +{ + return ¤t_user_ns()->set == set; +} + +static int set_permissions(struct ctl_table_header *head, + struct ctl_table *table) +{ + struct user_namespace *user_ns = + container_of(head->set, struct user_namespace, set); + int mode; + + /* Allow users with CAP_SYS_RESOURCE unrestrained access */ + if (ns_capable(user_ns, CAP_SYS_RESOURCE)) + mode = (table->mode & S_IRWXU) >> 6; + else + /* Allow all others at most read-only access */ + mode = table->mode & S_IROTH; + return (mode << 6) | (mode << 3) | mode; +} + +static struct ctl_table_root set_root = { + .lookup = set_lookup, + .permissions = set_permissions, +}; + +static int zero = 0; +static int int_max = INT_MAX; +#define UCOUNT_ENTRY(name) \ + { \ + .procname = name, \ + .maxlen = sizeof(int), \ + .mode = 0644, \ + .proc_handler = proc_dointvec_minmax, \ + .extra1 = &zero, \ + .extra2 = &int_max, \ + } +static struct ctl_table user_table[] = { + UCOUNT_ENTRY("max_user_namespaces"), + UCOUNT_ENTRY("max_pid_namespaces"), + UCOUNT_ENTRY("max_uts_namespaces"), + UCOUNT_ENTRY("max_ipc_namespaces"), + UCOUNT_ENTRY("max_net_namespaces"), + UCOUNT_ENTRY("max_mnt_namespaces"), + UCOUNT_ENTRY("max_cgroup_namespaces"), + { } +}; +#endif /* CONFIG_SYSCTL */ + +bool setup_userns_sysctls(struct user_namespace *ns) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + setup_sysctl_set(&ns->set, &set_root, set_is_seen); + tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); + if (tbl) { + int i; + for (i = 0; i < UCOUNT_COUNTS; i++) { + tbl[i].data = &ns->ucount_max[i]; + } + ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl); + } + if (!ns->sysctls) { + kfree(tbl); + retire_sysctl_set(&ns->set); + return false; + } +#endif + return true; +} + +void retire_userns_sysctls(struct user_namespace *ns) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + + tbl = ns->sysctls->ctl_table_arg; + unregister_sysctl_table(ns->sysctls); + retire_sysctl_set(&ns->set); + kfree(tbl); +#endif +} + +static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) +{ + struct ucounts *ucounts; + + hlist_for_each_entry(ucounts, hashent, node) { + if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) + return ucounts; + } + return NULL; +} + +static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) +{ + struct hlist_head *hashent = ucounts_hashentry(ns, uid); + struct ucounts *ucounts, *new; + + spin_lock(&ucounts_lock); + ucounts = find_ucounts(ns, uid, hashent); + if (!ucounts) { + spin_unlock(&ucounts_lock); + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->ns = ns; + new->uid = uid; + atomic_set(&new->count, 0); + + spin_lock(&ucounts_lock); + ucounts = find_ucounts(ns, uid, hashent); + if (ucounts) { + kfree(new); + } else { + hlist_add_head(&new->node, hashent); + ucounts = new; + 
} + } + if (!atomic_add_unless(&ucounts->count, 1, INT_MAX)) + ucounts = NULL; + spin_unlock(&ucounts_lock); + return ucounts; +} + +static void put_ucounts(struct ucounts *ucounts) +{ + if (atomic_dec_and_test(&ucounts->count)) { + spin_lock(&ucounts_lock); + hlist_del_init(&ucounts->node); + spin_unlock(&ucounts_lock); + + kfree(ucounts); + } +} + +static inline bool atomic_inc_below(atomic_t *v, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c >= u)) + return false; + old = atomic_cmpxchg(v, c, c+1); + if (likely(old == c)) + return true; + c = old; + } +} + +struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, + enum ucount_type type) +{ + struct ucounts *ucounts, *iter, *bad; + struct user_namespace *tns; + ucounts = get_ucounts(ns, uid); + for (iter = ucounts; iter; iter = tns->ucounts) { + int max; + tns = iter->ns; + max = READ_ONCE(tns->ucount_max[type]); + if (!atomic_inc_below(&iter->ucount[type], max)) + goto fail; + } + return ucounts; +fail: + bad = iter; + for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) + atomic_dec(&iter->ucount[type]); + + put_ucounts(ucounts); + return NULL; +} + +void dec_ucount(struct ucounts *ucounts, enum ucount_type type) +{ + struct ucounts *iter; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + int dec = atomic_dec_if_positive(&iter->ucount[type]); + WARN_ON_ONCE(dec < 0); + } + put_ucounts(ucounts); +} + +static __init int user_namespace_sysctl_init(void) +{ +#ifdef CONFIG_SYSCTL + static struct ctl_table_header *user_header; + static struct ctl_table empty[1]; + /* + * It is necessary to register the user directory in the + * default set so that registrations in the child sets work + * properly. + */ + user_header = register_sysctl("user", empty); + BUG_ON(!user_header); + BUG_ON(!setup_userns_sysctls(&init_user_ns)); +#endif + return 0; +} +subsys_initcall(user_namespace_sysctl_init); + + diff --git a/kernel/uid16.c b/kernel/uid16.c index d58cc4d8f0d1..cc40793464e3 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -117,7 +117,7 @@ static int groups16_to_user(old_gid_t __user *grouplist, kgid_t kgid; for (i = 0; i < group_info->ngroups; i++) { - kgid = GROUP_AT(group_info, i); + kgid = group_info->gid[i]; group = high2lowgid(from_kgid_munged(user_ns, kgid)); if (put_user(group, grouplist+i)) return -EFAULT; @@ -142,7 +142,7 @@ static int groups16_from_user(struct group_info *group_info, if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; diff --git a/kernel/up.c b/kernel/up.c index 1760bf3d1463..ee81ac9af4ca 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/export.h> #include <linux/smp.h> +#include <linux/hypervisor.h> int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) @@ -82,3 +83,20 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), preempt_enable(); } EXPORT_SYMBOL(on_each_cpu_cond); + +int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) +{ + int ret; + + if (cpu != 0) + return -ENXIO; + + if (phys) + hypervisor_pin_vcpu(0); + ret = func(par); + if (phys) + hypervisor_pin_vcpu(-1); + + return ret; +} +EXPORT_SYMBOL_GPL(smp_call_on_cpu); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 68f594212759..86b7854fec8e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const 
struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); +static void free_user_ns(struct work_struct *work); + +static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) +{ + return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); +} + +static void dec_user_namespaces(struct ucounts *ucounts) +{ + return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); +} static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { @@ -62,10 +73,16 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; - int ret; + struct ucounts *ucounts; + int ret, i; + ret = -ENOSPC; if (parent_ns->level > 32) - return -EUSERS; + goto fail; + + ucounts = inc_user_namespaces(parent_ns, owner); + if (!ucounts) + goto fail; /* * Verify that we can not violate the policy of which files @@ -73,26 +90,27 @@ int create_user_ns(struct cred *new) * by verifing that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ + ret = -EPERM; if (current_chrooted()) - return -EPERM; + goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ + ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) - return -EPERM; + goto fail_dec; + ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) - return -ENOMEM; + goto fail_dec; ret = ns_alloc_inum(&ns->ns); - if (ret) { - kmem_cache_free(user_ns_cachep, ns); - return ret; - } + if (ret) + goto fail_free; ns->ns.ops = &userns_operations; atomic_set(&ns->count, 1); @@ -101,18 +119,37 @@ int create_user_ns(struct cred *new) ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; + INIT_WORK(&ns->work, free_user_ns); + for (i = 0; i < UCOUNT_COUNTS; i++) { + ns->ucount_max[i] = INT_MAX; + } + ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); - set_cred_user_ns(new, ns); - #ifdef CONFIG_PERSISTENT_KEYRINGS init_rwsem(&ns->persistent_keyring_register_sem); #endif + ret = -ENOMEM; + if (!setup_userns_sysctls(ns)) + goto fail_keyring; + + set_cred_user_ns(new, ns); return 0; +fail_keyring: +#ifdef CONFIG_PERSISTENT_KEYRINGS + key_put(ns->persistent_keyring_register); +#endif + ns_free_inum(&ns->ns); +fail_free: + kmem_cache_free(user_ns_cachep, ns); +fail_dec: + dec_user_namespaces(ucounts); +fail: + return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) @@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) return err; } -void free_user_ns(struct user_namespace *ns) +static void free_user_ns(struct work_struct *work) { - struct user_namespace *parent; + struct user_namespace *parent, *ns = + container_of(work, struct user_namespace, work); do { + struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + retire_userns_sysctls(ns); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); + dec_user_namespaces(ucounts); ns = parent; } while (atomic_dec_and_test(&parent->count)); } -EXPORT_SYMBOL(free_user_ns); + +void __put_user_ns(struct user_namespace *ns) +{ + schedule_work(&ns->work); +} +EXPORT_SYMBOL(__put_user_ns); static u32 
map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { @@ -1004,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) return commit_creds(cred); } +struct ns_common *ns_get_owner(struct ns_common *ns) +{ + struct user_namespace *my_user_ns = current_user_ns(); + struct user_namespace *owner, *p; + + /* See if the owner is in the current user namespace */ + owner = p = ns->ops->owner(ns); + for (;;) { + if (!p) + return ERR_PTR(-EPERM); + if (p == my_user_ns) + break; + p = p->parent; + } + + return &get_user_ns(owner)->ns; +} + +static struct user_namespace *userns_owner(struct ns_common *ns) +{ + return to_user_ns(ns)->parent; +} + const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, + .owner = userns_owner, + .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index 831ea7108232..6976cd47dcf6 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -17,6 +17,16 @@ #include <linux/user_namespace.h> #include <linux/proc_ns.h> +static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); +} + +static void dec_uts_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); +} + static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; @@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + struct ucounts *ucounts; int err; + err = -ENOSPC; + ucounts = inc_uts_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; ns = create_uts_ns(); if (!ns) - return ERR_PTR(-ENOMEM); + goto fail_dec; err = ns_alloc_inum(&ns->ns); - if (err) { - kfree(ns); - return ERR_PTR(err); - } + if (err) + goto fail_free; + ns->ucounts = ucounts; ns->ns.ops = &utsns_operations; down_read(&uts_sem); @@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; + +fail_free: + kfree(ns); +fail_dec: + dec_uts_namespaces(ucounts); +fail: + return ERR_PTR(err); } /* @@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref) struct uts_namespace *ns; ns = container_of(kref, struct uts_namespace, kref); + dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); @@ -130,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) return 0; } +static struct user_namespace *utsns_owner(struct ns_common *ns) +{ + return to_uts_ns(ns)->user_ns; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .owner = utsns_owner, }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef071ca73fc3..bd81f0390277 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2974,6 +2974,31 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +static bool __cancel_work(struct work_struct *work, bool is_dwork) +{ + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(work, is_dwork, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_pool_and_clear_pending(work, get_work_pool_id(work)); + local_irq_restore(flags); + return ret; +} + 
+/* + * See cancel_delayed_work() + */ +bool cancel_work(struct work_struct *work) +{ + return __cancel_work(work, false); +} + /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -2992,20 +3017,7 @@ EXPORT_SYMBOL(flush_delayed_work); */ bool cancel_delayed_work(struct delayed_work *dwork) { - unsigned long flags; - int ret; - - do { - ret = try_to_grab_pending(&dwork->work, true, &flags); - } while (unlikely(ret == -EAGAIN)); - - if (unlikely(ret < 0)) - return false; - - set_work_pool_and_clear_pending(&dwork->work, - get_work_pool_id(&dwork->work)); - local_irq_restore(flags); - return ret; + return __cancel_work(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work);
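
The atomic_inc_below() helper in the kernel/ucount.c hunk above is what enforces each per-namespace limit: it increments a counter only while the value stays strictly below the configured maximum, retrying the compare-and-swap when another task races it. Below is a minimal standalone sketch of that same pattern using C11 atomics; it is illustrative only (the name inc_below and the use of <stdatomic.h> are mine, not part of the patch).

/*
 * Sketch (not kernel code) of the bounded-increment pattern used by
 * atomic_inc_below(): succeed only if the counter can be raised while
 * staying below the ceiling, otherwise report failure.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool inc_below(atomic_int *v, int ceiling)
{
	int c = atomic_load(v);

	for (;;) {
		if (c >= ceiling)
			return false;		/* already at the limit */
		/* try to publish c + 1; on failure c is reloaded for us */
		if (atomic_compare_exchange_weak(v, &c, c + 1))
			return true;
	}
}

int main(void)
{
	atomic_int count = 3;

	printf("%d\n", inc_below(&count, 4));	/* 1: 3 -> 4 */
	printf("%d\n", inc_below(&count, 4));	/* 0: ceiling reached */
	return 0;
}

inc_ucount() applies this test at every level of the user-namespace hierarchy, walking iter->ns->ucounts toward the root, and rolls the increments back (and drops the ucounts reference) if any ancestor is already at its limit.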
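
From userspace, the limits added by this series appear as the sysctls built from user_table and the "user" directory registered in user_namespace_sysctl_init(), e.g. /proc/sys/user/max_user_namespaces, and create_user_ns() bails out once the per-user count at any ancestor level has reached its maximum (returning -ENOSPC in this version of the patch). The following is a small, hedged sketch of how one might observe this on a kernel carrying these changes; the exact errno reported by unshare() is an assumption on my part and may differ.

/* Userspace sketch: read the limit, then try to create one user namespace. */
#define _GNU_SOURCE
#include <sched.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	long max = -1;
	FILE *f = fopen("/proc/sys/user/max_user_namespaces", "r");

	if (f) {
		if (fscanf(f, "%ld", &max) != 1)
			max = -1;
		fclose(f);
	}
	printf("max_user_namespaces: %ld\n", max);

	/* Expected to fail once the per-user count has hit the limit */
	if (unshare(CLONE_NEWUSER) == 0)
		printf("unshare(CLONE_NEWUSER) succeeded\n");
	else
		printf("unshare(CLONE_NEWUSER) failed: %s\n", strerror(errno));

	return 0;
}

Lowering the limit first (for example, writing 0 to /proc/sys/user/max_user_namespaces as root in the parent namespace) should make the unshare() call above fail, since atomic_inc_below() can never succeed against a ceiling of zero.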