diff options
Diffstat (limited to 'kernel')
280 files changed, 21937 insertions, 10136 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 471d71935e90..318789c728d3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -29,7 +29,6 @@ KCOV_INSTRUMENT_softirq.o := n KCSAN_SANITIZE_softirq.o = n # These are called from save_stack_trace() on slub debug path, # and produce insane amounts of uninteresting coverage. -KCOV_INSTRUMENT_module.o := n KCOV_INSTRUMENT_extable.o := n KCOV_INSTRUMENT_stacktrace.o := n # Don't self-instrument. @@ -53,6 +52,7 @@ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ obj-y += entry/ +obj-$(CONFIG_MODULES) += module/ obj-$(CONFIG_KCMP) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -66,9 +66,6 @@ ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o -obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o @@ -114,7 +111,8 @@ obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o -obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o +obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o +obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o obj-$(CONFIG_CFI_CLANG) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/acct.c b/kernel/acct.c index 3df53cf1dcd5..13706356ec54 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -70,11 +70,31 @@ * Turned into sysctl-controllable parameters. AV, 12/11/98 */ -int acct_parm[3] = {4, 2, 30}; +static int acct_parm[3] = {4, 2, 30}; #define RESUME (acct_parm[0]) /* >foo% free space - resume */ #define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */ #define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */ +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_acct_table[] = { + { + .procname = "acct", + .data = &acct_parm, + .maxlen = 3*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static __init int kernel_acct_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_acct_table); + return 0; +} +late_initcall(kernel_acct_sysctls_init); +#endif /* CONFIG_SYSCTL */ + /* * External references and all of the globals. */ diff --git a/kernel/audit.c b/kernel/audit.c index 7690c29d4ee4..9bc0b0301198 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -321,7 +321,6 @@ static inline int audit_rate_check(void) static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; - unsigned long elapsed; int retval = 0; if (!audit_rate_limit) return 1; @@ -330,9 +329,8 @@ static inline int audit_rate_check(void) if (++messages < audit_rate_limit) { retval = 1; } else { - now = jiffies; - elapsed = now - last_check; - if (elapsed > HZ) { + now = jiffies; + if (time_after(now, last_check + HZ)) { last_check = now; messages = 0; retval = 1; @@ -366,7 +364,7 @@ void audit_log_lost(const char *message) if (!print) { spin_lock_irqsave(&lock, flags); now = jiffies; - if (now - last_msg > HZ) { + if (time_after(now, last_msg + HZ)) { print = 1; last_msg = now; } @@ -1100,7 +1098,7 @@ static inline void audit_log_user_recv_msg(struct audit_buffer **ab, audit_log_common_recv_msg(NULL, ab, msg_type); } -int is_audit_feature_set(int i) +static int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); } @@ -1390,7 +1388,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) str); } else { audit_log_format(ab, " data="); - if (data_len > 0 && str[data_len - 1] == '\0') + if (str[data_len - 1] == '\0') data_len--; audit_log_n_untrustedstring(ab, str, data_len); } diff --git a/kernel/audit.h b/kernel/audit.h index 58b66543b4d5..c57b008b9914 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -133,7 +133,7 @@ struct audit_context { struct sockaddr_storage *sockaddr; size_t sockaddr_len; /* Save things to print about task_struct */ - pid_t pid, ppid; + pid_t ppid; kuid_t uid, euid, suid, fsuid; kgid_t gid, egid, sgid, fsgid; unsigned long personality; @@ -245,8 +245,6 @@ struct audit_netlink_list { int audit_send_list_thread(void *_dest); -extern int selinux_audit_rule_update(void); - extern struct mutex audit_filter_mutex; extern int audit_del_rule(struct audit_entry *entry); extern void audit_free_rule_rcu(struct rcu_head *head); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 02348b48447c..c565fbf66ac8 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -100,8 +100,9 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true); + ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); if (ret < 0) { + audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); } @@ -181,7 +182,8 @@ static const struct fsnotify_ops audit_mark_fsnotify_ops = { static int __init audit_fsnotify_init(void) { - audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops); + audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops, + FSNOTIFY_GROUP_DUPS); if (IS_ERR(audit_fsnotify_group)) { audit_fsnotify_group = NULL; audit_panic("cannot create audit fsnotify group"); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e7315d487163..e867c17d3f84 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -351,7 +351,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark) struct audit_chunk *new; int size; - mutex_lock(&audit_tree_group->mark_mutex); + fsnotify_group_lock(audit_tree_group); /* * mark_mutex stabilizes chunk attached to the mark so we can check * whether it didn't change while we've dropped hash_lock. @@ -368,7 +368,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark) replace_mark_chunk(mark, NULL); spin_unlock(&hash_lock); fsnotify_detach_mark(mark); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); audit_mark_put_chunk(chunk); fsnotify_free_mark(mark); return; @@ -385,12 +385,12 @@ static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark) */ replace_chunk(new, chunk); spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); audit_mark_put_chunk(chunk); return; out_mutex: - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); } /* Call with group->mark_mutex held, releases it */ @@ -400,19 +400,19 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) struct audit_chunk *chunk = alloc_chunk(1); if (!chunk) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); return -ENOMEM; } mark = alloc_mark(); if (!mark) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); kfree(chunk); return -ENOMEM; } if (fsnotify_add_inode_mark_locked(mark, inode, 0)) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); kfree(chunk); return -ENOSPC; @@ -422,7 +422,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) if (tree->goner) { spin_unlock(&hash_lock); fsnotify_detach_mark(mark); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_free_mark(mark); fsnotify_put_mark(mark); kfree(chunk); @@ -444,7 +444,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) */ insert_hash(chunk); spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); /* * Drop our initial reference. When mark we point to is getting freed, * we get notification through ->freeing_mark callback and cleanup @@ -462,7 +462,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) struct audit_node *p; int n; - mutex_lock(&audit_tree_group->mark_mutex); + fsnotify_group_lock(audit_tree_group); mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group); if (!mark) return create_chunk(inode, tree); @@ -478,7 +478,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); return 0; } @@ -487,7 +487,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk = alloc_chunk(old->count + 1); if (!chunk) { - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); return -ENOMEM; } @@ -495,7 +495,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); kfree(chunk); return 0; @@ -515,7 +515,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) */ replace_chunk(chunk, old); spin_unlock(&hash_lock); - mutex_unlock(&audit_tree_group->mark_mutex); + fsnotify_group_unlock(audit_tree_group); fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */ audit_mark_put_chunk(old); @@ -1044,12 +1044,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark, { struct audit_chunk *chunk; - mutex_lock(&mark->group->mark_mutex); + fsnotify_group_lock(mark->group); spin_lock(&hash_lock); chunk = mark_chunk(mark); replace_mark_chunk(mark, NULL); spin_unlock(&hash_lock); - mutex_unlock(&mark->group->mark_mutex); + fsnotify_group_unlock(mark->group); if (chunk) { evict_chunk(chunk); audit_mark_put_chunk(chunk); @@ -1074,7 +1074,7 @@ static int __init audit_tree_init(void) audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC); - audit_tree_group = fsnotify_alloc_group(&audit_tree_ops); + audit_tree_group = fsnotify_alloc_group(&audit_tree_ops, 0); if (IS_ERR(audit_tree_group)) audit_panic("cannot initialize fsnotify group for rectree watches"); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 713b256be944..65075f1e4ac8 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -133,7 +133,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) } /* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct path *path) +static struct audit_parent *audit_init_parent(const struct path *path) { struct inode *inode = d_backing_inode(path->dentry); struct audit_parent *parent; @@ -493,7 +493,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = { static int __init audit_watch_init(void) { - audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops); + audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops, 0); if (IS_ERR(audit_watch_group)) { audit_watch_group = NULL; audit_panic("cannot create audit fsnotify group"); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ea2ee1181921..9f8c05228d6d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -965,7 +965,7 @@ static void audit_reset_context(struct audit_context *ctx) if (!ctx) return; - /* if ctx is non-null, reset the "ctx->state" regardless */ + /* if ctx is non-null, reset the "ctx->context" regardless */ ctx->context = AUDIT_CTX_UNUSED; if (ctx->dummy) return; @@ -1002,7 +1002,7 @@ static void audit_reset_context(struct audit_context *ctx) kfree(ctx->sockaddr); ctx->sockaddr = NULL; ctx->sockaddr_len = 0; - ctx->pid = ctx->ppid = 0; + ctx->ppid = 0; ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0); ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0); ctx->personality = 0; @@ -1014,10 +1014,9 @@ static void audit_reset_context(struct audit_context *ctx) ctx->target_comm[0] = '\0'; unroll_tree_refs(ctx, NULL, 0); WARN_ON(!list_empty(&ctx->killed_trees)); - ctx->type = 0; audit_free_module(ctx); ctx->fds[0] = -1; - audit_proctitle_free(ctx); + ctx->type = 0; /* reset last for audit_free_*() */ } static inline struct audit_context *audit_alloc_context(enum audit_state state) @@ -1073,35 +1072,11 @@ int audit_alloc(struct task_struct *tsk) return 0; } -/** - * audit_alloc_kernel - allocate an audit_context for a kernel task - * @tsk: the kernel task - * - * Similar to the audit_alloc() function, but intended for kernel private - * threads. Returns zero on success, negative values on failure. - */ -int audit_alloc_kernel(struct task_struct *tsk) -{ - /* - * At the moment we are just going to call into audit_alloc() to - * simplify the code, but there two things to keep in mind with this - * approach: - * - * 1. Filtering internal kernel tasks is a bit laughable in almost all - * cases, but there is at least one case where there is a benefit: - * the '-a task,never' case allows the admin to effectively disable - * task auditing at runtime. - * - * 2. The {set,clear}_task_syscall_work() ops likely have zero effect - * on these internal kernel tasks, but they probably don't hurt either. - */ - return audit_alloc(tsk); -} - static inline void audit_free_context(struct audit_context *context) { /* resetting is extra work, but it is likely just noise */ audit_reset_context(context); + audit_proctitle_free(context); free_tree_refs(context); kfree(context->filterkey); kfree(context); @@ -1858,7 +1833,7 @@ void __audit_free(struct task_struct *tsk) /* We are called either by do_exit() or the fork() error handling code; * in the former case tsk == current and in the latter tsk is a - * random task_struct that doesn't doesn't have any meaningful data we + * random task_struct that doesn't have any meaningful data we * need to log via audit_log_exit(). */ if (tsk == current && !context->dummy) { @@ -1959,6 +1934,13 @@ void __audit_uring_exit(int success, long code) { struct audit_context *ctx = audit_context(); + if (ctx->dummy) { + if (ctx->context != AUDIT_CTX_URING) + return; + goto out; + } + + audit_return_fixup(ctx, success, code); if (ctx->context == AUDIT_CTX_SYSCALL) { /* * NOTE: See the note in __audit_uring_entry() about the case @@ -2000,7 +1982,6 @@ void __audit_uring_exit(int success, long code) audit_filter_inodes(current, ctx); if (ctx->current_state != AUDIT_STATE_RECORD) goto out; - audit_return_fixup(ctx, success, code); audit_log_exit(); out: @@ -2084,13 +2065,13 @@ void __audit_syscall_exit(int success, long return_code) if (!list_empty(&context->killed_trees)) audit_kill_trees(context); + audit_return_fixup(context, success, return_code); /* run through both filters to ensure we set the filterkey properly */ audit_filter_syscall(current, context); audit_filter_inodes(current, context); - if (context->current_state < AUDIT_STATE_RECORD) + if (context->current_state != AUDIT_STATE_RECORD) goto out; - audit_return_fixup(context, success, return_code); audit_log_exit(); out: diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index d56ee177d5f8..2dfe1079f772 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -27,6 +27,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK + select TASKS_RCU if PREEMPTION select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c1a9be6a4b9f..341c94f208f4 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,14 +6,14 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o @@ -24,6 +24,9 @@ endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +ifeq ($(CONFIG_CGROUPS),y) +obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o +endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7f145aefbff8..832b2659e96e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,6 +11,7 @@ #include <linux/perf_event.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> +#include <linux/btf_ids.h> #include "map_in_map.h" @@ -69,10 +70,8 @@ int array_map_alloc_check(union bpf_attr *attr) attr->map_flags & BPF_F_PRESERVE_ELEMS) return -EINVAL; - if (attr->value_size > KMALLOC_MAX_SIZE) - /* if value_size is bigger, the user space won't be able to - * access the elements. - */ + /* avoid overflow on round_up(map->value_size) */ + if (attr->value_size > INT_MAX) return -E2BIG; return 0; @@ -155,6 +154,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return &array->map; } +static void *array_map_elem_ptr(struct bpf_array* array, u32 index) +{ + return array->value + (u64)array->elem_size * index; +} + /* Called from syscall or from eBPF program */ static void *array_map_lookup_elem(struct bpf_map *map, void *key) { @@ -164,7 +168,7 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * (index & array->index_mask); + return array->value + (u64)array->elem_size * (index & array->index_mask); } static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, @@ -202,7 +206,7 @@ static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; - u32 elem_size = round_up(map->value_size, 8); + u32 elem_size = array->elem_size; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; @@ -242,6 +246,20 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index & array->index_mask]); } +static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (cpu >= nr_cpu_ids) + return NULL; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); +} + int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -257,11 +275,12 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ - size = round_up(map->value_size, 8); + size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value + off); off += size; } rcu_read_unlock(); @@ -287,10 +306,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } -static void check_and_free_timer_in_array(struct bpf_array *arr, void *val) +static void check_and_free_fields(struct bpf_array *arr, void *val) { - if (unlikely(map_value_has_timer(&arr->map))) + if (map_value_has_timer(&arr->map)) bpf_timer_cancel_and_free(val + arr->map.timer_off); + if (map_value_has_kptrs(&arr->map)) + bpf_map_free_kptrs(&arr->map, val); } /* Called from syscall or from eBPF program */ @@ -318,16 +339,17 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), - value, map->value_size); + val = this_cpu_ptr(array->pptrs[index & array->index_mask]); + copy_map_value(map, val, value); + check_and_free_fields(array, val); } else { val = array->value + - array->elem_size * (index & array->index_mask); + (u64)array->elem_size * (index & array->index_mask); if (map_flags & BPF_F_LOCK) copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - check_and_free_timer_in_array(array, val); + check_and_free_fields(array, val); } return 0; } @@ -359,11 +381,12 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ - size = round_up(map->value_size, 8); + size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); + check_and_free_fields(array, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); @@ -386,18 +409,37 @@ static void array_map_free_timers(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - if (likely(!map_value_has_timer(map))) + /* We don't reset or free kptr on uref dropping to zero. */ + if (!map_value_has_timer(map)) return; for (i = 0; i < array->map.max_entries; i++) - bpf_timer_cancel_and_free(array->value + array->elem_size * i + - map->timer_off); + bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + if (map_value_has_kptrs(map)) { + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + for (i = 0; i < array->map.max_entries; i++) { + void __percpu *pptr = array->pptrs[i & array->index_mask]; + int cpu; + + for_each_possible_cpu(cpu) { + bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu)); + cond_resched(); + } + } + } else { + for (i = 0; i < array->map.max_entries; i++) + bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); + } + bpf_map_free_kptr_off_tab(map); + } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); @@ -531,7 +573,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) index = info->index & array->index_mask; if (info->percpu_value_buf) return array->pptrs[index]; - return array->value + array->elem_size * index; + return array_map_elem_ptr(array, index); } static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -550,7 +592,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) index = info->index & array->index_mask; if (info->percpu_value_buf) return array->pptrs[index]; - return array->value + array->elem_size * index; + return array_map_elem_ptr(array, index); } static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) @@ -558,6 +600,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_iter_meta meta; struct bpf_prog *prog; int off = 0, cpu = 0; @@ -578,11 +621,11 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) ctx.value = v; } else { pptr = v; - size = round_up(map->value_size, 8); + size = array->elem_size; for_each_possible_cpu(cpu) { - bpf_long_memcpy(info->percpu_value_buf + off, - per_cpu_ptr(pptr, cpu), - size); + copy_map_value_long(map, info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, info->percpu_value_buf + off); off += size; } ctx.value = info->percpu_value_buf; @@ -608,11 +651,12 @@ static int bpf_iter_init_array_map(void *priv_data, { struct bpf_iter_seq_array_map_info *seq_info = priv_data; struct bpf_map *map = aux->map; + struct bpf_array *array = container_of(map, struct bpf_array, map); void *value_buf; u32 buf_size; if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + buf_size = array->elem_size * num_possible_cpus(); value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); if (!value_buf) return -ENOMEM; @@ -620,6 +664,11 @@ static int bpf_iter_init_array_map(void *priv_data, seq_info->percpu_value_buf = value_buf; } + /* bpf_iter_attach_map() acquires a map uref, and the uref may be + * released before or in the middle of iterating map elements, so + * acquire an extra map uref for iterator. + */ + bpf_map_inc_with_uref(map); seq_info->map = map; return 0; } @@ -628,6 +677,7 @@ static void bpf_iter_fini_array_map(void *priv_data) { struct bpf_iter_seq_array_map_info *seq_info = priv_data; + bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } @@ -665,7 +715,7 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_ if (is_percpu) val = this_cpu_ptr(array->pptrs[i]); else - val = array->value + array->elem_size * i; + val = array_map_elem_ptr(array, i); num_elems++; key = i; ret = callback_fn((u64)(long)map, (u64)(long)&key, @@ -680,7 +730,7 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_ return num_elems; } -static int array_map_btf_id; +BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array) const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, @@ -701,12 +751,10 @@ const struct bpf_map_ops array_map_ops = { .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int percpu_array_map_btf_id; const struct bpf_map_ops percpu_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, @@ -716,14 +764,14 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &percpu_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -1102,7 +1150,6 @@ static void prog_array_map_free(struct bpf_map *map) * Thus, prog_array_map cannot be used as an inner_map * and map_meta_equal is not implemented. */ -static int prog_array_map_btf_id; const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, @@ -1118,8 +1165,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, - .map_btf_name = "bpf_array", - .map_btf_id = &prog_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -1208,7 +1254,6 @@ static void perf_event_fd_array_map_free(struct bpf_map *map) fd_array_map_free(map); } -static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, @@ -1221,8 +1266,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &perf_event_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; #ifdef CONFIG_CGROUPS @@ -1245,7 +1289,6 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } -static int cgroup_array_map_btf_id; const struct bpf_map_ops cgroup_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, @@ -1257,8 +1300,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &cgroup_array_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; #endif @@ -1305,7 +1347,7 @@ static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); - u32 elem_size = round_up(map->value_size, 8); + u32 elem_size = array->elem_size; struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; @@ -1332,7 +1374,6 @@ static int array_of_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static int array_of_maps_map_btf_id; const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, @@ -1344,7 +1385,8 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, + .map_lookup_batch = generic_map_lookup_batch, + .map_update_batch = generic_map_update_batch, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_array", - .map_btf_id = &array_of_maps_map_btf_id, + .map_btf_id = &array_map_btf_ids[0], }; diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index b141a1346f72..b9ea539a5561 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -7,6 +7,7 @@ #include <linux/err.h> #include <linux/jhash.h> #include <linux/random.h> +#include <linux/btf_ids.h> #define BLOOM_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK) @@ -192,7 +193,7 @@ static int bloom_map_check_btf(const struct bpf_map *map, return btf_type_is_void(key_type) ? 0 : -EINVAL; } -static int bpf_bloom_map_btf_id; +BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter) const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bloom_map_alloc, @@ -205,6 +206,5 @@ const struct bpf_map_ops bloom_filter_map_ops = { .map_update_elem = bloom_map_update_elem, .map_delete_elem = bloom_map_delete_elem, .map_check_btf = bloom_map_check_btf, - .map_btf_name = "bpf_bloom_filter", - .map_btf_id = &bpf_bloom_map_btf_id, + .map_btf_id = &bpf_bloom_map_btf_ids[0], }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 96be8d518885..5f7683b19199 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -90,7 +90,7 @@ void bpf_inode_storage_free(struct inode *inode) */ bpf_selem_unlink_map(selem); free_inode_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false); + local_storage, selem, false, false); } raw_spin_unlock_bh(&local_storage->lock); rcu_read_unlock(); @@ -149,7 +149,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata), true); return 0; } @@ -245,7 +245,8 @@ static void inode_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap, NULL); } -static int inode_storage_map_btf_id; +BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, + bpf_local_storage_map) const struct bpf_map_ops inode_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -256,8 +257,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_name = "bpf_local_storage_map", - .map_btf_id = &inode_storage_map_btf_id, + .map_btf_id = &inode_storage_map_btf_ids[0], .map_owner_storage_ptr = inode_storage_ptr, }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 110029ede71e..5dc307bdeaeb 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -68,23 +68,27 @@ static void bpf_iter_done_stop(struct seq_file *seq) iter_priv->done_stop = true; } +static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo) +{ + return tinfo->reg_info->feature & BPF_ITER_RESCHED; +} + static bool bpf_iter_support_resched(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED; + return bpf_iter_target_support_resched(iter_priv->tinfo); } /* maximum visited objects before bailing out */ #define MAX_ITER_OBJECTS 1000000 /* bpf_seq_read, a customized and simpler version for bpf iterator. - * no_llseek is assumed for this file. * The following are differences from seq_read(): * . fixed buffer size (PAGE_SIZE) - * . assuming no_llseek + * . assuming NULL ->llseek() * . stop() may call bpf program, handling potential overflow there */ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, @@ -198,6 +202,11 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, } stop: offs = seq->count; + if (IS_ERR(p)) { + seq->op->stop(seq, NULL); + err = PTR_ERR(p); + goto done; + } /* bpf program called if !p */ seq->op->stop(seq, p); if (!p) { @@ -330,35 +339,34 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo, bool bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; + struct bpf_iter_target_info *tinfo = NULL, *iter; u32 prog_btf_id = prog->aux->attach_btf_id; const char *prefix = BPF_ITER_FUNC_PREFIX; - struct bpf_iter_target_info *tinfo; int prefix_len = strlen(prefix); - bool supported = false; if (strncmp(attach_fname, prefix, prefix_len)) return false; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { - supported = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id && iter->btf_id == prog_btf_id) { + tinfo = iter; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { - cache_btf_id(tinfo, prog); - supported = true; + if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) { + cache_btf_id(iter, prog); + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (supported) { + if (tinfo) { prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; } - return supported; + return tinfo != NULL; } const struct bpf_func_proto * @@ -499,12 +507,11 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog) { + struct bpf_iter_target_info *tinfo = NULL, *iter; struct bpf_link_primer link_primer; - struct bpf_iter_target_info *tinfo; union bpf_iter_link_info linfo; struct bpf_iter_link *link; u32 prog_btf_id, linfo_len; - bool existed = false; bpfptr_t ulinfo; int err; @@ -530,16 +537,20 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id == prog_btf_id) { - existed = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id == prog_btf_id) { + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (!existed) + if (!tinfo) return -ENOENT; + /* Only allow sleepable program for resched-able iterator */ + if (prog->aux->sleepable && !bpf_iter_target_support_resched(tinfo)) + return -EINVAL; + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -547,7 +558,7 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); link->tinfo = tinfo; - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); return err; @@ -683,19 +694,24 @@ struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { + struct bpf_run_ctx run_ctx, *old_run_ctx; int ret; if (prog->aux->sleepable) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); + bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock_trace(); } else { rcu_read_lock(); migrate_disable(); + old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); + bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock(); } @@ -725,9 +741,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = { .arg4_type = ARG_ANYTHING, }; -/* maximum number of loops */ -#define MAX_LOOPS BIT(23) - BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64, flags) { @@ -735,9 +748,13 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64 ret; u32 i; + /* Note: these safety checks are also verified when bpf_loop + * is inlined, be careful to modify this code in sync. See + * function verifier.c:inline_bpf_loop. + */ if (flags) return -EINVAL; - if (nr_loops > MAX_LOOPS) + if (nr_loops > BPF_MAX_LOOPS) return -E2BIG; for (i = 0; i < nr_loops; i++) { diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 01aa2b51ec4d..802fc15b0d73 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -106,7 +106,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) */ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem) + bool uncharge_mem, bool use_trace_rcu) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -150,11 +150,16 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); + if (use_trace_rcu) + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); + else + kfree_rcu(selem, rcu); + return free_local_storage; } -static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) +static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, + bool use_trace_rcu) { struct bpf_local_storage *local_storage; bool free_local_storage = false; @@ -169,12 +174,16 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true); + local_storage, selem, true, use_trace_rcu); raw_spin_unlock_irqrestore(&local_storage->lock, flags); - if (free_local_storage) - call_rcu_tasks_trace(&local_storage->rcu, + if (free_local_storage) { + if (use_trace_rcu) + call_rcu_tasks_trace(&local_storage->rcu, bpf_local_storage_free_rcu); + else + kfree_rcu(local_storage, rcu); + } } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -214,14 +223,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_unlock_irqrestore(&b->lock, flags); } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem) +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. */ bpf_selem_unlink_map(selem); - __bpf_selem_unlink_storage(selem); + __bpf_selem_unlink_storage(selem, use_trace_rcu); } struct bpf_local_storage_data * @@ -466,7 +475,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - false); + false, true); } unlock: @@ -546,11 +555,11 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem, map_node))) { if (busy_counter) { migrate_disable(); - __this_cpu_inc(*busy_counter); + this_cpu_inc(*busy_counter); } - bpf_selem_unlink(selem); + bpf_selem_unlink(selem, false); if (busy_counter) { - __this_cpu_dec(*busy_counter); + this_cpu_dec(*busy_counter); migrate_enable(); } cond_resched_rcu(); @@ -573,7 +582,7 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, synchronize_rcu(); kvfree(smap->buckets); - kfree(smap); + bpf_map_area_free(smap); } int bpf_local_storage_map_alloc_check(union bpf_attr *attr) @@ -601,7 +610,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) unsigned int i; u32 nbuckets; - smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); @@ -614,7 +623,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap->buckets) { - kfree(smap); + bpf_map_area_free(smap); return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 6b12f06ee18c..4ea227c9c1ad 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -4,6 +4,7 @@ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ +#include <linux/cache.h> #include <linux/list.h> #include <linux/spinlock_types.h> diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 064eccba641d..d6c9b3705f24 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -16,6 +16,7 @@ #include <linux/bpf_local_storage.h> #include <linux/btf_ids.h> #include <linux/ima.h> +#include <linux/bpf-cgroup.h> /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -35,6 +36,65 @@ BTF_SET_START(bpf_lsm_hooks) #undef LSM_HOOK BTF_SET_END(bpf_lsm_hooks) +/* List of LSM hooks that should operate on 'current' cgroup regardless + * of function signature. + */ +BTF_SET_START(bpf_lsm_current_hooks) +/* operate on freshly allocated sk without any cgroup association */ +#ifdef CONFIG_SECURITY_NETWORK +BTF_ID(func, bpf_lsm_sk_alloc_security) +BTF_ID(func, bpf_lsm_sk_free_security) +#endif +BTF_SET_END(bpf_lsm_current_hooks) + +/* List of LSM hooks that trigger while the socket is properly locked. + */ +BTF_SET_START(bpf_lsm_locked_sockopt_hooks) +#ifdef CONFIG_SECURITY_NETWORK +BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) +BTF_ID(func, bpf_lsm_sock_graft) +BTF_ID(func, bpf_lsm_inet_csk_clone) +BTF_ID(func, bpf_lsm_inet_conn_established) +#endif +BTF_SET_END(bpf_lsm_locked_sockopt_hooks) + +/* List of LSM hooks that trigger while the socket is _not_ locked, + * but it's ok to call bpf_{g,s}etsockopt because the socket is still + * in the early init phase. + */ +BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks) +#ifdef CONFIG_SECURITY_NETWORK +BTF_ID(func, bpf_lsm_socket_post_create) +BTF_ID(func, bpf_lsm_socket_socketpair) +#endif +BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks) + +#ifdef CONFIG_CGROUP_BPF +void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, + bpf_func_t *bpf_func) +{ + const struct btf_param *args __maybe_unused; + + if (btf_type_vlen(prog->aux->attach_func_proto) < 1 || + btf_id_set_contains(&bpf_lsm_current_hooks, + prog->aux->attach_btf_id)) { + *bpf_func = __cgroup_bpf_run_lsm_current; + return; + } + +#ifdef CONFIG_NET + args = btf_params(prog->aux->attach_func_proto); + + if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET]) + *bpf_func = __cgroup_bpf_run_lsm_socket; + else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK]) + *bpf_func = __cgroup_bpf_run_lsm_sock; + else +#endif + *bpf_func = __cgroup_bpf_run_lsm_current; +} +#endif + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { @@ -117,9 +177,32 @@ static const struct bpf_func_proto bpf_ima_file_hash_proto = { .allowed = bpf_ima_inode_hash_allowed, }; +BPF_CALL_1(bpf_get_attach_cookie, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto = { + .func = bpf_get_attach_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + if (prog->expected_attach_type == BPF_LSM_CGROUP) { + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + } + switch (func_id) { case BPF_FUNC_inode_storage_get: return &bpf_inode_storage_get_proto; @@ -141,6 +224,30 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; case BPF_FUNC_ima_file_hash: return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; +#ifdef CONFIG_NET + case BPF_FUNC_setsockopt: + if (prog->expected_attach_type != BPF_LSM_CGROUP) + return NULL; + if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_sk_setsockopt_proto; + if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_unlocked_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + if (prog->expected_attach_type != BPF_LSM_CGROUP) + return NULL; + if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_sk_getsockopt_proto; + if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks, + prog->aux->attach_btf_id)) + return &bpf_unlocked_sk_getsockopt_proto; + return NULL; +#endif default: return tracing_prog_func_proto(func_id, prog); } @@ -233,6 +340,7 @@ BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) BTF_ID(func, bpf_lsm_task_to_inode) +BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) bool bpf_lsm_is_sleepable_hook(u32 btf_id) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 21069dbe9138..84b2d9dba79a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -10,6 +10,7 @@ #include <linux/seq_file.h> #include <linux/refcount.h> #include <linux/mutex.h> +#include <linux/btf_ids.h> enum bpf_struct_ops_state { BPF_STRUCT_OPS_STATE_INIT, @@ -32,15 +33,15 @@ struct bpf_struct_ops_map { const struct bpf_struct_ops *st_ops; /* protect map_update */ struct mutex lock; - /* progs has all the bpf_prog that is populated + /* link has all the bpf_links that is populated * to the func ptr of the kernel's struct * (in kvalue.data). */ - struct bpf_prog **progs; + struct bpf_link **links; /* image is a page that has all the trampolines * that stores the func args before calling the bpf_prog. * A PAGE_SIZE "image" is enough to store all trampoline for - * "progs[]". + * "links[]". */ void *image; /* uvalue->data stores the kernel struct @@ -263,7 +264,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, /* No lock is needed. state and refcnt do not need * to be updated together under atomic context. */ - uvalue = (struct bpf_struct_ops_value *)value; + uvalue = value; memcpy(uvalue, st_map->uvalue, map->value_size); uvalue->state = state; refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); @@ -282,9 +283,9 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) u32 i; for (i = 0; i < btf_type_vlen(t); i++) { - if (st_map->progs[i]) { - bpf_prog_put(st_map->progs[i]); - st_map->progs[i] = NULL; + if (st_map->links[i]) { + bpf_link_put(st_map->links[i]); + st_map->links[i] = NULL; } } } @@ -315,18 +316,37 @@ static int check_zero_holes(const struct btf_type *t, void *data) return 0; } -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs, - struct bpf_prog *prog, +static void bpf_struct_ops_link_release(struct bpf_link *link) +{ +} + +static void bpf_struct_ops_link_dealloc(struct bpf_link *link) +{ + struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link); + + kfree(tlink); +} + +const struct bpf_link_ops bpf_struct_ops_link_lops = { + .release = bpf_struct_ops_link_release, + .dealloc = bpf_struct_ops_link_dealloc, +}; + +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, + struct bpf_tramp_link *link, const struct btf_func_model *model, void *image, void *image_end) { u32 flags; - tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; - tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; + tlinks[BPF_TRAMP_FENTRY].links[0] = link; + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; + /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, + * and it must be used alone. + */ flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; return arch_prepare_bpf_trampoline(NULL, image, image_end, - model, flags, tprogs, NULL); + model, flags, tlinks, NULL); } static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, @@ -337,7 +357,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; - struct bpf_tramp_progs *tprogs = NULL; + struct bpf_tramp_links *tlinks = NULL; void *udata, *kdata; int prog_fd, err = 0; void *image, *image_end; @@ -353,7 +373,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) return err; - uvalue = (struct bpf_struct_ops_value *)value; + uvalue = value; err = check_zero_holes(t, uvalue->data); if (err) return err; @@ -361,8 +381,8 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->state || refcount_read(&uvalue->refcnt)) return -EINVAL; - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) return -ENOMEM; uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; @@ -385,6 +405,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; + struct bpf_tramp_link *link; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; @@ -438,16 +459,26 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = PTR_ERR(prog); goto reset_unlock; } - st_map->progs[i] = prog; if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || prog->aux->attach_btf_id != st_ops->type_id || prog->expected_attach_type != i) { + bpf_prog_put(prog); err = -EINVAL; goto reset_unlock; } - err = bpf_struct_ops_prepare_trampoline(tprogs, prog, + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + bpf_prog_put(prog); + err = -ENOMEM; + goto reset_unlock; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, + &bpf_struct_ops_link_lops, prog); + st_map->links[i] = &link->link; + + err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], image, image_end); if (err < 0) @@ -475,10 +506,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - /* Error during st_ops->reg(). It is very unlikely since - * the above init_member() should have caught it earlier - * before reg(). The only possibility is if there was a race - * in registering the struct_ops (under the same name) to + /* Error during st_ops->reg(). Can happen if this struct_ops needs to be + * verified as a whole, after all init_member() calls. Can also happen if + * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ set_memory_nx((long)st_map->image, 1); @@ -490,7 +520,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: - kfree(tprogs); + kfree(tlinks); mutex_unlock(&st_map->lock); return err; } @@ -545,9 +575,9 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; - if (st_map->progs) + if (st_map->links) bpf_struct_ops_map_put_progs(st_map); - bpf_map_area_free(st_map->progs); + bpf_map_area_free(st_map->links); bpf_jit_free_exec(st_map->image); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); @@ -596,11 +626,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) map = &st_map->map; st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); - st_map->progs = - bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *), + st_map->links = + bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), NUMA_NO_NODE); st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!st_map->uvalue || !st_map->progs || !st_map->image) { + if (!st_map->uvalue || !st_map->links || !st_map->image) { bpf_struct_ops_map_free(map); return ERR_PTR(-ENOMEM); } @@ -612,7 +642,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) return map; } -static int bpf_struct_ops_map_btf_id; +BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map) const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_alloc_check = bpf_struct_ops_map_alloc_check, .map_alloc = bpf_struct_ops_map_alloc, @@ -622,8 +652,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, - .map_btf_name = "bpf_struct_ops_map", - .map_btf_id = &bpf_struct_ops_map_btf_id, + .map_btf_id = &bpf_struct_ops_map_btf_ids[0], }; /* "const void *" because some subsystem is diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 6638a0ecc3d2..6f290623347e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -26,20 +26,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy); static void bpf_task_storage_lock(void) { migrate_disable(); - __this_cpu_inc(bpf_task_storage_busy); + this_cpu_inc(bpf_task_storage_busy); } static void bpf_task_storage_unlock(void) { - __this_cpu_dec(bpf_task_storage_busy); + this_cpu_dec(bpf_task_storage_busy); migrate_enable(); } static bool bpf_task_storage_trylock(void) { migrate_disable(); - if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) { - __this_cpu_dec(bpf_task_storage_busy); + if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { + this_cpu_dec(bpf_task_storage_busy); migrate_enable(); return false; } @@ -102,7 +102,7 @@ void bpf_task_storage_free(struct task_struct *task) */ bpf_selem_unlink_map(selem); free_task_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false); + local_storage, selem, false, false); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_task_storage_unlock(); @@ -192,7 +192,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata), true); return 0; } @@ -307,7 +307,7 @@ static void task_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(smap, &bpf_task_storage_busy); } -static int task_storage_map_btf_id; +BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops task_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -318,8 +318,7 @@ const struct bpf_map_ops task_storage_map_ops = { .map_update_elem = bpf_pid_task_storage_update_elem, .map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_name = "bpf_local_storage_map", - .map_btf_id = &task_storage_map_btf_id, + .map_btf_id = &task_storage_map_btf_ids[0], .map_owner_storage_ptr = task_storage_ptr, }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0918a39279f6..eba603cec2c5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -202,15 +202,23 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_XDP, BTF_KFUNC_HOOK_TC, BTF_KFUNC_HOOK_STRUCT_OPS, + BTF_KFUNC_HOOK_TRACING, + BTF_KFUNC_HOOK_SYSCALL, BTF_KFUNC_HOOK_MAX, }; enum { - BTF_KFUNC_SET_MAX_CNT = 32, + BTF_KFUNC_SET_MAX_CNT = 256, + BTF_DTOR_KFUNC_MAX_CNT = 256, }; struct btf_kfunc_set_tab { - struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX]; + struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX]; +}; + +struct btf_id_dtor_kfunc_tab { + u32 cnt; + struct btf_id_dtor_kfunc dtors[]; }; struct btf { @@ -228,6 +236,7 @@ struct btf { u32 id; struct rcu_head rcu; struct btf_kfunc_set_tab *kfunc_set_tab; + struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; /* split BTF support */ struct btf *base_btf; @@ -300,6 +309,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", [BTF_KIND_TYPE_TAG] = "TYPE_TAG", + [BTF_KIND_ENUM64] = "ENUM64", }; const char *btf_type_str(const struct btf_type *t) @@ -657,6 +667,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_ENUM: case BTF_KIND_DATASEC: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: return true; } @@ -702,6 +713,11 @@ static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t) return (const struct btf_decl_tag *)(t + 1); } +static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t) +{ + return (const struct btf_enum64 *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -802,6 +818,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) return NULL; return btf->types[type_id]; } +EXPORT_SYMBOL_GPL(btf_type_by_id); /* * Regular int is not a bit field and it must be either @@ -1010,6 +1027,7 @@ static const char *btf_show_name(struct btf_show *show) parens = "{"; break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: prefix = "enum"; break; default: @@ -1099,7 +1117,8 @@ __printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...) */ #define btf_show_type_value(show, fmt, value) \ do { \ - if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \ + if ((value) != (__typeof__(value))0 || \ + (show->flags & BTF_SHOW_ZERO) || \ show->state.depth == 0) { \ btf_show(show, "%s%s" fmt "%s%s", \ btf_show_indent(show), \ @@ -1378,7 +1397,6 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, const char *fmt, ...) { struct bpf_verifier_log *log = &env->log; - u8 kind = BTF_INFO_KIND(t->info); struct btf *btf = env->btf; va_list args; @@ -1394,7 +1412,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, __btf_verifier_log(log, "[%u] %s %s%s", env->log_type_id, - btf_kind_str[kind], + btf_type_str(t), __btf_name_by_offset(btf, t->name_off), log_details ? " " : ""); @@ -1598,7 +1616,7 @@ static void btf_free_id(struct btf *btf) static void btf_free_kfunc_set_tab(struct btf *btf) { struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab; - int hook, type; + int hook; if (!tab) return; @@ -1607,17 +1625,26 @@ static void btf_free_kfunc_set_tab(struct btf *btf) */ if (btf_is_module(btf)) goto free_tab; - for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) { - for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++) - kfree(tab->sets[hook][type]); - } + for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) + kfree(tab->sets[hook]); free_tab: kfree(tab); btf->kfunc_set_tab = NULL; } +static void btf_free_dtor_kfunc_tab(struct btf *btf) +{ + struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; + + if (!tab) + return; + kfree(tab); + btf->dtor_kfunc_tab = NULL; +} + static void btf_free(struct btf *btf) { + btf_free_dtor_kfunc_tab(btf); btf_free_kfunc_set_tab(btf); kvfree(btf->types); kvfree(btf->resolved_sizes); @@ -1814,6 +1841,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_UNION: case BTF_KIND_ENUM: case BTF_KIND_FLOAT: + case BTF_KIND_ENUM64: size = type->size; goto resolved; @@ -3100,7 +3128,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, if (v->next_member) { const struct btf_type *last_member_type; const struct btf_member *last_member; - u16 last_member_type_id; + u32 last_member_type_id; last_member = btf_type_member(v->t) + v->next_member - 1; last_member_type_id = last_member->type; @@ -3163,24 +3191,86 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +enum btf_field_type { + BTF_FIELD_SPIN_LOCK, + BTF_FIELD_TIMER, + BTF_FIELD_KPTR, +}; + +enum { + BTF_FIELD_IGNORE = 0, + BTF_FIELD_FOUND = 1, +}; + +struct btf_field_info { + u32 type_id; + u32 off; + enum bpf_kptr_type type; +}; + +static int btf_find_struct(const struct btf *btf, const struct btf_type *t, + u32 off, int sz, struct btf_field_info *info) +{ + if (!__btf_type_is_struct(t)) + return BTF_FIELD_IGNORE; + if (t->size != sz) + return BTF_FIELD_IGNORE; + info->off = off; + return BTF_FIELD_FOUND; +} + +static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, + u32 off, int sz, struct btf_field_info *info) +{ + enum bpf_kptr_type type; + u32 res_id; + + /* For PTR, sz is always == 8 */ + if (!btf_type_is_ptr(t)) + return BTF_FIELD_IGNORE; + t = btf_type_by_id(btf, t->type); + + if (!btf_type_is_type_tag(t)) + return BTF_FIELD_IGNORE; + /* Reject extra tags */ + if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) + return -EINVAL; + if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_UNREF; + else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_REF; + else + return -EINVAL; + + /* Get the base type */ + t = btf_type_skip_modifiers(btf, t->type, &res_id); + /* Only pointer to struct is allowed */ + if (!__btf_type_is_struct(t)) + return -EINVAL; + + info->type_id = res_id; + info->off = off; + info->type = type; + return BTF_FIELD_FOUND; +} + static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + const char *name, int sz, int align, + enum btf_field_type field_type, + struct btf_field_info *info, int info_cnt) { const struct btf_member *member; - u32 i, off = -ENOENT; + struct btf_field_info tmp; + int ret, idx = 0; + u32 i, off; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - if (!__btf_type_is_struct(member_type)) - continue; - if (member_type->size != sz) - continue; - if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) + + if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) continue; - if (off != -ENOENT) - /* only one such field is allowed */ - return -E2BIG; + off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ @@ -3188,46 +3278,115 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t off /= 8; if (off % align) return -EINVAL; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + case BTF_FIELD_TIMER: + ret = btf_find_struct(btf, member_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + case BTF_FIELD_KPTR: + ret = btf_find_kptr(btf, member_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + default: + return -EFAULT; + } + + if (ret == BTF_FIELD_IGNORE) + continue; + if (idx >= info_cnt) + return -E2BIG; + ++idx; } - return off; + return idx; } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + const char *name, int sz, int align, + enum btf_field_type field_type, + struct btf_field_info *info, int info_cnt) { const struct btf_var_secinfo *vsi; - u32 i, off = -ENOENT; + struct btf_field_info tmp; + int ret, idx = 0; + u32 i, off; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); - if (!__btf_type_is_struct(var_type)) - continue; - if (var_type->size != sz) + off = vsi->offset; + + if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) continue; if (vsi->size != sz) continue; - if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) - continue; - if (off != -ENOENT) - /* only one such field is allowed */ - return -E2BIG; - off = vsi->offset; if (off % align) return -EINVAL; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + case BTF_FIELD_TIMER: + ret = btf_find_struct(btf, var_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + case BTF_FIELD_KPTR: + ret = btf_find_kptr(btf, var_type, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; + default: + return -EFAULT; + } + + if (ret == BTF_FIELD_IGNORE) + continue; + if (idx >= info_cnt) + return -E2BIG; + ++idx; } - return off; + return idx; } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align) + enum btf_field_type field_type, + struct btf_field_info *info, int info_cnt) { + const char *name; + int sz, align; + + switch (field_type) { + case BTF_FIELD_SPIN_LOCK: + name = "bpf_spin_lock"; + sz = sizeof(struct bpf_spin_lock); + align = __alignof__(struct bpf_spin_lock); + break; + case BTF_FIELD_TIMER: + name = "bpf_timer"; + sz = sizeof(struct bpf_timer); + align = __alignof__(struct bpf_timer); + break; + case BTF_FIELD_KPTR: + name = NULL; + sz = sizeof(u64); + align = 8; + break; + default: + return -EFAULT; + } if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, name, sz, align); + return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, name, sz, align); + return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt); return -EINVAL; } @@ -3237,16 +3396,130 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t, */ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) { - return btf_find_field(btf, t, "bpf_spin_lock", - sizeof(struct bpf_spin_lock), - __alignof__(struct bpf_spin_lock)); + struct btf_field_info info; + int ret; + + ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1); + if (ret < 0) + return ret; + if (!ret) + return -ENOENT; + return info.off; } int btf_find_timer(const struct btf *btf, const struct btf_type *t) { - return btf_find_field(btf, t, "bpf_timer", - sizeof(struct bpf_timer), - __alignof__(struct bpf_timer)); + struct btf_field_info info; + int ret; + + ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1); + if (ret < 0) + return ret; + if (!ret) + return -ENOENT; + return info.off; +} + +struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, + const struct btf_type *t) +{ + struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX]; + struct bpf_map_value_off *tab; + struct btf *kernel_btf = NULL; + struct module *mod = NULL; + int ret, i, nr_off; + + ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); + if (ret < 0) + return ERR_PTR(ret); + if (!ret) + return NULL; + + nr_off = ret; + tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN); + if (!tab) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_off; i++) { + const struct btf_type *t; + s32 id; + + /* Find type in map BTF, and use it to look up the matching type + * in vmlinux or module BTFs, by name and kind. + */ + t = btf_type_by_id(btf, info_arr[i].type_id); + id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), + &kernel_btf); + if (id < 0) { + ret = id; + goto end; + } + + /* Find and stash the function pointer for the destruction function that + * needs to be eventually invoked from the map free path. + */ + if (info_arr[i].type == BPF_KPTR_REF) { + const struct btf_type *dtor_func; + const char *dtor_func_name; + unsigned long addr; + s32 dtor_btf_id; + + /* This call also serves as a whitelist of allowed objects that + * can be used as a referenced pointer and be stored in a map at + * the same time. + */ + dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + if (dtor_btf_id < 0) { + ret = dtor_btf_id; + goto end_btf; + } + + dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + if (!dtor_func) { + ret = -ENOENT; + goto end_btf; + } + + if (btf_is_module(kernel_btf)) { + mod = btf_try_get_module(kernel_btf); + if (!mod) { + ret = -ENXIO; + goto end_btf; + } + } + + /* We already verified dtor_func to be btf_type_is_func + * in register_btf_id_dtor_kfuncs. + */ + dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + addr = kallsyms_lookup_name(dtor_func_name); + if (!addr) { + ret = -EINVAL; + goto end_mod; + } + tab->off[i].kptr.dtor = (void *)addr; + } + + tab->off[i].offset = info_arr[i].off; + tab->off[i].type = info_arr[i].type; + tab->off[i].kptr.btf_id = id; + tab->off[i].kptr.btf = kernel_btf; + tab->off[i].kptr.module = mod; + } + tab->nr_off = nr_off; + return tab; +end_mod: + module_put(mod); +end_btf: + btf_put(kernel_btf); +end: + while (i--) { + btf_put(tab->off[i].kptr.btf); + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); + } + kfree(tab); + return ERR_PTR(ret); } static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, @@ -3405,6 +3678,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, { const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; + const char *fmt_str; u16 i, nr_enums; u32 meta_needed; @@ -3418,11 +3692,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - if (t->size > 8 || !is_power_of_2(t->size)) { btf_verifier_log_type(env, t, "Unexpected size"); return -EINVAL; @@ -3453,7 +3722,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, if (env->log.level == BPF_LOG_KERNEL) continue; - btf_verifier_log(env, "\t%s val=%d\n", + fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n"; + btf_verifier_log(env, fmt_str, __btf_name_by_offset(btf, enums[i].name_off), enums[i].val); } @@ -3494,7 +3764,10 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t, return; } - btf_show_type_value(show, "%d", v); + if (btf_type_kflag(t)) + btf_show_type_value(show, "%d", v); + else + btf_show_type_value(show, "%u", v); btf_show_end_type(show); } @@ -3507,6 +3780,109 @@ static struct btf_kind_operations enum_ops = { .show = btf_enum_show, }; +static s32 btf_enum64_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + struct btf *btf = env->btf; + const char *fmt_str; + u16 i, nr_enums; + u32 meta_needed; + + nr_enums = btf_type_vlen(t); + meta_needed = nr_enums * sizeof(*enums); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->size > 8 || !is_power_of_2(t->size)) { + btf_verifier_log_type(env, t, "Unexpected size"); + return -EINVAL; + } + + /* enum type either no name or a valid one */ + if (t->name_off && + !btf_name_valid_identifier(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for (i = 0; i < nr_enums; i++) { + if (!btf_name_offset_valid(btf, enums[i].name_off)) { + btf_verifier_log(env, "\tInvalid name_offset:%u", + enums[i].name_off); + return -EINVAL; + } + + /* enum member must have a valid name */ + if (!enums[i].name_off || + !btf_name_valid_identifier(btf, enums[i].name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + if (env->log.level == BPF_LOG_KERNEL) + continue; + + fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n"; + btf_verifier_log(env, fmt_str, + __btf_name_by_offset(btf, enums[i].name_off), + btf_enum64_value(enums + i)); + } + + return meta_needed; +} + +static void btf_enum64_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_enum64 *enums = btf_type_enum64(t); + u32 i, nr_enums = btf_type_vlen(t); + void *safe_data; + s64 v; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + v = *(u64 *)safe_data; + + for (i = 0; i < nr_enums; i++) { + if (v != btf_enum64_value(enums + i)) + continue; + + btf_show_type_value(show, "%s", + __btf_name_by_offset(btf, + enums[i].name_off)); + + btf_show_end_type(show); + return; + } + + if (btf_type_kflag(t)) + btf_show_type_value(show, "%lld", v); + else + btf_show_type_value(show, "%llu", v); + btf_show_end_type(show); +} + +static struct btf_kind_operations enum64_ops = { + .check_meta = btf_enum64_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_enum_check_member, + .check_kflag_member = btf_enum_check_kflag_member, + .log_details = btf_enum_log, + .show = btf_enum64_show, +}; + static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) @@ -4173,6 +4549,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, [BTF_KIND_TYPE_TAG] = &modifier_ops, + [BTF_KIND_ENUM64] = &enum64_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -4477,7 +4854,6 @@ static int btf_parse_hdr(struct btf_verifier_env *env) u32 hdr_len, hdr_copy, btf_data_size; const struct btf_header *hdr; struct btf *btf; - int err; btf = env->btf; btf_data_size = btf->data_size; @@ -4534,10 +4910,53 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return -EINVAL; } - err = btf_check_sec_info(env, btf_data_size); - if (err) - return err; + return btf_check_sec_info(env, btf_data_size); +} + +static int btf_check_type_tags(struct btf_verifier_env *env, + struct btf *btf, int start_id) +{ + int i, n, good_id = start_id - 1; + bool in_tags; + + n = btf_nr_types(btf); + for (i = start_id; i < n; i++) { + const struct btf_type *t; + int chain_limit = 32; + u32 cur_id = i; + + t = btf_type_by_id(btf, i); + if (!t) + return -EINVAL; + if (!btf_type_is_modifier(t)) + continue; + + cond_resched(); + in_tags = btf_type_is_type_tag(t); + while (btf_type_is_modifier(t)) { + if (!chain_limit--) { + btf_verifier_log(env, "Max chain length or cycle detected"); + return -ELOOP; + } + if (btf_type_is_type_tag(t)) { + if (!in_tags) { + btf_verifier_log(env, "Type tags don't precede modifiers"); + return -EINVAL; + } + } else if (in_tags) { + in_tags = false; + } + if (cur_id <= good_id) + break; + /* Move to next type */ + cur_id = t->type; + t = btf_type_by_id(btf, cur_id); + if (!t) + return -EINVAL; + } + good_id = i; + } return 0; } @@ -4608,6 +5027,10 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, if (err) goto errout; + err = btf_check_type_tags(env, btf, 1); + if (err) + goto errout; + if (log->level && bpf_verifier_log_full(log)) { err = -ENOSPC; goto errout; @@ -4716,41 +5139,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, return ctx_type; } -static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = { -#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) -#define BPF_LINK_TYPE(_id, _name) -#define BPF_MAP_TYPE(_id, _ops) \ - [_id] = &_ops, -#include <linux/bpf_types.h> -#undef BPF_PROG_TYPE -#undef BPF_LINK_TYPE -#undef BPF_MAP_TYPE -}; - -static int btf_vmlinux_map_ids_init(const struct btf *btf, - struct bpf_verifier_log *log) -{ - const struct bpf_map_ops *ops; - int i, btf_id; - - for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) { - ops = btf_vmlinux_map_ops[i]; - if (!ops || (!ops->map_btf_name && !ops->map_btf_id)) - continue; - if (!ops->map_btf_name || !ops->map_btf_id) { - bpf_log(log, "map type %d is misconfigured\n", i); - return -EINVAL; - } - btf_id = btf_find_by_name_kind(btf, ops->map_btf_name, - BTF_KIND_STRUCT); - if (btf_id < 0) - return btf_id; - *ops->map_btf_id = btf_id; - } - - return 0; -} - static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, @@ -4809,14 +5197,13 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; + err = btf_check_type_tags(env, btf, 1); + if (err) + goto errout; + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]); - /* find bpf map structs for map_ptr access checking */ - err = btf_vmlinux_map_ids_init(btf, log); - if (err < 0) - goto errout; - bpf_struct_ops_init(btf, log); refcount_set(&btf->refcnt, 1); @@ -4894,6 +5281,10 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u if (err) goto errout; + err = btf_check_type_tags(env, btf, btf_nr_types(base_btf)); + if (err) + goto errout; + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; @@ -4932,6 +5323,34 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t) return btf_type_is_int(t); } +static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, + int off) +{ + const struct btf_param *args; + const struct btf_type *t; + u32 offset = 0, nr_args; + int i; + + if (!func_proto) + return off / 8; + + nr_args = btf_type_vlen(func_proto); + args = (const struct btf_param *)(func_proto + 1); + for (i = 0; i < nr_args; i++) { + t = btf_type_skip_modifiers(btf, args[i].type, NULL); + offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8); + if (off < offset) + return i; + } + + t = btf_type_skip_modifiers(btf, func_proto->type, NULL); + offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8); + if (off < offset) + return nr_args; + + return nr_args + 1; +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -4951,7 +5370,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tname, off); return false; } - arg = off / 8; + arg = get_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. @@ -4971,6 +5390,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (arg == nr_args) { switch (prog->expected_attach_type) { + case BPF_LSM_CGROUP: case BPF_LSM_MAC: case BPF_TRACE_FEXIT: /* When LSM programs are attached to void LSM hooks @@ -5001,7 +5421,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", - btf_kind_str[BTF_INFO_KIND(t->info)]); + btf_type_str(t)); return false; } break; @@ -5020,7 +5440,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -5028,7 +5448,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n", tname, arg, __btf_name_by_offset(btf, t->name_off), - btf_kind_str[BTF_INFO_KIND(t->info)]); + btf_type_str(t)); return false; } @@ -5112,11 +5532,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (!btf_type_is_struct(t)) { bpf_log(log, "func '%s' arg%d type %s is not a struct\n", - tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]); + tname, arg, btf_type_str(t)); return false; } bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", - tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)], + tname, arg, info->btf_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); return true; } @@ -5429,7 +5849,8 @@ static bool btf_types_are_same(const struct btf *btf1, u32 id1, bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, - const struct btf *need_btf, u32 need_type_id) + const struct btf *need_btf, u32 need_type_id, + bool strict) { const struct btf_type *type; enum bpf_type_flag flag; @@ -5438,7 +5859,12 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log, /* Are we already done? */ if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) return true; - + /* In case of strict type match, we do not walk struct, the top level + * type match must succeed. When strict is true, off should have already + * been 0. + */ + if (strict) + return false; again: type = btf_type_by_id(btf, id); if (!type) @@ -5461,26 +5887,25 @@ again: } static int __get_type_size(struct btf *btf, u32 btf_id, - const struct btf_type **bad_type) + const struct btf_type **ret_type) { const struct btf_type *t; + *ret_type = btf_type_by_id(btf, 0); if (!btf_id) /* void */ return 0; t = btf_type_by_id(btf, btf_id); while (t && btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (!t) { - *bad_type = btf_type_by_id(btf, 0); + if (!t) return -EINVAL; - } + *ret_type = t; if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) return t->size; - *bad_type = t; return -EINVAL; } @@ -5499,8 +5924,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, /* BTF function prototype doesn't match the verifier types. * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args. */ - for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { m->arg_size[i] = 8; + m->arg_flags[i] = 0; + } m->ret_size = 8; m->nr_args = MAX_BPF_FUNC_REG_ARGS; return 0; @@ -5514,10 +5941,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } ret = __get_type_size(btf, func->type, &t); - if (ret < 0) { + if (ret < 0 || __btf_type_is_struct(t)) { bpf_log(log, "The function %s return type %s is unsupported.\n", - tname, btf_kind_str[BTF_INFO_KIND(t->info)]); + tname, btf_type_str(t)); return -EINVAL; } m->ret_size = ret; @@ -5530,10 +5957,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } ret = __get_type_size(btf, args[i].type, &t); - if (ret < 0) { + + /* No support of struct argument size greater than 16 bytes */ + if (ret < 0 || ret > 16) { bpf_log(log, "The function %s arg%d type %s is unsupported.\n", - tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]); + tname, i, btf_type_str(t)); return -EINVAL; } if (ret == 0) { @@ -5543,6 +5972,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } m->arg_size[i] = ret; + m->arg_flags[i] = __btf_type_is_struct(t) ? BTF_FMODEL_STRUCT_ARG : 0; } m->nr_args = nargs; return 0; @@ -5626,7 +6056,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log, * to context only. And only global functions can be replaced. * Hence type check only those types. */ - if (btf_type_is_int(t1) || btf_type_is_enum(t1)) + if (btf_type_is_int(t1) || btf_is_any_enum(t1)) continue; if (!btf_type_is_ptr(t1)) { bpf_log(log, @@ -5764,11 +6194,41 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf, return true; } +static bool btf_is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg, + const char *name) +{ + int len, target_len = strlen(name); + const struct btf_type *t; + const char *param_name; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len != target_len) + return false; + if (strcmp(param_name, name)) + return false; + + return true; +} + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - bool ptr_to_mem_ok) + bool ptr_to_mem_ok, + struct bpf_kfunc_arg_meta *kfunc_meta, + bool processing_call) { + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + bool rel = false, kptr_get = false, trusted_args = false; + bool sleepable = false; struct bpf_verifier_log *log = &env->log; u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); @@ -5776,7 +6236,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf_type *t, *ref_t; const struct btf_param *args; int ref_regno = 0, ret; - bool rel = false; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { @@ -5802,19 +6261,57 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - /* Only kfunc can be release func */ - if (is_kfunc) - rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_RELEASE, func_id); + if (is_kfunc && kfunc_meta) { + /* Only kfunc can be release func */ + rel = kfunc_meta->flags & KF_RELEASE; + kptr_get = kfunc_meta->flags & KF_KPTR_GET; + trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS; + sleepable = kfunc_meta->flags & KF_SLEEPABLE; + } + /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { + enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; + bool obj_ptr = false; t = btf_type_skip_modifiers(btf, args[i].type, NULL); if (btf_type_is_scalar(t)) { + if (is_kfunc && kfunc_meta) { + bool is_buf_size = false; + + /* check for any const scalar parameter of name "rdonly_buf_size" + * or "rdwr_buf_size" + */ + if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, + "rdonly_buf_size")) { + kfunc_meta->r0_rdonly = true; + is_buf_size = true; + } else if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, + "rdwr_buf_size")) + is_buf_size = true; + + if (is_buf_size) { + if (kfunc_meta->r0_size) { + bpf_log(log, "2 or more rdonly/rdwr_buf_size parameters for kfunc"); + return -EINVAL; + } + + if (!tnum_is_const(reg->var_off)) { + bpf_log(log, "R%d is not a const\n", regno); + return -EINVAL; + } + + kfunc_meta->r0_size = reg->var_off.value; + ret = mark_chain_precision(env, regno); + if (ret) + return ret; + } + } + if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", regno); @@ -5827,15 +6324,88 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + /* These register types have special constraints wrt ref_obj_id + * and offset checks. The rest of trusted args don't. + */ + obj_ptr = reg->type == PTR_TO_CTX || reg->type == PTR_TO_BTF_ID || + reg2btf_ids[base_type(reg->type)]; + + /* Check if argument must be a referenced pointer, args + i has + * been verified to be a pointer (after skipping modifiers). + * PTR_TO_CTX is ok without having non-zero ref_obj_id. + */ + if (is_kfunc && trusted_args && (obj_ptr && reg->type != PTR_TO_CTX) && !reg->ref_obj_id) { + bpf_log(log, "R%d must be referenced\n", regno); + return -EINVAL; + } + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel); + /* Trusted args have the same offset checks as release arguments */ + if ((trusted_args && obj_ptr) || (rel && reg->ref_obj_id)) + arg_type |= OBJ_RELEASE; + ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) return ret; - if (btf_get_prog_ctx_type(log, btf, t, - env->prog->type, i)) { + if (is_kfunc && reg->ref_obj_id) { + /* Ensure only one argument is referenced PTR_TO_BTF_ID */ + if (ref_obj_id) { + bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, ref_obj_id); + return -EFAULT; + } + ref_regno = regno; + ref_obj_id = reg->ref_obj_id; + } + + /* kptr_get is only true for kfunc */ + if (i == 0 && kptr_get) { + struct bpf_map_value_off_desc *off_desc; + + if (reg->type != PTR_TO_MAP_VALUE) { + bpf_log(log, "arg#0 expected pointer to map value\n"); + return -EINVAL; + } + + /* check_func_arg_reg_off allows var_off for + * PTR_TO_MAP_VALUE, but we need fixed offset to find + * off_desc. + */ + if (!tnum_is_const(reg->var_off)) { + bpf_log(log, "arg#0 must have constant offset\n"); + return -EINVAL; + } + + off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value); + if (!off_desc || off_desc->type != BPF_KPTR_REF) { + bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n", + reg->off + reg->var_off.value); + return -EINVAL; + } + + if (!btf_type_is_ptr(ref_t)) { + bpf_log(log, "arg#0 BTF type must be a double pointer\n"); + return -EINVAL; + } + + ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + + if (!btf_type_is_struct(ref_t)) { + bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", + func_name, i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf, + off_desc->kptr.btf_id, true)) { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n", + func_name, i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + /* rest of the arguments can be anything, like normal kfunc */ + } else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ @@ -5862,20 +6432,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; - /* Ensure only one argument is referenced - * PTR_TO_BTF_ID, check_func_arg_reg_off relies - * on only one referenced register being allowed - * for kfuncs. - */ - if (reg->ref_obj_id) { - if (ref_obj_id) { - bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, ref_obj_id); - return -EFAULT; - } - ref_regno = regno; - ref_obj_id = reg->ref_obj_id; - } } else { reg_btf = btf_vmlinux; reg_ref_id = *reg2btf_ids[base_type(reg->type)]; @@ -5886,7 +6442,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, - reg->off, btf, ref_id)) { + reg->off, btf, ref_id, + trusted_args || (rel && reg->ref_obj_id))) { bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", func_name, i, btf_type_str(ref_t), ref_tname, @@ -5894,21 +6451,26 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname); return -EINVAL; } - } else if (ptr_to_mem_ok) { + } else if (ptr_to_mem_ok && processing_call) { const struct btf_type *resolve_ret; u32 type_size; if (is_kfunc) { bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], ®s[regno + 1]); + bool arg_dynptr = btf_type_is_struct(ref_t) && + !strcmp(ref_tname, + stringify_struct(bpf_dynptr_kern)); /* Permit pointer to mem, but only when argument * type is pointer to scalar, or struct composed * (recursively) of scalars. * When arg_mem_size is true, the pointer can be * void *. + * Also permit initialized local dynamic pointers. */ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && + !arg_dynptr && (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { bpf_log(log, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", @@ -5916,6 +6478,34 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + if (arg_dynptr) { + if (reg->type != PTR_TO_STACK) { + bpf_log(log, "arg#%d pointer type %s %s not to stack\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (!is_dynptr_reg_valid_init(env, reg)) { + bpf_log(log, + "arg#%d pointer type %s %s must be valid and initialized\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (!is_dynptr_type_expected(env, reg, + ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) { + bpf_log(log, + "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n", + i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + continue; + } + /* Check for mem, len pair */ if (arg_mem_size) { if (check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1)) { @@ -5958,11 +6548,21 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, func_name); return -EINVAL; } + + if (sleepable && !env->prog->aux->sleepable) { + bpf_log(log, "kernel function %s is sleepable but the program is not\n", + func_name); + return -EINVAL; + } + + if (kfunc_meta && ref_obj_id) + kfunc_meta->ref_obj_id = ref_obj_id; + /* returns argument register number > 0 in case of reference release kfunc */ return rel ? ref_regno : 0; } -/* Compare BTF of a function with given bpf_reg_state. +/* Compare BTF of a function declaration with given bpf_reg_state. * Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - there is a type mismatch or BTF is not available. @@ -5989,7 +6589,50 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, false); + + /* Compiler optimizations can remove arguments from static functions + * or mismatched type can be passed into a global function. + * In such cases mark the function as unreliable from BTF point of view. + */ + if (err) + prog->aux->func_info_aux[subprog].unreliable = true; + return err; +} + +/* Compare BTF of a function call with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + * + * NOTE: the code is duplicated from btf_check_subprog_arg_match() + * because btf_check_func_arg_match() is still doing both. Once that + * function is split in 2, we can call from here btf_check_subprog_arg_match() + * first, and then treat the calling part in a new code path. + */ +int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs) +{ + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + bool is_global; + u32 btf_id; + int err; + + if (!prog->aux->func_info) + return -EINVAL; + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) + return -EFAULT; + + if (prog->aux->func_info_aux[subprog].unreliable) + return -EINVAL; + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, true); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6002,9 +6645,10 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs) + struct bpf_reg_state *regs, + struct bpf_kfunc_arg_meta *meta) { - return btf_check_func_arg_match(env, btf, func_id, regs, true); + return btf_check_func_arg_match(env, btf, func_id, regs, true, meta, true); } /* Convert BTF of a function into bpf_reg_state if possible @@ -6076,7 +6720,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (!btf_type_is_int(t) && !btf_type_is_enum(t)) { + if (!btf_type_is_int(t) && !btf_is_any_enum(t)) { bpf_log(log, "Global function %s() doesn't return scalar. Only those are supported.\n", tname); @@ -6091,7 +6735,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) { + if (btf_type_is_int(t) || btf_is_any_enum(t)) { reg->type = SCALAR_VALUE; continue; } @@ -6118,7 +6762,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, continue; } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", - i, btf_kind_str[BTF_INFO_KIND(t->info)], tname); + i, btf_type_str(t), tname); return -EINVAL; } return 0; @@ -6181,7 +6825,7 @@ static void btf_snprintf_show(struct btf_show *show, const char *fmt, if (len < 0) { ssnprintf->len_left = 0; ssnprintf->len = len; - } else if (len > ssnprintf->len_left) { + } else if (len >= ssnprintf->len_left) { /* no space, drive on to get length we would have written */ ssnprintf->len_left = 0; ssnprintf->len += len; @@ -6401,6 +7045,11 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } +static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) +{ + return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); +} + enum { BTF_MODULE_F_LIVE = (1 << 0), }; @@ -6649,16 +7298,16 @@ BTF_TRACING_TYPE_xxx /* Kernel Function (kfunc) BTF ID set registration API */ -static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, - enum btf_kfunc_type type, - struct btf_id_set *add_set, bool vmlinux_set) +static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, + struct btf_id_set8 *add_set) { + bool vmlinux_set = !btf_is_module(btf); struct btf_kfunc_set_tab *tab; - struct btf_id_set *set; + struct btf_id_set8 *set; u32 set_cnt; int ret; - if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) { + if (hook >= BTF_KFUNC_HOOK_MAX) { ret = -EINVAL; goto end; } @@ -6674,7 +7323,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, btf->kfunc_set_tab = tab; } - set = tab->sets[hook][type]; + set = tab->sets[hook]; /* Warn when register_btf_kfunc_id_set is called twice for the same hook * for module sets. */ @@ -6688,7 +7337,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, * pointer and return. */ if (!vmlinux_set) { - tab->sets[hook][type] = add_set; + tab->sets[hook] = add_set; return 0; } @@ -6697,7 +7346,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, * and concatenate all individual sets being registered. While each set * is individually sorted, they may become unsorted when concatenated, * hence re-sorting the final set again is required to make binary - * searching the set using btf_id_set_contains function work. + * searching the set using btf_id_set8_contains function work. */ set_cnt = set ? set->cnt : 0; @@ -6712,8 +7361,8 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, } /* Grow set */ - set = krealloc(tab->sets[hook][type], - offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]), + set = krealloc(tab->sets[hook], + offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]), GFP_KERNEL | __GFP_NOWARN); if (!set) { ret = -ENOMEM; @@ -6721,15 +7370,15 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, } /* For newly allocated set, initialize set->cnt to 0 */ - if (!tab->sets[hook][type]) + if (!tab->sets[hook]) set->cnt = 0; - tab->sets[hook][type] = set; + tab->sets[hook] = set; /* Concatenate the two sets */ - memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0])); + memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0])); set->cnt += add_set->cnt; - sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL); + sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL); return 0; end: @@ -6737,38 +7386,25 @@ end: return ret; } -static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, - const struct btf_kfunc_id_set *kset) -{ - bool vmlinux_set = !btf_is_module(btf); - int type, ret = 0; - - for (type = 0; type < ARRAY_SIZE(kset->sets); type++) { - if (!kset->sets[type]) - continue; - - ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set); - if (ret) - break; - } - return ret; -} - -static bool __btf_kfunc_id_set_contains(const struct btf *btf, +static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, enum btf_kfunc_hook hook, - enum btf_kfunc_type type, u32 kfunc_btf_id) { - struct btf_id_set *set; + struct btf_id_set8 *set; + u32 *id; - if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) - return false; + if (hook >= BTF_KFUNC_HOOK_MAX) + return NULL; if (!btf->kfunc_set_tab) - return false; - set = btf->kfunc_set_tab->sets[hook][type]; + return NULL; + set = btf->kfunc_set_tab->sets[hook]; if (!set) - return false; - return btf_id_set_contains(set, kfunc_btf_id); + return NULL; + id = btf_id_set8_contains(set, kfunc_btf_id); + if (!id) + return NULL; + /* The flags for BTF ID are located next to it */ + return id + 1; } static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) @@ -6780,6 +7416,11 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_TC; case BPF_PROG_TYPE_STRUCT_OPS: return BTF_KFUNC_HOOK_STRUCT_OPS; + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + return BTF_KFUNC_HOOK_TRACING; + case BPF_PROG_TYPE_SYSCALL: + return BTF_KFUNC_HOOK_SYSCALL; default: return BTF_KFUNC_HOOK_MAX; } @@ -6792,14 +7433,14 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) * keeping the reference for the duration of the call provides the necessary * protection for looking up a well-formed btf->kfunc_set_tab. */ -bool btf_kfunc_id_set_contains(const struct btf *btf, +u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum bpf_prog_type prog_type, - enum btf_kfunc_type type, u32 kfunc_btf_id) + u32 kfunc_btf_id) { enum btf_kfunc_hook hook; hook = bpf_prog_type_to_kfunc_hook(prog_type); - return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id); + return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } /* This function must be invoked only from initcalls/module init functions */ @@ -6826,103 +7467,155 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, return PTR_ERR(btf); hook = bpf_prog_type_to_kfunc_hook(prog_type); - ret = btf_populate_kfunc_set(btf, hook, kset); + ret = btf_populate_kfunc_set(btf, hook, kset->set); btf_put(btf); return ret; } EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); -#define MAX_TYPES_ARE_COMPAT_DEPTH 2 +s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) +{ + struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; + struct btf_id_dtor_kfunc *dtor; + + if (!tab) + return -ENOENT; + /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need + * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func. + */ + BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0); + dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func); + if (!dtor) + return -ENOENT; + return dtor->kfunc_btf_id; +} -static -int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, - const struct btf *targ_btf, __u32 targ_id, - int level) +static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt) { - const struct btf_type *local_type, *targ_type; - int depth = 32; /* max recursion depth */ + const struct btf_type *dtor_func, *dtor_func_proto, *t; + const struct btf_param *args; + s32 dtor_btf_id; + u32 nr_args, i; - /* caller made sure that names match (ignoring flavor suffix) */ - local_type = btf_type_by_id(local_btf, local_id); - targ_type = btf_type_by_id(targ_btf, targ_id); - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; + for (i = 0; i < cnt; i++) { + dtor_btf_id = dtors[i].kfunc_btf_id; -recur: - depth--; - if (depth < 0) - return -EINVAL; + dtor_func = btf_type_by_id(btf, dtor_btf_id); + if (!dtor_func || !btf_type_is_func(dtor_func)) + return -EINVAL; - local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id); - targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id); - if (!local_type || !targ_type) - return -EINVAL; + dtor_func_proto = btf_type_by_id(btf, dtor_func->type); + if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto)) + return -EINVAL; - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; + /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */ + t = btf_type_by_id(btf, dtor_func_proto->type); + if (!t || !btf_type_is_void(t)) + return -EINVAL; - switch (btf_kind(local_type)) { - case BTF_KIND_UNKN: - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - case BTF_KIND_ENUM: - case BTF_KIND_FWD: - return 1; - case BTF_KIND_INT: - /* just reject deprecated bitfield-like integers; all other - * integers are by default compatible between each other + nr_args = btf_type_vlen(dtor_func_proto); + if (nr_args != 1) + return -EINVAL; + args = btf_params(dtor_func_proto); + t = btf_type_by_id(btf, args[0].type); + /* Allow any pointer type, as width on targets Linux supports + * will be same for all pointer types (i.e. sizeof(void *)) */ - return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; - case BTF_KIND_PTR: - local_id = local_type->type; - targ_id = targ_type->type; - goto recur; - case BTF_KIND_ARRAY: - local_id = btf_array(local_type)->type; - targ_id = btf_array(targ_type)->type; - goto recur; - case BTF_KIND_FUNC_PROTO: { - struct btf_param *local_p = btf_params(local_type); - struct btf_param *targ_p = btf_params(targ_type); - __u16 local_vlen = btf_vlen(local_type); - __u16 targ_vlen = btf_vlen(targ_type); - int i, err; - - if (local_vlen != targ_vlen) - return 0; - - for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { - if (level <= 0) - return -EINVAL; + if (!t || !btf_type_is_ptr(t)) + return -EINVAL; + } + return 0; +} - btf_type_skip_modifiers(local_btf, local_p->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id); - err = __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, - level - 1); - if (err <= 0) - return err; +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, + struct module *owner) +{ + struct btf_id_dtor_kfunc_tab *tab; + struct btf *btf; + u32 tab_cnt; + int ret; + + btf = btf_get_module_btf(owner); + if (!btf) { + if (!owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + pr_err("missing vmlinux BTF, cannot register dtor kfuncs\n"); + return -ENOENT; + } + if (owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) { + pr_err("missing module BTF, cannot register dtor kfuncs\n"); + return -ENOENT; } + return 0; + } + if (IS_ERR(btf)) + return PTR_ERR(btf); - /* tail recurse for return type check */ - btf_type_skip_modifiers(local_btf, local_type->type, &local_id); - btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id); - goto recur; + if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { + pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); + ret = -E2BIG; + goto end; } - default: - return 0; + + /* Ensure that the prototype of dtor kfuncs being registered is sane */ + ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt); + if (ret < 0) + goto end; + + tab = btf->dtor_kfunc_tab; + /* Only one call allowed for modules */ + if (WARN_ON_ONCE(tab && btf_is_module(btf))) { + ret = -EINVAL; + goto end; + } + + tab_cnt = tab ? tab->cnt : 0; + if (tab_cnt > U32_MAX - add_cnt) { + ret = -EOVERFLOW; + goto end; } + if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) { + pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT); + ret = -E2BIG; + goto end; + } + + tab = krealloc(btf->dtor_kfunc_tab, + offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]), + GFP_KERNEL | __GFP_NOWARN); + if (!tab) { + ret = -ENOMEM; + goto end; + } + + if (!btf->dtor_kfunc_tab) + tab->cnt = 0; + btf->dtor_kfunc_tab = tab; + + memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0])); + tab->cnt += add_cnt; + + sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); + + return 0; +end: + btf_free_dtor_kfunc_tab(btf); + btf_put(btf); + return ret; } +EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs); + +#define MAX_TYPES_ARE_COMPAT_DEPTH 2 /* Check local and target types for compatibility. This check is used for * type-based CO-RE relocations and follow slightly different rules than * field-based relocations. This function assumes that root types were already * checked for name match. Beyond that initial root-level name check, names * are completely ignored. Compatibility rules are as follows: - * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but * kind should match for local and target types (i.e., STRUCT is not * compatible with UNION); - * - for ENUMs, the size is ignored; + * - for ENUMs/ENUM64s, the size is ignored; * - for INT, size and signedness are ignored; * - for ARRAY, dimensionality is ignored, element types are checked for * compatibility recursively; @@ -6936,11 +7629,19 @@ recur: int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - return __bpf_core_types_are_compat(local_btf, local_id, - targ_btf, targ_id, + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, MAX_TYPES_ARE_COMPAT_DEPTH); } +#define MAX_TYPES_MATCH_DEPTH 2 + +int bpf_core_types_match(const struct btf *local_btf, u32 local_id, + const struct btf *targ_btf, u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, + MAX_TYPES_MATCH_DEPTH); +} + static bool bpf_core_is_flavor_sep(const char *s) { /* check X___Y name pattern, where X and Y are not underscores */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 128028efda64..bf2fdb33fb31 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -14,6 +14,8 @@ #include <linux/string.h> #include <linux/bpf.h> #include <linux/bpf-cgroup.h> +#include <linux/bpf_lsm.h> +#include <linux/bpf_verifier.h> #include <net/sock.h> #include <net/bpf_sk_storage.h> @@ -22,6 +24,171 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +/* __always_inline is necessary to prevent indirect call through run_prog + * function pointer. + */ +static __always_inline int +bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, + enum cgroup_bpf_attach_type atype, + const void *ctx, bpf_prog_run_fn run_prog, + int retval, u32 *ret_flags) +{ + const struct bpf_prog_array_item *item; + const struct bpf_prog *prog; + const struct bpf_prog_array *array; + struct bpf_run_ctx *old_run_ctx; + struct bpf_cg_run_ctx run_ctx; + u32 func_ret; + + run_ctx.retval = retval; + migrate_disable(); + rcu_read_lock(); + array = rcu_dereference(cgrp->effective[atype]); + item = &array->items[0]; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + while ((prog = READ_ONCE(item->prog))) { + run_ctx.prog_item = item; + func_ret = run_prog(prog, ctx); + if (ret_flags) { + *(ret_flags) |= (func_ret >> 1); + func_ret &= 1; + } + if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval)) + run_ctx.retval = -EPERM; + item++; + } + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + return run_ctx.retval; +} + +unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct sock *sk; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sk = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct socket *sock; + struct cgroup *cgrp; + int ret = 0; + u64 *args; + + args = (u64 *)ctx; + sock = (void *)(unsigned long)args[0]; + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, + const struct bpf_insn *insn) +{ + const struct bpf_prog *shim_prog; + struct cgroup *cgrp; + int ret = 0; + + /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ + shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); + + /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */ + cgrp = task_dfl_cgroup(current); + if (likely(cgrp)) + ret = bpf_prog_run_array_cg(&cgrp->bpf, + shim_prog->aux->cgroup_atype, + ctx, bpf_prog_run, 0, NULL); + return ret; +} + +#ifdef CONFIG_BPF_LSM +struct cgroup_lsm_atype { + u32 attach_btf_id; + int refcnt; +}; + +static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; + +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + int i; + + lockdep_assert_held(&cgroup_mutex); + + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + + for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) + if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id) + return CGROUP_LSM_START + i; + + for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) + if (cgroup_lsm_atype[i].attach_btf_id == 0) + return CGROUP_LSM_START + i; + + return -E2BIG; + +} + +void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) +{ + int i = cgroup_atype - CGROUP_LSM_START; + + lockdep_assert_held(&cgroup_mutex); + + WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id && + cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); + + cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + cgroup_lsm_atype[i].refcnt++; +} + +void bpf_cgroup_atype_put(int cgroup_atype) +{ + int i = cgroup_atype - CGROUP_LSM_START; + + mutex_lock(&cgroup_mutex); + if (--cgroup_lsm_atype[i].refcnt <= 0) + cgroup_lsm_atype[i].attach_btf_id = 0; + WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); + mutex_unlock(&cgroup_mutex); +} +#else +static enum cgroup_bpf_attach_type +bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) +{ + if (attach_type != BPF_LSM_CGROUP) + return to_cgroup_bpf_attach_type(attach_type); + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_LSM */ + void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); @@ -118,15 +285,22 @@ static void cgroup_bpf_release(struct work_struct *work) mutex_lock(&cgroup_mutex); for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { - struct list_head *progs = &cgrp->bpf.progs[atype]; - struct bpf_prog_list *pl, *pltmp; + struct hlist_head *progs = &cgrp->bpf.progs[atype]; + struct bpf_prog_list *pl; + struct hlist_node *pltmp; - list_for_each_entry_safe(pl, pltmp, progs, node) { - list_del(&pl->node); - if (pl->prog) + hlist_for_each_entry_safe(pl, pltmp, progs, node) { + hlist_del(&pl->node); + if (pl->prog) { + if (pl->prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->prog); bpf_prog_put(pl->prog); - if (pl->link) + } + if (pl->link) { + if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog); bpf_cgroup_link_auto_detach(pl->link); + } kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key[atype]); } @@ -178,12 +352,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. * it's slow but the list cannot be long */ -static u32 prog_list_length(struct list_head *head) +static u32 prog_list_length(struct hlist_head *head) { struct bpf_prog_list *pl; u32 cnt = 0; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; cnt++; @@ -252,7 +426,7 @@ static int compute_effective_progs(struct cgroup *cgrp, if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - list_for_each_entry(pl, &p->bpf.progs[atype], node) { + hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; @@ -303,7 +477,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) cgroup_bpf_get(p); for (i = 0; i < NR; i++) - INIT_LIST_HEAD(&cgrp->bpf.progs[i]); + INIT_HLIST_HEAD(&cgrp->bpf.progs[i]); INIT_LIST_HEAD(&cgrp->bpf.storages); @@ -379,7 +553,7 @@ cleanup: #define BPF_CGROUP_MAX_PROGS 64 -static struct bpf_prog_list *find_attach_entry(struct list_head *progs, +static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, struct bpf_prog *replace_prog, @@ -389,12 +563,12 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, /* single-attach case */ if (!allow_multi) { - if (list_empty(progs)) + if (hlist_empty(progs)) return NULL; - return list_first_entry(progs, typeof(*pl), node); + return hlist_entry(progs->first, typeof(*pl), node); } - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); @@ -405,7 +579,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, /* direct prog multi-attach w/ replacement case */ if (replace_prog) { - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->prog == replace_prog) /* a match found */ return pl; @@ -439,9 +613,10 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_prog *new_prog = prog ? : link->link.prog; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || @@ -455,7 +630,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - atype = to_cgroup_bpf_attach_type(type); + atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -464,7 +639,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) + if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -486,12 +661,22 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (pl) { old_prog = pl->prog; } else { + struct hlist_node *last = NULL; + pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { bpf_cgroup_storages_free(new_storage); return -ENOMEM; } - list_add_tail(&pl->node, progs); + if (hlist_empty(progs)) + hlist_add_head(&pl->node, progs); + else + hlist_for_each(last, progs) { + if (last->next) + continue; + hlist_add_behind(&pl->node, last); + break; + } } pl->prog = prog; @@ -499,17 +684,30 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; + if (type == BPF_LSM_CGROUP) { + err = bpf_trampoline_link_cgroup_shim(new_prog, atype); + if (err) + goto cleanup; + } + err = update_effective_progs(cgrp, atype); if (err) - goto cleanup; + goto cleanup_trampoline; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); - else + } else { static_branch_inc(&cgroup_bpf_enabled_key[atype]); + } bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; +cleanup_trampoline: + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(new_prog); + cleanup: if (old_prog) { pl->prog = old_prog; @@ -517,7 +715,7 @@ cleanup: } bpf_cgroup_storages_free(new_storage); if (!old_prog) { - list_del(&pl->node); + hlist_del(&pl->node); kfree(pl); } return err; @@ -548,7 +746,7 @@ static void replace_effective_prog(struct cgroup *cgrp, struct cgroup_subsys_state *css; struct bpf_prog_array *progs; struct bpf_prog_list *pl; - struct list_head *head; + struct hlist_head *head; struct cgroup *cg; int pos; @@ -564,7 +762,7 @@ static void replace_effective_prog(struct cgroup *cgrp, continue; head = &cg->bpf.progs[atype]; - list_for_each_entry(pl, head, node) { + hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; if (pl->link == link) @@ -598,10 +796,10 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; bool found = false; - atype = to_cgroup_bpf_attach_type(link->type); + atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -610,7 +808,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, if (link->link.prog->type != new_prog->type) return -EINVAL; - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->link == link) { found = true; break; @@ -649,7 +847,7 @@ out_unlock: return ret; } -static struct bpf_prog_list *find_detach_entry(struct list_head *progs, +static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, bool allow_multi) @@ -657,14 +855,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog_list *pl; if (!allow_multi) { - if (list_empty(progs)) + if (hlist_empty(progs)) /* report error when trying to detach and nothing is attached */ return ERR_PTR(-ENOENT); /* to maintain backward compatibility NONE and OVERRIDE cgroups * allow detaching with invalid FD (prog==NULL) in legacy mode */ - return list_first_entry(progs, typeof(*pl), node); + return hlist_entry(progs->first, typeof(*pl), node); } if (!prog && !link) @@ -674,7 +872,7 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, return ERR_PTR(-EINVAL); /* find the prog or link and detach it */ - list_for_each_entry(pl, progs, node) { + hlist_for_each_entry(pl, progs, node) { if (pl->prog == prog && pl->link == link) return pl; } @@ -682,6 +880,62 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, } /** + * purge_effective_progs() - After compute_effective_progs fails to alloc new + * cgrp->bpf.inactive table we can recover by + * recomputing the array in place. + * + * @cgrp: The cgroup which descendants to travers + * @prog: A program to detach or NULL + * @link: A link to detach or NULL + * @atype: Type of detach operation + */ +static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_cgroup_link *link, + enum cgroup_bpf_attach_type atype) +{ + struct cgroup_subsys_state *css; + struct bpf_prog_array *progs; + struct bpf_prog_list *pl; + struct hlist_head *head; + struct cgroup *cg; + int pos; + + /* recompute effective prog array in place */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + + /* find position of link or prog in effective progs array */ + for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { + if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) + continue; + + head = &cg->bpf.progs[atype]; + hlist_for_each_entry(pl, head, node) { + if (!prog_list_prog(pl)) + continue; + if (pl->prog == prog && pl->link == link) + goto found; + pos++; + } + } + + /* no link or prog match, skip the cgroup of this layer */ + continue; +found: + progs = rcu_dereference_protected( + desc->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + + /* Remove the program from the array */ + WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos), + "Failed to purge a prog from array at index %d", pos); + } +} + +/** * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse @@ -698,11 +952,16 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; - struct list_head *progs; + struct hlist_head *progs; + u32 attach_btf_id = 0; u32 flags; - int err; - atype = to_cgroup_bpf_attach_type(type); + if (prog) + attach_btf_id = prog->aux->attach_btf_id; + if (link) + attach_btf_id = link->link.prog->aux->attach_btf_id; + + atype = bpf_cgroup_atype_find(type, attach_btf_id); if (atype < 0) return -EINVAL; @@ -722,26 +981,27 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; pl->link = NULL; - err = update_effective_progs(cgrp, atype); - if (err) - goto cleanup; + if (update_effective_progs(cgrp, atype)) { + /* if update effective array failed replace the prog with a dummy prog*/ + pl->prog = old_prog; + pl->link = link; + purge_effective_progs(cgrp, old_prog, link, atype); + } /* now can actually delete it from this cgroup list */ - list_del(&pl->node); + hlist_del(&pl->node); + kfree(pl); - if (list_empty(progs)) + if (hlist_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[atype] = 0; - if (old_prog) + if (old_prog) { + if (type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(old_prog); bpf_prog_put(old_prog); + } static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; - -cleanup: - /* restore back prog or link */ - pl->prog = old_prog; - pl->link = link; - return err; } static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, @@ -759,57 +1019,98 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { + __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); + bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; + enum cgroup_bpf_attach_type from_atype, to_atype; enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; - struct list_head *progs; - struct bpf_prog *prog; int cnt, ret = 0, i; + int total_cnt = 0; u32 flags; - atype = to_cgroup_bpf_attach_type(type); - if (atype < 0) + if (effective_query && prog_attach_flags) return -EINVAL; - progs = &cgrp->bpf.progs[atype]; - flags = cgrp->bpf.flags[atype]; + if (type == BPF_LSM_CGROUP) { + if (!effective_query && attr->query.prog_cnt && + prog_ids && !prog_attach_flags) + return -EINVAL; - effective = rcu_dereference_protected(cgrp->bpf.effective[atype], - lockdep_is_held(&cgroup_mutex)); + from_atype = CGROUP_LSM_START; + to_atype = CGROUP_LSM_END; + flags = 0; + } else { + from_atype = to_cgroup_bpf_attach_type(type); + if (from_atype < 0) + return -EINVAL; + to_atype = from_atype; + flags = cgrp->bpf.flags[from_atype]; + } - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(effective); - else - cnt = prog_list_length(progs); + for (atype = from_atype; atype <= to_atype; atype++) { + if (effective_query) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + total_cnt += bpf_prog_array_length(effective); + } else { + total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + } + } + /* always output uattr->query.attach_flags as 0 during effective query */ + flags = effective_query ? 0 : flags; if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) return -EFAULT; - if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ return 0; - if (attr->query.prog_cnt < cnt) { - cnt = attr->query.prog_cnt; + + if (attr->query.prog_cnt < total_cnt) { + total_cnt = attr->query.prog_cnt; ret = -ENOSPC; } - if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); - } else { - struct bpf_prog_list *pl; - u32 id; - - i = 0; - list_for_each_entry(pl, progs, node) { - prog = prog_list_prog(pl); - id = prog->aux->id; - if (copy_to_user(prog_ids + i, &id, sizeof(id))) - return -EFAULT; - if (++i == cnt) - break; + for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { + if (effective_query) { + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); + ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt); + } else { + struct hlist_head *progs; + struct bpf_prog_list *pl; + struct bpf_prog *prog; + u32 id; + + progs = &cgrp->bpf.progs[atype]; + cnt = min_t(int, prog_list_length(progs), total_cnt); + i = 0; + hlist_for_each_entry(pl, progs, node) { + prog = prog_list_prog(pl); + id = prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + + if (prog_attach_flags) { + flags = cgrp->bpf.flags[atype]; + + for (i = 0; i < cnt; i++) + if (copy_to_user(prog_attach_flags + i, + &flags, sizeof(flags))) + return -EFAULT; + prog_attach_flags += cnt; + } } + + prog_ids += cnt; + total_cnt -= cnt; } return ret; } @@ -898,6 +1199,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + if (cg_link->type == BPF_LSM_CGROUP) + bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; cg_link->cgroup = NULL; @@ -1075,11 +1378,38 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, bpf_compute_and_save_data_end(skb, &saved_data_end); if (atype == CGROUP_INET_EGRESS) { - ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( - cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); + u32 flags = 0; + bool cn; + + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb, + __bpf_prog_run_save_cb, 0, &flags); + + /* Return values of CGROUP EGRESS BPF programs are: + * 0: drop packet + * 1: keep packet + * 2: drop packet and cn + * 3: keep packet and cn + * + * The returned value is then converted to one of the NET_XMIT + * or an error code that is then interpreted as drop packet + * (and no cn): + * 0: NET_XMIT_SUCCESS skb should be transmitted + * 1: NET_XMIT_DROP skb should be dropped and cn + * 2: NET_XMIT_CN skb should be transmitted and cn + * 3: -err skb should be dropped + */ + + cn = flags & BPF_RET_SET_CN; + if (ret && !IS_ERR_VALUE((long)ret)) + ret = -EFAULT; + if (!ret) + ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); + else + ret = (cn ? NET_XMIT_DROP : ret); } else { - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, - __bpf_prog_run_save_cb, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, + skb, __bpf_prog_run_save_cb, 0, + NULL); if (ret && !IS_ERR_VALUE((long)ret)) ret = -EFAULT; } @@ -1109,8 +1439,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, - bpf_prog_run, 0); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0, + NULL); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1155,8 +1485,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0, flags); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, + 0, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1182,8 +1512,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, - bpf_prog_run, 0); + return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run, + 0, NULL); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1200,13 +1530,44 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, + NULL); rcu_read_unlock(); return ret; } +BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) +{ + /* flags argument is not used now, + * but provides an ability to extend the API. + * verifier checks that its value is correct. + */ + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; + void *ptr; + + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype]; + + if (stype == BPF_CGROUP_STORAGE_SHARED) + ptr = &READ_ONCE(storage->buf)->data[0]; + else + ptr = this_cpu_ptr(storage->percpu_buf); + + return (unsigned long)ptr; +} + +const struct bpf_func_proto bpf_get_local_storage_proto = { + .func = bpf_get_local_storage, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_0(bpf_get_retval) { struct bpf_cg_run_ctx *ctx = @@ -1215,7 +1576,7 @@ BPF_CALL_0(bpf_get_retval) return ctx->retval; } -static const struct bpf_func_proto bpf_get_retval_proto = { +const struct bpf_func_proto bpf_get_retval_proto = { .func = bpf_get_retval, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1230,7 +1591,7 @@ BPF_CALL_1(bpf_set_retval, int, retval) return 0; } -static const struct bpf_func_proto bpf_set_retval_proto = { +const struct bpf_func_proto bpf_set_retval_proto = { .func = bpf_set_retval, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1238,32 +1599,26 @@ static const struct bpf_func_proto bpf_set_retval_proto = { }; static const struct bpf_func_proto * -cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_retval: - return &bpf_get_retval_proto; - case BPF_FUNC_set_retval: - return &bpf_set_retval_proto; default: return bpf_base_func_proto(func_id); } } -static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - return cgroup_base_func_proto(func_id, prog); -} - static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -1366,8 +1721,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, + NULL); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1459,8 +1814,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], - &ctx, bpf_prog_run, 0); + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT, + &ctx, bpf_prog_run, 0, NULL); release_sock(sk); if (ret) @@ -1559,8 +1914,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run, retval); + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, + &ctx, bpf_prog_run, retval, NULL); release_sock(sk); if (ret < 0) @@ -1608,8 +1963,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run, retval); + ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, + &ctx, bpf_prog_run, retval, NULL); if (ret < 0) return ret; @@ -1776,11 +2131,17 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_strtol: - return &bpf_strtol_proto; - case BPF_FUNC_strtoul: - return &bpf_strtoul_proto; case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: @@ -1791,8 +2152,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_set_new_value_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return cgroup_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -1913,6 +2276,16 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = { static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_get_netns_cookie: @@ -1934,8 +2307,10 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return cgroup_base_func_proto(func_id, prog); + return bpf_base_func_proto(func_id); } } @@ -2100,3 +2475,69 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = { const struct bpf_prog_ops cg_sockopt_prog_ops = { }; + +/* Common helpers for cgroup hooks. */ +const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_local_storage: + return &bpf_get_local_storage_proto; + case BPF_FUNC_get_retval: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return NULL; + default: + return &bpf_get_retval_proto; + } + case BPF_FUNC_set_retval: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + return NULL; + default: + return &bpf_set_retval_proto; + } + default: + return NULL; + } +} + +/* Common helpers for cgroup hooks with valid process context. */ +const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif + default: + return NULL; + } +} diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c new file mode 100644 index 000000000000..0d200a993489 --- /dev/null +++ b/kernel/bpf/cgroup_iter.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Google */ +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/cgroup.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> + +#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ + +/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. + * + * 1. Walk the descendants of a cgroup in pre-order. + * 2. Walk the descendants of a cgroup in post-order. + * 3. Walk the ancestors of a cgroup. + * 4. Show the given cgroup only. + * + * For walking descendants, cgroup_iter can walk in either pre-order or + * post-order. For walking ancestors, the iter walks up from a cgroup to + * the root. + * + * The iter program can terminate the walk early by returning 1. Walk + * continues if prog returns 0. + * + * The prog can check (seq->num == 0) to determine whether this is + * the first element. The prog may also be passed a NULL cgroup, + * which means the walk has completed and the prog has a chance to + * do post-processing, such as outputting an epilogue. + * + * Note: the iter_prog is called with cgroup_mutex held. + * + * Currently only one session is supported, which means, depending on the + * volume of data bpf program intends to send to user space, the number + * of cgroups that can be walked is limited. For example, given the current + * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each + * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can + * be walked is 512. This is a limitation of cgroup_iter. If the output data + * is larger than the kernel buffer size, after all data in the kernel buffer + * is consumed by user space, the subsequent read() syscall will signal + * EOPNOTSUPP. In order to work around, the user may have to update their + * program to reduce the volume of data sent to output. For example, skip + * some uninteresting cgroups. + */ + +struct bpf_iter__cgroup { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct cgroup *, cgroup); +}; + +struct cgroup_iter_priv { + struct cgroup_subsys_state *start_css; + bool visited_all; + bool terminate; + int order; +}; + +static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_lock(&cgroup_mutex); + + /* cgroup_iter doesn't support read across multiple sessions. */ + if (*pos > 0) { + if (p->visited_all) + return NULL; + + /* Haven't visited all, but because cgroup_mutex has dropped, + * return -EOPNOTSUPP to indicate incomplete iteration. + */ + return ERR_PTR(-EOPNOTSUPP); + } + + ++*pos; + p->terminate = false; + p->visited_all = false; + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(NULL, p->start_css); + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) + return css_next_descendant_post(NULL, p->start_css); + else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ + return p->start_css; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop); + +static void cgroup_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct cgroup_iter_priv *p = seq->private; + + mutex_unlock(&cgroup_mutex); + + /* pass NULL to the prog for post-processing */ + if (!v) { + __cgroup_iter_seq_show(seq, NULL, true); + p->visited_all = true; + } +} + +static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; + struct cgroup_iter_priv *p = seq->private; + + ++*pos; + if (p->terminate) + return NULL; + + if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + return css_next_descendant_pre(curr, p->start_css); + else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) + return css_next_descendant_post(curr, p->start_css); + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) + return curr->parent; + else /* BPF_CGROUP_ITER_SELF_ONLY */ + return NULL; +} + +static int __cgroup_iter_seq_show(struct seq_file *seq, + struct cgroup_subsys_state *css, int in_stop) +{ + struct cgroup_iter_priv *p = seq->private; + struct bpf_iter__cgroup ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + /* cgroup is dead, skip this element */ + if (css && cgroup_is_dead(css->cgroup)) + return 0; + + ctx.meta = &meta; + ctx.cgroup = css ? css->cgroup : NULL; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + /* if prog returns > 0, terminate after this element. */ + if (ret != 0) + p->terminate = true; + + return 0; +} + +static int cgroup_iter_seq_show(struct seq_file *seq, void *v) +{ + return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v, + false); +} + +static const struct seq_operations cgroup_iter_seq_ops = { + .start = cgroup_iter_seq_start, + .next = cgroup_iter_seq_next, + .stop = cgroup_iter_seq_stop, + .show = cgroup_iter_seq_show, +}; + +BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup) + +static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) +{ + struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; + struct cgroup *cgrp = aux->cgroup.start; + + p->start_css = &cgrp->self; + p->terminate = false; + p->visited_all = false; + p->order = aux->cgroup.order; + return 0; +} + +static const struct bpf_iter_seq_info cgroup_iter_seq_info = { + .seq_ops = &cgroup_iter_seq_ops, + .init_seq_private = cgroup_iter_seq_init, + .seq_priv_size = sizeof(struct cgroup_iter_priv), +}; + +static int bpf_iter_attach_cgroup(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + int fd = linfo->cgroup.cgroup_fd; + u64 id = linfo->cgroup.cgroup_id; + int order = linfo->cgroup.order; + struct cgroup *cgrp; + + if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && + order != BPF_CGROUP_ITER_DESCENDANTS_POST && + order != BPF_CGROUP_ITER_ANCESTORS_UP && + order != BPF_CGROUP_ITER_SELF_ONLY) + return -EINVAL; + + if (fd && id) + return -EINVAL; + + if (fd) + cgrp = cgroup_get_from_fd(fd); + else if (id) + cgrp = cgroup_get_from_id(id); + else /* walk the entire hierarchy by default. */ + cgrp = cgroup_get_from_path("/"); + + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + aux->cgroup.start = cgrp; + aux->cgroup.order = order; + return 0; +} + +static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux) +{ + cgroup_put(aux->cgroup.start); +} + +static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + char *buf; + + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + seq_puts(seq, "cgroup_path:\t<unknown>\n"); + goto show_order; + } + + /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path + * will print nothing. + * + * Path is in the calling process's cgroup namespace. + */ + cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + seq_printf(seq, "cgroup_path:\t%s\n", buf); + kfree(buf); + +show_order: + if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE) + seq_puts(seq, "order: descendants_pre\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST) + seq_puts(seq, "order: descendants_post\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) + seq_puts(seq, "order: ancestors_up\n"); + else /* BPF_CGROUP_ITER_SELF_ONLY */ + seq_puts(seq, "order: self_only\n"); +} + +static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info) +{ + info->iter.cgroup.order = aux->cgroup.order; + info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start); + return 0; +} + +DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta, + struct cgroup *cgroup) + +static struct bpf_iter_reg bpf_cgroup_reg_info = { + .target = "cgroup", + .feature = BPF_ITER_RESCHED, + .attach_target = bpf_iter_attach_cgroup, + .detach_target = bpf_iter_detach_cgroup, + .show_fdinfo = bpf_iter_cgroup_show_fdinfo, + .fill_link_info = bpf_iter_cgroup_fill_link_info, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__cgroup, cgroup), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &cgroup_iter_seq_info, +}; + +static int __init bpf_cgroup_iter_init(void) +{ + bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0]; + return bpf_iter_reg_target(&bpf_cgroup_reg_info); +} + +late_initcall(bpf_cgroup_iter_init); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 13e9dbeeedf3..711fd293b6de 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -68,11 +68,13 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns { u8 *ptr = NULL; - if (k >= SKF_NET_OFF) + if (k >= SKF_NET_OFF) { ptr = skb_network_header(skb) + k - SKF_NET_OFF; - else if (k >= SKF_LL_OFF) + } else if (k >= SKF_LL_OFF) { + if (unlikely(!skb_mac_header_was_set(skb))) + return NULL; ptr = skb_mac_header(skb) + k - SKF_LL_OFF; - + } if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; @@ -107,6 +109,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); +#ifdef CONFIG_CGROUP_BPF + aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; +#endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); @@ -176,7 +181,7 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog) * here is relative to the prog itself instead of the main prog. * This array has one entry for each xlated bpf insn. * - * jited_off is the byte off to the last byte of the jited insn. + * jited_off is the byte off to the end of the jited insn. * * Hence, with * insn_start: @@ -647,12 +652,6 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) return fp->jited && !bpf_prog_was_classic(fp); } -static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) -{ - return list_empty(&fp->aux->ksym.lnode) || - fp->aux->ksym.lnode.prev == LIST_POISON2; -} - void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || @@ -826,17 +825,13 @@ struct bpf_prog_pack { unsigned long bitmap[]; }; -#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) - -static size_t bpf_prog_pack_size = -1; -static size_t bpf_prog_pack_mask = -1; - -static int bpf_prog_chunk_count(void) +void bpf_jit_fill_hole_with_zero(void *area, unsigned int size) { - WARN_ON_ONCE(bpf_prog_pack_size == -1); - return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE; + memset(area, 0, size); } +#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) + static DEFINE_MUTEX(pack_mutex); static LIST_HEAD(pack_list); @@ -844,58 +839,37 @@ static LIST_HEAD(pack_list); * CONFIG_MMU=n. Use PAGE_SIZE in these cases. */ #ifdef PMD_SIZE -#define BPF_HPAGE_SIZE PMD_SIZE -#define BPF_HPAGE_MASK PMD_MASK +#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes()) #else -#define BPF_HPAGE_SIZE PAGE_SIZE -#define BPF_HPAGE_MASK PAGE_MASK +#define BPF_PROG_PACK_SIZE PAGE_SIZE #endif -static size_t select_bpf_prog_pack_size(void) -{ - size_t size; - void *ptr; - - size = BPF_HPAGE_SIZE * num_online_nodes(); - ptr = module_alloc(size); +#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE) - /* Test whether we can get huge pages. If not just use PAGE_SIZE - * packs. - */ - if (!ptr || !is_vm_area_hugepages(ptr)) { - size = PAGE_SIZE; - bpf_prog_pack_mask = PAGE_MASK; - } else { - bpf_prog_pack_mask = BPF_HPAGE_MASK; - } - - vfree(ptr); - return size; -} - -static struct bpf_prog_pack *alloc_new_pack(void) +static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; - pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())), + pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)), GFP_KERNEL); if (!pack) return NULL; - pack->ptr = module_alloc(bpf_prog_pack_size); + pack->ptr = module_alloc(BPF_PROG_PACK_SIZE); if (!pack->ptr) { kfree(pack); return NULL; } - bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE); + bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); + bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); - set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); + set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); return pack; } -static void *bpf_prog_pack_alloc(u32 size) +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; @@ -903,13 +877,11 @@ static void *bpf_prog_pack_alloc(u32 size) void *ptr = NULL; mutex_lock(&pack_mutex); - if (bpf_prog_pack_size == -1) - bpf_prog_pack_size = select_bpf_prog_pack_size(); - - if (size > bpf_prog_pack_size) { + if (size > BPF_PROG_PACK_SIZE) { size = round_up(size, PAGE_SIZE); ptr = module_alloc(size); if (ptr) { + bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); set_memory_x((unsigned long)ptr, size / PAGE_SIZE); @@ -917,13 +889,13 @@ static void *bpf_prog_pack_alloc(u32 size) goto out; } list_for_each_entry(pack, &pack_list, list) { - pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, + pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, nbits, 0); - if (pos < bpf_prog_chunk_count()) + if (pos < BPF_PROG_CHUNK_COUNT) goto found_free_area; } - pack = alloc_new_pack(); + pack = alloc_new_pack(bpf_fill_ill_insns); if (!pack) goto out; @@ -938,23 +910,20 @@ out: return ptr; } -static void bpf_prog_pack_free(struct bpf_binary_header *hdr) +void bpf_prog_pack_free(struct bpf_binary_header *hdr) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; unsigned long pos; - void *pack_ptr; mutex_lock(&pack_mutex); - if (hdr->size > bpf_prog_pack_size) { + if (hdr->size > BPF_PROG_PACK_SIZE) { module_memfree(hdr); goto out; } - pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask); - list_for_each_entry(tmp, &pack_list, list) { - if (tmp->ptr == pack_ptr) { + if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) { pack = tmp; break; } @@ -964,11 +933,14 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr) goto out; nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); - pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT; + pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; + + WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size), + "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); - if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, - bpf_prog_chunk_count(), 0) == 0) { + if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, + BPF_PROG_CHUNK_COUNT, 0) == 0) { list_del(&pack->list); module_memfree(pack->ptr); kfree(pack); @@ -1004,7 +976,7 @@ pure_initcall(bpf_jit_charge_init); int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { + if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) { if (!bpf_capable()) { atomic_long_sub(size, &bpf_jit_current); return -EPERM; @@ -1102,7 +1074,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, if (bpf_jit_charge_modmem(size)) return NULL; - ro_header = bpf_prog_pack_alloc(size); + ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns); if (!ro_header) { bpf_jit_uncharge_modmem(size); return NULL; @@ -1145,7 +1117,6 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, bpf_prog_pack_free(ro_header); return PTR_ERR(ptr); } - prog->aux->use_bpf_prog_pack = true; return 0; } @@ -1169,17 +1140,23 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, bpf_jit_uncharge_modmem(size); } +struct bpf_binary_header * +bpf_jit_binary_pack_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr; + + addr = real_start & BPF_PROG_CHUNK_MASK; + return (void *)addr; +} + static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; - if (fp->aux->use_bpf_prog_pack) - addr = real_start & BPF_PROG_CHUNK_MASK; - else - addr = real_start & PAGE_MASK; - + addr = real_start & PAGE_MASK; return (void *)addr; } @@ -1192,11 +1169,7 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - if (fp->aux->use_bpf_prog_pack) - bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */); - else - bpf_jit_binary_free(hdr); - + bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } @@ -1434,6 +1407,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) insn = clone->insnsi; for (i = 0; i < insn_cnt; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* ld_imm64 with an address of bpf subprog is not + * a user controlled constant. Don't randomize it, + * since it will conflict with jit_subprogs() logic. + */ + insn++; + i++; + continue; + } + /* We temporarily need to hold the original ld64 insn * so that we can still access the first part in the * second blinding run. @@ -1938,6 +1921,11 @@ out: CONT; \ LDX_MEM_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; \ + LDX_PROBE_MEM_##SIZEOP: \ + bpf_probe_read_kernel(&DST, sizeof(SIZE), \ + (const void *)(long) (SRC + insn->off)); \ + DST = *((SIZE *)&DST); \ CONT; LDST(B, u8) @@ -1945,15 +1933,6 @@ out: LDST(W, u32) LDST(DW, u64) #undef LDST -#define LDX_PROBE(SIZEOP, SIZE) \ - LDX_PROBE_MEM_##SIZEOP: \ - bpf_probe_read_kernel(&DST, SIZE, (const void *)(long) (SRC + insn->off)); \ - CONT; - LDX_PROBE(B, 1) - LDX_PROBE(H, 2) - LDX_PROBE(W, 4) - LDX_PROBE(DW, 8) -#undef LDX_PROBE #define ATOMIC_ALU_OP(BOP, KOP) \ case BOP: \ @@ -2268,6 +2247,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs) kfree_rcu(progs, rcu); } +static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) +{ + struct bpf_prog_array *progs; + + progs = container_of(rcu, struct bpf_prog_array, rcu); + kfree_rcu(progs, rcu); +} + +void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) +{ + if (!progs || progs == &bpf_empty_prog_array.hdr) + return; + call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); +} + int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; @@ -2544,6 +2538,10 @@ static void bpf_prog_free_deferred(struct work_struct *work) #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); #endif +#ifdef CONFIG_CGROUP_BPF + if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) + bpf_cgroup_atype_put(aux->cgroup_atype); +#endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) @@ -2619,6 +2617,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_jiffies64_proto __weak; @@ -2629,6 +2628,7 @@ const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; @@ -2639,6 +2639,8 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; +const struct bpf_func_proto bpf_set_retval_proto __weak; +const struct bpf_func_proto bpf_get_retval_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { @@ -2702,6 +2704,12 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ +bool __weak bpf_jit_supports_subprog_tailcalls(void) +{ + return false; +} + bool __weak bpf_jit_supports_kfunc_call(void) { return false; @@ -2727,6 +2735,11 @@ void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) return ERR_PTR(-ENOTSUPP); } +int __weak bpf_arch_text_invalidate(void *dst, size_t len) +{ + return -ENOTSUPP; +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 650e5d21f90d..b5ba34ddd4b6 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -27,6 +27,7 @@ #include <linux/kthread.h> #include <linux/capability.h> #include <trace/events/xdp.h> +#include <linux/btf_ids.h> #include <linux/netdevice.h> /* netif_receive_skb_list */ #include <linux/etherdevice.h> /* eth_type_trans */ @@ -96,7 +97,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); - cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT); + cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE); if (!cmap) return ERR_PTR(-ENOMEM); @@ -117,7 +118,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_cmap: - kfree(cmap); + bpf_map_area_free(cmap); return ERR_PTR(err); } @@ -622,7 +623,7 @@ static void cpu_map_free(struct bpf_map *map) __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } bpf_map_area_free(cmap->cpu_map); - kfree(cmap); + bpf_map_area_free(cmap); } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or @@ -673,7 +674,7 @@ static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) __cpu_map_lookup_elem); } -static int cpu_map_btf_id; +BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map) const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = cpu_map_alloc, @@ -683,8 +684,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_cpu_map", - .map_btf_id = &cpu_map_btf_id, + .map_btf_id = &cpu_map_btf_ids[0], .map_redirect = cpu_map_redirect, }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 038f6d7a83e4..f9a87dcc5535 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,6 +48,7 @@ #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> +#include <linux/btf_ids.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -162,13 +163,13 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); - dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT); + dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); err = dev_map_init_map(dtab, attr); if (err) { - kfree(dtab); + bpf_map_area_free(dtab); return ERR_PTR(err); } @@ -239,7 +240,7 @@ static void dev_map_free(struct bpf_map *map) bpf_map_area_free(dtab->netdev_map); } - kfree(dtab); + bpf_map_area_free(dtab); } static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -476,7 +477,7 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = xdp_ok_fwd_dev(dev, xdpf->len); + err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf)); if (unlikely(err)) return err; @@ -535,7 +536,7 @@ static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) !obj->dev->netdev_ops->ndo_xdp_xmit) return false; - if (xdp_ok_fwd_dev(obj->dev, xdpf->len)) + if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf))) return false; return true; @@ -844,7 +845,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), - GFP_ATOMIC | __GFP_NOWARN, + GFP_NOWAIT | __GFP_NOWARN, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); @@ -1005,7 +1006,7 @@ static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) __dev_map_hash_lookup_elem); } -static int dev_map_btf_id; +BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, @@ -1015,12 +1016,10 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_dtab", - .map_btf_id = &dev_map_btf_id, + .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_map_redirect, }; -static int dev_map_hash_map_btf_id; const struct bpf_map_ops dev_map_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, @@ -1030,8 +1029,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_dtab", - .map_btf_id = &dev_map_hash_map_btf_id, + .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_hash_map_redirect, }; diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 2444bd15cc2d..fa64b80b8bca 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -85,12 +85,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, return false; } -int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs) { return -ENOTSUPP; } -static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) +static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf) { s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; int i; @@ -99,12 +99,12 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) if (d->progs[i].prog) *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; } - return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs); + return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs); } static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) { - void *old, *new; + void *old, *new, *tmp; u32 noff; int err; @@ -117,8 +117,14 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) } new = d->num_progs ? d->image + noff : NULL; + tmp = d->num_progs ? d->rw_image + noff : NULL; if (new) { - if (bpf_dispatcher_prepare(d, new)) + /* Prepare the dispatcher in d->rw_image. Then use + * bpf_arch_text_copy to update d->image, which is RO+X. + */ + if (bpf_dispatcher_prepare(d, new, tmp)) + return; + if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2))) return; } @@ -140,9 +146,18 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_jit_alloc_exec_page(); + d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero); if (!d->image) goto out; + d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!d->rw_image) { + u32 size = PAGE_SIZE; + + bpf_arch_text_copy(d->image, &size, sizeof(size)); + bpf_prog_pack_free((struct bpf_binary_header *)d->image); + d->image = NULL; + goto out; + } bpf_image_ksym_add(d->image, &d->ksym); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 65877967f414..ed3f8a53603b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -10,9 +10,11 @@ #include <linux/random.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> +#include <linux/btf_ids.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" +#include <linux/bpf_mem_alloc.h> #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ @@ -60,30 +62,22 @@ * * As regular device interrupt handlers and soft interrupts are forced into * thread context, the existing code which does - * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*(); + * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*(); * just works. * * In theory the BPF locks could be converted to regular spinlocks as well, * but the bucket locks and percpu_freelist locks can be taken from * arbitrary contexts (perf, kprobes, tracepoints) which are required to be - * atomic contexts even on RT. These mechanisms require preallocated maps, - * so there is no need to invoke memory allocations within the lock held - * sections. - * - * BPF maps which need dynamic allocation are only used from (forced) - * thread context on RT and can therefore use regular spinlocks which in - * turn allows to invoke memory allocations from the lock held section. - * - * On a non RT kernel this distinction is neither possible nor required. - * spinlock maps to raw_spinlock and the extra code is optimized out by the - * compiler. + * atomic contexts even on RT. Before the introduction of bpf_mem_alloc, + * it is only safe to use raw spinlock for preallocated hash map on a RT kernel, + * because there is no memory allocation within the lock held sections. However + * after hash map was fully converted to use bpf_mem_alloc, there will be + * non-synchronous memory allocation for non-preallocated hash map, so it is + * safe to always use raw spinlock for bucket lock. */ struct bucket { struct hlist_nulls_head head; - union { - raw_spinlock_t raw_lock; - spinlock_t lock; - }; + raw_spinlock_t raw_lock; }; #define HASHTAB_MAP_LOCK_COUNT 8 @@ -91,6 +85,8 @@ struct bucket { struct bpf_htab { struct bpf_map map; + struct bpf_mem_alloc ma; + struct bpf_mem_alloc pcpu_ma; struct bucket *buckets; void *elems; union { @@ -98,7 +94,12 @@ struct bpf_htab { struct bpf_lru lru; }; struct htab_elem *__percpu *extra_elems; - atomic_t count; /* number of elements in this hashtable */ + /* number of elements in non-preallocated hashtable are kept + * in either pcount or count + */ + struct percpu_counter pcount; + atomic_t count; + bool use_percpu_counter; u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; @@ -113,14 +114,14 @@ struct htab_elem { struct { void *padding; union { - struct bpf_htab *htab; struct pcpu_freelist_node fnode; struct htab_elem *batch_flink; }; }; }; union { - struct rcu_head rcu; + /* pointer to per-cpu pointer */ + void *ptr_to_pptr; struct bpf_lru_node lru_node; }; u32 hash; @@ -132,26 +133,15 @@ static inline bool htab_is_prealloc(const struct bpf_htab *htab) return !(htab->map.map_flags & BPF_F_NO_PREALLOC); } -static inline bool htab_use_raw_lock(const struct bpf_htab *htab) -{ - return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab)); -} - static void htab_init_buckets(struct bpf_htab *htab) { - unsigned i; + unsigned int i; for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - if (htab_use_raw_lock(htab)) { - raw_spin_lock_init(&htab->buckets[i].raw_lock); - lockdep_set_class(&htab->buckets[i].raw_lock, - &htab->lockdep_key); - } else { - spin_lock_init(&htab->buckets[i].lock); - lockdep_set_class(&htab->buckets[i].lock, + raw_spin_lock_init(&htab->buckets[i].raw_lock); + lockdep_set_class(&htab->buckets[i].raw_lock, &htab->lockdep_key); - } cond_resched(); } } @@ -164,17 +154,14 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, hash = hash & HASHTAB_MAP_LOCK_MASK; - migrate_disable(); + preempt_disable(); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { __this_cpu_dec(*(htab->map_locked[hash])); - migrate_enable(); + preempt_enable(); return -EBUSY; } - if (htab_use_raw_lock(htab)) - raw_spin_lock_irqsave(&b->raw_lock, flags); - else - spin_lock_irqsave(&b->lock, flags); + raw_spin_lock_irqsave(&b->raw_lock, flags); *pflags = flags; return 0; @@ -185,12 +172,9 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab, unsigned long flags) { hash = hash & HASHTAB_MAP_LOCK_MASK; - if (htab_use_raw_lock(htab)) - raw_spin_unlock_irqrestore(&b->raw_lock, flags); - else - spin_unlock_irqrestore(&b->lock, flags); + raw_spin_unlock_irqrestore(&b->raw_lock, flags); __this_cpu_dec(*(htab->map_locked[hash])); - migrate_enable(); + preempt_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -238,7 +222,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int i; - if (likely(!map_value_has_timer(&htab->map))) + if (!map_value_has_timer(&htab->map)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -254,6 +238,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) } } +static void htab_free_prealloced_kptrs(struct bpf_htab *htab) +{ + u32 num_entries = htab->map.max_entries; + int i; + + if (!map_value_has_kptrs(&htab->map)) + return; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + for (i = 0; i < num_entries; i++) { + struct htab_elem *elem; + + elem = get_htab_elem(htab, i); + bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8)); + cond_resched(); + } +} + static void htab_free_elems(struct bpf_htab *htab) { int i; @@ -291,12 +294,8 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, struct htab_elem *l; if (node) { - u32 key_size = htab->map.key_size; - l = container_of(node, struct htab_elem, lru_node); - memcpy(l->key, key, key_size); - check_and_init_map_value(&htab->map, - l->key + round_up(key_size, 8)); + memcpy(l->key, key, htab->map.key_size); return l; } @@ -412,8 +411,6 @@ static int htab_map_alloc_check(union bpf_attr *attr) bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); int numa_node = bpf_map_attr_numa_node(attr); - BUILD_BUG_ON(offsetof(struct htab_elem, htab) != - offsetof(struct htab_elem, hash_node.pprev)); BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); @@ -475,7 +472,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) struct bpf_htab *htab; int err, i; - htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); + htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); @@ -534,6 +531,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab_init_buckets(htab); +/* compute_batch_value() computes batch value as num_online_cpus() * 2 + * and __percpu_counter_compare() needs + * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() + * for percpu_counter to be faster than atomic_t. In practice the average bpf + * hash map size is 10k, which means that a system with 64 cpus will fill + * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore + * define our own batch count as 32 then 10k hash map can be filled up to 80%: + * 10k - 8k > 32 _batch_ * 64 _cpus_ + * and __percpu_counter_compare() will still be fast. At that point hash map + * collisions will dominate its performance anyway. Assume that hash map filled + * to 50+% isn't going to be O(1) and use the following formula to choose + * between percpu_counter and atomic_t. + */ +#define PERCPU_COUNTER_BATCH 32 + if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH) + htab->use_percpu_counter = true; + + if (htab->use_percpu_counter) { + err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); + if (err) + goto free_map_locked; + } + if (prealloc) { err = prealloc_init(htab); if (err) @@ -547,6 +567,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_prealloc; } + } else { + err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); + if (err) + goto free_map_locked; + if (percpu) { + err = bpf_mem_alloc_init(&htab->pcpu_ma, + round_up(htab->map.value_size, 8), true); + if (err) + goto free_map_locked; + } } return &htab->map; @@ -554,12 +584,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) free_prealloc: prealloc_destroy(htab); free_map_locked: + if (htab->use_percpu_counter) + percpu_counter_destroy(&htab->pcount); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->pcpu_ma); + bpf_mem_alloc_destroy(&htab->ma); free_htab: lockdep_unregister_key(&htab->lockdep_key); - kfree(htab); + bpf_map_area_free(htab); return ERR_PTR(err); } @@ -725,12 +759,15 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem) +static void check_and_free_fields(struct bpf_htab *htab, + struct htab_elem *elem) { - if (unlikely(map_value_has_timer(&htab->map))) - bpf_timer_cancel_and_free(elem->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + void *map_value = elem->key + round_up(htab->map.key_size, 8); + + if (map_value_has_timer(&htab->map)) + bpf_timer_cancel_and_free(map_value + htab->map.timer_off); + if (map_value_has_kptrs(&htab->map)) + bpf_map_free_kptrs(&htab->map, map_value); } /* It is called from the bpf_lru_list when the LRU needs to delete @@ -738,7 +775,7 @@ static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem) */ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) { - struct bpf_htab *htab = (struct bpf_htab *)arg; + struct bpf_htab *htab = arg; struct htab_elem *l = NULL, *tgt_l; struct hlist_nulls_head *head; struct hlist_nulls_node *n; @@ -757,7 +794,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { hlist_nulls_del_rcu(&l->hash_node); - check_and_free_timer(htab, l); + check_and_free_fields(htab, l); break; } @@ -828,17 +865,9 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) - free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); - check_and_free_timer(htab, l); - kfree(l); -} - -static void htab_elem_free_rcu(struct rcu_head *head) -{ - struct htab_elem *l = container_of(head, struct htab_elem, rcu); - struct bpf_htab *htab = l->htab; - - htab_elem_free(htab, l); + bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); + check_and_free_fields(htab, l); + bpf_mem_cache_free(&htab->ma, l); } static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) @@ -852,17 +881,41 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) } } +static bool is_map_full(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + return __percpu_counter_compare(&htab->pcount, htab->map.max_entries, + PERCPU_COUNTER_BATCH) >= 0; + return atomic_read(&htab->count) >= htab->map.max_entries; +} + +static void inc_elem_count(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); + else + atomic_inc(&htab->count); +} + +static void dec_elem_count(struct bpf_htab *htab) +{ + if (htab->use_percpu_counter) + percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); + else + atomic_dec(&htab->count); +} + + static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { - check_and_free_timer(htab, l); + check_and_free_fields(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { - atomic_dec(&htab->count); - l->htab = htab; - call_rcu(&l->rcu, htab_elem_free_rcu); + dec_elem_count(htab); + htab_elem_free(htab, l); } } @@ -887,13 +940,12 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, void *value, bool onallcpus) { - /* When using prealloc and not setting the initial value on all cpus, - * zero-fill element values for other cpus (just as what happens when - * not using prealloc). Otherwise, bpf program has no way to ensure + /* When not setting the initial value on all cpus, zero-fill element + * values for other cpus. Otherwise, bpf program has no way to ensure * known initial values for cpus other than current one * (onallcpus=false always when coming from bpf prog). */ - if (htab_is_prealloc(htab) && !onallcpus) { + if (!onallcpus) { u32 size = round_up(htab->map.value_size, 8); int current_cpu = raw_smp_processor_id(); int cpu; @@ -944,19 +996,16 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = container_of(l, struct htab_elem, fnode); } } else { - if (atomic_inc_return(&htab->count) > htab->map.max_entries) - if (!old_elem) { + if (is_map_full(htab)) + if (!old_elem) /* when map is full and update() is replacing * old element, it's ok to allocate, since * old element will be freed immediately. * Otherwise return an error */ - l_new = ERR_PTR(-E2BIG); - goto dec_count; - } - l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, - GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); + return ERR_PTR(-E2BIG); + inc_elem_count(htab); + l_new = bpf_mem_cache_alloc(&htab->ma); if (!l_new) { l_new = ERR_PTR(-ENOMEM); goto dec_count; @@ -967,18 +1016,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, memcpy(l_new->key, key, key_size); if (percpu) { - size = round_up(size, 8); if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ - pptr = bpf_map_alloc_percpu(&htab->map, size, 8, - GFP_ATOMIC | __GFP_NOWARN); + pptr = bpf_mem_cache_alloc(&htab->pcpu_ma); if (!pptr) { - kfree(l_new); + bpf_mem_cache_free(&htab->ma, l_new); l_new = ERR_PTR(-ENOMEM); goto dec_count; } + l_new->ptr_to_pptr = pptr; + pptr = *(void **)pptr; } pcpu_init_value(htab, pptr, value, onallcpus); @@ -997,7 +1046,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new->hash = hash; return l_new; dec_count: - atomic_dec(&htab->count); + dec_elem_count(htab); return l_new; } @@ -1104,7 +1153,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (!htab_is_prealloc(htab)) free_htab_elem(htab, l_old); else - check_and_free_timer(htab, l_old); + check_and_free_fields(htab, l_old); } ret = 0; err: @@ -1114,7 +1163,7 @@ err: static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { - check_and_free_timer(htab, elem); + check_and_free_fields(htab, elem); bpf_lru_push_free(&htab->lru, &elem->lru_node); } @@ -1397,6 +1446,10 @@ static void delete_all_elements(struct bpf_htab *htab) { int i; + /* It's called from a worker thread, so disable migration here, + * since bpf_mem_cache_free() relies on that. + */ + migrate_disable(); for (i = 0; i < htab->n_buckets; i++) { struct hlist_nulls_head *head = select_bucket(htab, i); struct hlist_nulls_node *n; @@ -1407,6 +1460,7 @@ static void delete_all_elements(struct bpf_htab *htab) htab_elem_free(htab, l); } } + migrate_enable(); } static void htab_free_malloced_timers(struct bpf_htab *htab) @@ -1419,8 +1473,14 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) struct hlist_nulls_node *n; struct htab_elem *l; - hlist_nulls_for_each_entry(l, n, head, hash_node) - check_and_free_timer(htab, l); + hlist_nulls_for_each_entry(l, n, head, hash_node) { + /* We don't reset or free kptr on uref dropping to zero, + * hence just free timer. + */ + bpf_timer_cancel_and_free(l->key + + round_up(htab->map.key_size, 8) + + htab->map.timer_off); + } cond_resched_rcu(); } rcu_read_unlock(); @@ -1430,7 +1490,8 @@ static void htab_map_free_timers(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - if (likely(!map_value_has_timer(&htab->map))) + /* We don't reset or free kptr on uref dropping to zero. */ + if (!map_value_has_timer(&htab->map)) return; if (!htab_is_prealloc(htab)) htab_free_malloced_timers(htab); @@ -1449,21 +1510,28 @@ static void htab_map_free(struct bpf_map *map) * There is no need to synchronize_rcu() here to protect map elements. */ - /* some of free_htab_elem() callbacks for elements of this map may - * not have executed. Wait for them. + /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it + * underneath and is reponsible for waiting for callbacks to finish + * during bpf_mem_alloc_destroy(). */ - rcu_barrier(); - if (!htab_is_prealloc(htab)) + if (!htab_is_prealloc(htab)) { delete_all_elements(htab); - else + } else { + htab_free_prealloced_kptrs(htab); prealloc_destroy(htab); + } + bpf_map_free_kptr_off_tab(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); + bpf_mem_alloc_destroy(&htab->pcpu_ma); + bpf_mem_alloc_destroy(&htab->ma); + if (htab->use_percpu_counter) + percpu_counter_destroy(&htab->pcount); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); lockdep_unregister_key(&htab->lockdep_key); - kfree(htab); + bpf_map_area_free(htab); } static void htab_map_seq_show_elem(struct bpf_map *map, void *key, @@ -1594,7 +1662,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); - u32 batch, max_count, size, bucket_size; + u32 batch, max_count, size, bucket_size, map_id; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; @@ -1662,8 +1730,11 @@ again_nocopy: /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) { ret = htab_lock_bucket(htab, b, batch, &flags); - if (ret) - goto next_batch; + if (ret) { + rcu_read_unlock(); + bpf_enable_instrumentation(); + goto after_loop; + } } bucket_cnt = 0; @@ -1719,6 +1790,14 @@ again_nocopy: } } else { value = l->key + roundup_key_size; + if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + struct bpf_map **inner_map = value; + + /* Actual value is the id of the inner map */ + map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); + value = &map_id; + } + if (elem_map_flags & BPF_F_LOCK) copy_map_value_locked(map, dst_val, value, true); @@ -2023,6 +2102,7 @@ static int bpf_iter_init_hash_map(void *priv_data, seq_info->percpu_value_buf = value_buf; } + bpf_map_inc_with_uref(map); seq_info->map = map; seq_info->htab = container_of(map, struct bpf_htab, map); return 0; @@ -2032,6 +2112,7 @@ static void bpf_iter_fini_hash_map(void *priv_data) { struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } @@ -2105,7 +2186,7 @@ out: return num_elems; } -static int htab_map_btf_id; +BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2122,12 +2203,10 @@ const struct bpf_map_ops htab_map_ops = { .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int htab_lru_map_btf_id; const struct bpf_map_ops htab_lru_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2145,8 +2224,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_lru_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -2161,6 +2239,20 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) + return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); + else + return NULL; +} + static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); @@ -2173,6 +2265,22 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) +{ + struct htab_elem *l; + + if (cpu >= nr_cpu_ids) + return NULL; + + l = __htab_map_lookup_elem(map, key); + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { struct htab_elem *l; @@ -2252,7 +2360,6 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int htab_percpu_map_btf_id; const struct bpf_map_ops htab_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2263,16 +2370,15 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_percpu), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_percpu_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; -static int htab_lru_percpu_map_btf_id; const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, @@ -2283,12 +2389,12 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru_percpu), - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_lru_percpu_map_btf_id, + .map_btf_id = &htab_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; @@ -2412,7 +2518,6 @@ static void htab_of_map_free(struct bpf_map *map) fd_htab_map_free(map); } -static int htab_of_maps_map_btf_id; const struct bpf_map_ops htab_of_maps_map_ops = { .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, @@ -2425,6 +2530,6 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_htab", - .map_btf_id = &htab_of_maps_map_btf_id, + BATCH_OPS(htab), + .map_btf_id = &htab_map_btf_ids[0], }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 315053ef6a75..a6b04faed282 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -15,6 +15,7 @@ #include <linux/ctype.h> #include <linux/jiffies.h> #include <linux/pid_namespace.h> +#include <linux/poison.h> #include <linux/proc_ns.h> #include <linux/security.h> #include <linux/btf_ids.h> @@ -103,7 +104,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) @@ -116,7 +117,23 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, +}; + +BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) +{ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); +} + +const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = { + .func = bpf_map_lookup_percpu_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_get_prandom_u32_proto = { @@ -182,6 +199,18 @@ const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_tai_ns) +{ + /* NMI safe access to clock tai */ + return ktime_get_tai_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = { + .func = bpf_ktime_get_tai_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -399,40 +428,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; - -#ifdef CONFIG_CGROUP_BPF - -BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) -{ - /* flags argument is not used now, - * but provides an ability to extend the API. - * verifier checks that its value is correct. - */ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage; - struct bpf_cg_run_ctx *ctx; - void *ptr; - - /* get current cgroup storage from BPF run context */ - ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); - storage = ctx->prog_item->cgroup_storage[stype]; - - if (stype == BPF_CGROUP_STORAGE_SHARED) - ptr = &READ_ONCE(storage->buf)->data[0]; - else - ptr = this_cpu_ptr(storage->percpu_buf); - - return (unsigned long)ptr; -} - -const struct bpf_func_proto bpf_get_local_storage_proto = { - .func = bpf_get_local_storage, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, -}; -#endif +#endif /* CONFIG_CGROUPS */ #define BPF_STRTOX_BASE_MASK 0x1F @@ -561,14 +557,13 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; -#endif BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) { return strncmp(s1, s2, s1_sz); } -const struct bpf_func_proto bpf_strncmp_proto = { +static const struct bpf_func_proto bpf_strncmp_proto = { .func = bpf_strncmp, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1374,6 +1369,196 @@ out: kfree(t); } +BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) +{ + unsigned long *kptr = map_value; + + return xchg(kptr, (unsigned long)ptr); +} + +/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() + * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to + * denote type that verifier will determine. + */ +static const struct bpf_func_proto bpf_kptr_xchg_proto = { + .func = bpf_kptr_xchg, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = BPF_PTR_POISON, + .arg1_type = ARG_PTR_TO_KPTR, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, + .arg2_btf_id = BPF_PTR_POISON, +}; + +/* Since the upper 8 bits of dynptr->size is reserved, the + * maximum supported size is 2^24 - 1. + */ +#define DYNPTR_MAX_SIZE ((1UL << 24) - 1) +#define DYNPTR_TYPE_SHIFT 28 +#define DYNPTR_SIZE_MASK 0xFFFFFF +#define DYNPTR_RDONLY_BIT BIT(31) + +static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +{ + return ptr->size & DYNPTR_RDONLY_BIT; +} + +static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) +{ + ptr->size |= type << DYNPTR_TYPE_SHIFT; +} + +u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +{ + return ptr->size & DYNPTR_SIZE_MASK; +} + +int bpf_dynptr_check_size(u32 size) +{ + return size > DYNPTR_MAX_SIZE ? -E2BIG : 0; +} + +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size) +{ + ptr->data = data; + ptr->offset = offset; + ptr->size = size; + bpf_dynptr_set_type(ptr, type); +} + +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) +{ + memset(ptr, 0, sizeof(*ptr)); +} + +static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +{ + u32 size = bpf_dynptr_get_size(ptr); + + if (len > size || offset > size - len) + return -E2BIG; + + return 0; +} + +BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) +{ + int err; + + BTF_TYPE_EMIT(struct bpf_dynptr); + + err = bpf_dynptr_check_size(size); + if (err) + goto error; + + /* flags is currently unsupported */ + if (flags) { + err = -EINVAL; + goto error; + } + + bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size); + + return 0; + +error: + bpf_dynptr_set_null(ptr); + return err; +} + +static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { + .func = bpf_dynptr_from_mem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, +}; + +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, + u32, offset, u64, flags) +{ + int err; + + if (!src->data || flags) + return -EINVAL; + + err = bpf_dynptr_check_off_len(src, offset, len); + if (err) + return err; + + memcpy(dst, src->data + src->offset + offset, len); + + return 0; +} + +static const struct bpf_func_proto bpf_dynptr_read_proto = { + .func = bpf_dynptr_read, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_DYNPTR, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, + u32, len, u64, flags) +{ + int err; + + if (!dst->data || flags || bpf_dynptr_is_rdonly(dst)) + return -EINVAL; + + err = bpf_dynptr_check_off_len(dst, offset, len); + if (err) + return err; + + memcpy(dst->data + dst->offset + offset, src, len); + + return 0; +} + +static const struct bpf_func_proto bpf_dynptr_write_proto = { + .func = bpf_dynptr_write, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_DYNPTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +{ + int err; + + if (!ptr->data) + return 0; + + err = bpf_dynptr_check_off_len(ptr, offset, len); + if (err) + return 0; + + if (bpf_dynptr_is_rdonly(ptr)) + return 0; + + return (unsigned long)(ptr->data + ptr->offset + offset); +} + +static const struct bpf_func_proto bpf_dynptr_data_proto = { + .func = bpf_dynptr_data, + .gpl_only = false, + .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_DYNPTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; @@ -1398,6 +1583,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: @@ -1410,6 +1597,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_tai_ns: + return &bpf_ktime_get_tai_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: @@ -1420,12 +1609,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; - case BPF_FUNC_for_each_map_elem: - return &bpf_for_each_map_elem_proto; - case BPF_FUNC_loop: - return &bpf_loop_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; + case BPF_FUNC_strtol: + return &bpf_strtol_proto; + case BPF_FUNC_strtoul: + return &bpf_strtoul_proto; default: break; } @@ -1452,6 +1641,28 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_timer_start_proto; case BPF_FUNC_timer_cancel: return &bpf_timer_cancel_proto; + case BPF_FUNC_kptr_xchg: + return &bpf_kptr_xchg_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; + case BPF_FUNC_loop: + return &bpf_loop_proto; + case BPF_FUNC_user_ringbuf_drain: + return &bpf_user_ringbuf_drain_proto; + case BPF_FUNC_ringbuf_reserve_dynptr: + return &bpf_ringbuf_reserve_dynptr_proto; + case BPF_FUNC_ringbuf_submit_dynptr: + return &bpf_ringbuf_submit_dynptr_proto; + case BPF_FUNC_ringbuf_discard_dynptr: + return &bpf_ringbuf_discard_dynptr_proto; + case BPF_FUNC_dynptr_from_mem: + return &bpf_dynptr_from_mem_proto; + case BPF_FUNC_dynptr_read: + return &bpf_dynptr_read_proto; + case BPF_FUNC_dynptr_write: + return &bpf_dynptr_write_proto; + case BPF_FUNC_dynptr_data: + return &bpf_dynptr_data_proto; default: break; } @@ -1488,3 +1699,21 @@ bpf_base_func_proto(enum bpf_func_id func_id) return NULL; } } + +BTF_SET8_START(tracing_btf_ids) +#ifdef CONFIG_KEXEC_CORE +BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) +#endif +BTF_SET8_END(tracing_btf_ids) + +static const struct btf_kfunc_id_set tracing_kfunc_set = { + .owner = THIS_MODULE, + .set = &tracing_btf_ids, +}; + +static int __init kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &tracing_kfunc_set); +} + +late_initcall(kfunc_init); diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c new file mode 100644 index 000000000000..fec8005a121c --- /dev/null +++ b/kernel/bpf/link_iter.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Red Hat, Inc. */ +#include <linux/bpf.h> +#include <linux/fs.h> +#include <linux/filter.h> +#include <linux/kernel.h> +#include <linux/btf_ids.h> + +struct bpf_iter_seq_link_info { + u32 link_id; +}; + +static void *bpf_link_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_link_info *info = seq->private; + struct bpf_link *link; + + link = bpf_link_get_curr_or_next(&info->link_id); + if (!link) + return NULL; + + if (*pos == 0) + ++*pos; + return link; +} + +static void *bpf_link_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_link_info *info = seq->private; + + ++*pos; + ++info->link_id; + bpf_link_put((struct bpf_link *)v); + return bpf_link_get_curr_or_next(&info->link_id); +} + +struct bpf_iter__bpf_link { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_link *, link); +}; + +DEFINE_BPF_ITER_FUNC(bpf_link, struct bpf_iter_meta *meta, struct bpf_link *link) + +static int __bpf_link_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_link ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.link = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_link_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_link_seq_show(seq, v, false); +} + +static void bpf_link_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_link_seq_show(seq, v, true); + else + bpf_link_put((struct bpf_link *)v); +} + +static const struct seq_operations bpf_link_seq_ops = { + .start = bpf_link_seq_start, + .next = bpf_link_seq_next, + .stop = bpf_link_seq_stop, + .show = bpf_link_seq_show, +}; + +BTF_ID_LIST(btf_bpf_link_id) +BTF_ID(struct, bpf_link) + +static const struct bpf_iter_seq_info bpf_link_seq_info = { + .seq_ops = &bpf_link_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_link_info), +}; + +static struct bpf_iter_reg bpf_link_reg_info = { + .target = "bpf_link", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_link, link), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &bpf_link_seq_info, +}; + +static int __init bpf_link_iter_init(void) +{ + bpf_link_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_link_id; + return bpf_iter_reg_target(&bpf_link_reg_info); +} + +late_initcall(bpf_link_iter_init); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 497916060ac7..098cf336fae6 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -9,6 +9,7 @@ #include <linux/rbtree.h> #include <linux/slab.h> #include <uapi/linux/btf.h> +#include <linux/btf_ids.h> #ifdef CONFIG_CGROUP_BPF @@ -164,7 +165,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, } new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), - __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, + __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN, map->numa_node); if (!new) return -ENOMEM; @@ -312,8 +313,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); - map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), - __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node); + map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node); if (!map) return ERR_PTR(-ENOMEM); @@ -345,7 +345,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map) WARN_ON(!RB_EMPTY_ROOT(&map->root)); WARN_ON(!list_empty(&map->list)); - kfree(map); + bpf_map_area_free(map); } static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) @@ -446,7 +446,8 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int cgroup_storage_map_btf_id; +BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, + bpf_cgroup_storage_map) const struct bpf_map_ops cgroup_storage_map_ops = { .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, @@ -456,8 +457,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_delete_elem = cgroup_storage_delete_elem, .map_check_btf = cgroup_storage_check_btf, .map_seq_show_elem = cgroup_storage_seq_show_elem, - .map_btf_name = "bpf_cgroup_storage_map", - .map_btf_id = &cgroup_storage_map_btf_id, + .map_btf_id = &cgroup_storage_map_btf_ids[0], }; int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 5763cc7ac4f1..d833496e9e42 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -14,6 +14,7 @@ #include <linux/vmalloc.h> #include <net/ipv6.h> #include <uapi/linux/btf.h> +#include <linux/btf_ids.h> /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -284,7 +285,7 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, if (value) size += trie->map.value_size; - node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN, + node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN, trie->map.numa_node); if (!node) return NULL; @@ -557,7 +558,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) attr->value_size > LPM_VAL_SIZE_MAX) return ERR_PTR(-EINVAL); - trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE); if (!trie) return ERR_PTR(-ENOMEM); @@ -608,7 +609,7 @@ static void trie_free(struct bpf_map *map) } out: - kfree(trie); + bpf_map_area_free(trie); } static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) @@ -719,7 +720,7 @@ static int trie_check_btf(const struct bpf_map *map, -EINVAL : 0; } -static int trie_map_btf_id; +BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie) const struct bpf_map_ops trie_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = trie_alloc, @@ -732,6 +733,5 @@ const struct bpf_map_ops trie_map_ops = { .map_update_batch = generic_map_update_batch, .map_delete_batch = generic_map_delete_batch, .map_check_btf = trie_check_btf, - .map_btf_name = "lpm_trie", - .map_btf_id = &trie_map_btf_id, + .map_btf_id = &trie_map_btf_ids[0], }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 5cd8f5277279..135205d0d560 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -52,6 +52,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->spin_lock_off = inner_map->spin_lock_off; inner_map_meta->timer_off = inner_map->timer_off; + inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map); if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; @@ -71,6 +72,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) void bpf_map_meta_free(struct bpf_map *map_meta) { + bpf_map_free_kptr_off_tab(map_meta); btf_put(map_meta->btf); kfree(map_meta); } @@ -83,7 +85,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && meta0->timer_off == meta1->timer_off && - meta0->map_flags == meta1->map_flags; + meta0->map_flags == meta1->map_flags && + bpf_map_equal_kptr_off_tab(meta0, meta1); } void *bpf_map_fd_get_ptr(struct bpf_map *map, diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c new file mode 100644 index 000000000000..5f83be1d2018 --- /dev/null +++ b/kernel/bpf/memalloc.c @@ -0,0 +1,635 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include <linux/mm.h> +#include <linux/llist.h> +#include <linux/bpf.h> +#include <linux/irq_work.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/memcontrol.h> +#include <asm/local.h> + +/* Any context (including NMI) BPF specific memory allocator. + * + * Tracing BPF programs can attach to kprobe and fentry. Hence they + * run in unknown context where calling plain kmalloc() might not be safe. + * + * Front-end kmalloc() with per-cpu per-bucket cache of free elements. + * Refill this cache asynchronously from irq_work. + * + * CPU_0 buckets + * 16 32 64 96 128 196 256 512 1024 2048 4096 + * ... + * CPU_N buckets + * 16 32 64 96 128 196 256 512 1024 2048 4096 + * + * The buckets are prefilled at the start. + * BPF programs always run with migration disabled. + * It's safe to allocate from cache of the current cpu with irqs disabled. + * Free-ing is always done into bucket of the current cpu as well. + * irq_work trims extra free elements from buckets with kfree + * and refills them with kmalloc, so global kmalloc logic takes care + * of freeing objects allocated by one cpu and freed on another. + * + * Every allocated objected is padded with extra 8 bytes that contains + * struct llist_node. + */ +#define LLIST_NODE_SZ sizeof(struct llist_node) + +/* similar to kmalloc, but sizeof == 8 bucket is gone */ +static u8 size_index[24] __ro_after_init = { + 3, /* 8 */ + 3, /* 16 */ + 4, /* 24 */ + 4, /* 32 */ + 5, /* 40 */ + 5, /* 48 */ + 5, /* 56 */ + 5, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 6, /* 104 */ + 6, /* 112 */ + 6, /* 120 */ + 6, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; + +static int bpf_mem_cache_idx(size_t size) +{ + if (!size || size > 4096) + return -1; + + if (size <= 192) + return size_index[(size - 1) / 8] - 1; + + return fls(size - 1) - 1; +} + +#define NUM_CACHES 11 + +struct bpf_mem_cache { + /* per-cpu list of free objects of size 'unit_size'. + * All accesses are done with interrupts disabled and 'active' counter + * protection with __llist_add() and __llist_del_first(). + */ + struct llist_head free_llist; + local_t active; + + /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill + * are sequenced by per-cpu 'active' counter. But unit_free() cannot + * fail. When 'active' is busy the unit_free() will add an object to + * free_llist_extra. + */ + struct llist_head free_llist_extra; + + struct irq_work refill_work; + struct obj_cgroup *objcg; + int unit_size; + /* count of objects in free_llist */ + int free_cnt; + int low_watermark, high_watermark, batch; + int percpu_size; + + struct rcu_head rcu; + struct llist_head free_by_rcu; + struct llist_head waiting_for_gp; + atomic_t call_rcu_in_progress; +}; + +struct bpf_mem_caches { + struct bpf_mem_cache cache[NUM_CACHES]; +}; + +static struct llist_node notrace *__llist_del_first(struct llist_head *head) +{ + struct llist_node *entry, *next; + + entry = head->first; + if (!entry) + return NULL; + next = entry->next; + head->first = next; + return entry; +} + +static void *__alloc(struct bpf_mem_cache *c, int node) +{ + /* Allocate, but don't deplete atomic reserves that typical + * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc + * will allocate from the current numa node which is what we + * want here. + */ + gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT; + + if (c->percpu_size) { + void **obj = kmalloc_node(c->percpu_size, flags, node); + void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); + + if (!obj || !pptr) { + free_percpu(pptr); + kfree(obj); + return NULL; + } + obj[1] = pptr; + return obj; + } + + return kmalloc_node(c->unit_size, flags, node); +} + +static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) +{ +#ifdef CONFIG_MEMCG_KMEM + if (c->objcg) + return get_mem_cgroup_from_objcg(c->objcg); +#endif + +#ifdef CONFIG_MEMCG + return root_mem_cgroup; +#else + return NULL; +#endif +} + +/* Mostly runs from irq_work except __init phase. */ +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) +{ + struct mem_cgroup *memcg = NULL, *old_memcg; + unsigned long flags; + void *obj; + int i; + + memcg = get_memcg(c); + old_memcg = set_active_memcg(memcg); + for (i = 0; i < cnt; i++) { + obj = __alloc(c, node); + if (!obj) + break; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + /* In RT irq_work runs in per-cpu kthread, so disable + * interrupts to avoid preemption and interrupts and + * reduce the chance of bpf prog executing on this cpu + * when active counter is busy. + */ + local_irq_save(flags); + /* alloc_bulk runs from irq_work which will not preempt a bpf + * program that does unit_alloc/unit_free since IRQs are + * disabled there. There is no race to increment 'active' + * counter. It protects free_llist from corruption in case NMI + * bpf prog preempted this loop. + */ + WARN_ON_ONCE(local_inc_return(&c->active) != 1); + __llist_add(obj, &c->free_llist); + c->free_cnt++; + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(flags); + } + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); +} + +static void free_one(struct bpf_mem_cache *c, void *obj) +{ + if (c->percpu_size) { + free_percpu(((void **)obj)[1]); + kfree(obj); + return; + } + + kfree(obj); +} + +static void __free_rcu(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + struct llist_node *llnode = llist_del_all(&c->waiting_for_gp); + struct llist_node *pos, *t; + + llist_for_each_safe(pos, t, llnode) + free_one(c, pos); + atomic_set(&c->call_rcu_in_progress, 0); +} + +static void __free_rcu_tasks_trace(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + + call_rcu(&c->rcu, __free_rcu); +} + +static void enque_to_free(struct bpf_mem_cache *c, void *obj) +{ + struct llist_node *llnode = obj; + + /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work. + * Nothing races to add to free_by_rcu list. + */ + __llist_add(llnode, &c->free_by_rcu); +} + +static void do_call_rcu(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + + if (atomic_xchg(&c->call_rcu_in_progress, 1)) + return; + + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) + /* There is no concurrent __llist_add(waiting_for_gp) access. + * It doesn't race with llist_del_all either. + * But there could be two concurrent llist_del_all(waiting_for_gp): + * from __free_rcu() and from drain_mem_cache(). + */ + __llist_add(llnode, &c->waiting_for_gp); + /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. + * Then use call_rcu() to wait for normal progs to finish + * and finally do free_one() on each element. + */ + call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace); +} + +static void free_bulk(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + unsigned long flags; + int cnt; + + do { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(flags); + WARN_ON_ONCE(local_inc_return(&c->active) != 1); + llnode = __llist_del_first(&c->free_llist); + if (llnode) + cnt = --c->free_cnt; + else + cnt = 0; + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(flags); + if (llnode) + enque_to_free(c, llnode); + } while (cnt > (c->high_watermark + c->low_watermark) / 2); + + /* and drain free_llist_extra */ + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) + enque_to_free(c, llnode); + do_call_rcu(c); +} + +static void bpf_mem_refill(struct irq_work *work) +{ + struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work); + int cnt; + + /* Racy access to free_cnt. It doesn't need to be 100% accurate */ + cnt = c->free_cnt; + if (cnt < c->low_watermark) + /* irq_work runs on this cpu and kmalloc will allocate + * from the current numa node which is what we want here. + */ + alloc_bulk(c, c->batch, NUMA_NO_NODE); + else if (cnt > c->high_watermark) + free_bulk(c); +} + +static void notrace irq_work_raise(struct bpf_mem_cache *c) +{ + irq_work_queue(&c->refill_work); +} + +/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket + * the freelist cache will be elem_size * 64 (or less) on each cpu. + * + * For bpf programs that don't have statically known allocation sizes and + * assuming (low_mark + high_mark) / 2 as an average number of elements per + * bucket and all buckets are used the total amount of memory in freelists + * on each cpu will be: + * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096 + * == ~ 116 Kbyte using below heuristic. + * Initialized, but unused bpf allocator (not bpf map specific one) will + * consume ~ 11 Kbyte per cpu. + * Typical case will be between 11K and 116K closer to 11K. + * bpf progs can and should share bpf_mem_cache when possible. + */ + +static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +{ + init_irq_work(&c->refill_work, bpf_mem_refill); + if (c->unit_size <= 256) { + c->low_watermark = 32; + c->high_watermark = 96; + } else { + /* When page_size == 4k, order-0 cache will have low_mark == 2 + * and high_mark == 6 with batch alloc of 3 individual pages at + * a time. + * 8k allocs and above low == 1, high == 3, batch == 1. + */ + c->low_watermark = max(32 * 256 / c->unit_size, 1); + c->high_watermark = max(96 * 256 / c->unit_size, 3); + } + c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); + + /* To avoid consuming memory assume that 1st run of bpf + * prog won't be doing more than 4 map_update_elem from + * irq disabled region + */ + alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu)); +} + +/* When size != 0 bpf_mem_cache for each cpu. + * This is typical bpf hash map use case when all elements have equal size. + * + * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on + * kmalloc/kfree. Max allocation size is 4096 in this case. + * This is bpf_dynptr and bpf_kptr use case. + */ +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) +{ + static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + struct bpf_mem_caches *cc, __percpu *pcc; + struct bpf_mem_cache *c, __percpu *pc; + struct obj_cgroup *objcg = NULL; + int cpu, i, unit_size, percpu_size = 0; + + if (size) { + pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); + if (!pc) + return -ENOMEM; + + if (percpu) + /* room for llist_node and per-cpu pointer */ + percpu_size = LLIST_NODE_SZ + sizeof(void *); + else + size += LLIST_NODE_SZ; /* room for llist_node */ + unit_size = size; + +#ifdef CONFIG_MEMCG_KMEM + objcg = get_obj_cgroup_from_current(); +#endif + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(pc, cpu); + c->unit_size = unit_size; + c->objcg = objcg; + c->percpu_size = percpu_size; + prefill_mem_cache(c, cpu); + } + ma->cache = pc; + return 0; + } + + /* size == 0 && percpu is an invalid combination */ + if (WARN_ON_ONCE(percpu)) + return -EINVAL; + + pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); + if (!pcc) + return -ENOMEM; +#ifdef CONFIG_MEMCG_KMEM + objcg = get_obj_cgroup_from_current(); +#endif + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(pcc, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + c->unit_size = sizes[i]; + c->objcg = objcg; + prefill_mem_cache(c, cpu); + } + } + ma->caches = pcc; + return 0; +} + +static void drain_mem_cache(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + + /* No progs are using this bpf_mem_cache, but htab_map_free() called + * bpf_mem_cache_free() for all remaining elements and they can be in + * free_by_rcu or in waiting_for_gp lists, so drain those lists now. + */ + llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist)) + free_one(c, llnode); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) + free_one(c, llnode); +} + +static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) +{ + free_percpu(ma->cache); + free_percpu(ma->caches); + ma->cache = NULL; + ma->caches = NULL; +} + +static void free_mem_alloc(struct bpf_mem_alloc *ma) +{ + /* waiting_for_gp lists was drained, but __free_rcu might + * still execute. Wait for it now before we freeing percpu caches. + */ + rcu_barrier_tasks_trace(); + rcu_barrier(); + free_mem_alloc_no_barrier(ma); +} + +static void free_mem_alloc_deferred(struct work_struct *work) +{ + struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work); + + free_mem_alloc(ma); + kfree(ma); +} + +static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) +{ + struct bpf_mem_alloc *copy; + + if (!rcu_in_progress) { + /* Fast path. No callbacks are pending, hence no need to do + * rcu_barrier-s. + */ + free_mem_alloc_no_barrier(ma); + return; + } + + copy = kmalloc(sizeof(*ma), GFP_KERNEL); + if (!copy) { + /* Slow path with inline barrier-s */ + free_mem_alloc(ma); + return; + } + + /* Defer barriers into worker to let the rest of map memory to be freed */ + copy->cache = ma->cache; + ma->cache = NULL; + copy->caches = ma->caches; + ma->caches = NULL; + INIT_WORK(©->work, free_mem_alloc_deferred); + queue_work(system_unbound_wq, ©->work); +} + +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i, rcu_in_progress; + + if (ma->cache) { + rcu_in_progress = 0; + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); + } + /* objcg is the same across cpus */ + if (c->objcg) + obj_cgroup_put(c->objcg); + destroy_mem_alloc(ma, rcu_in_progress); + } + if (ma->caches) { + rcu_in_progress = 0; + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); + } + } + if (c->objcg) + obj_cgroup_put(c->objcg); + destroy_mem_alloc(ma, rcu_in_progress); + } +} + +/* notrace is necessary here and in other functions to make sure + * bpf programs cannot attach to them and cause llist corruptions. + */ +static void notrace *unit_alloc(struct bpf_mem_cache *c) +{ + struct llist_node *llnode = NULL; + unsigned long flags; + int cnt = 0; + + /* Disable irqs to prevent the following race for majority of prog types: + * prog_A + * bpf_mem_alloc + * preemption or irq -> prog_B + * bpf_mem_alloc + * + * but prog_B could be a perf_event NMI prog. + * Use per-cpu 'active' counter to order free_list access between + * unit_alloc/unit_free/bpf_mem_refill. + */ + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + llnode = __llist_del_first(&c->free_llist); + if (llnode) + cnt = --c->free_cnt; + } + local_dec(&c->active); + local_irq_restore(flags); + + WARN_ON(cnt < 0); + + if (cnt < c->low_watermark) + irq_work_raise(c); + return llnode; +} + +/* Though 'ptr' object could have been allocated on a different cpu + * add it to the free_llist of the current cpu. + * Let kfree() logic deal with it when it's later called from irq_work. + */ +static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) +{ + struct llist_node *llnode = ptr - LLIST_NODE_SZ; + unsigned long flags; + int cnt = 0; + + BUILD_BUG_ON(LLIST_NODE_SZ > 8); + + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + __llist_add(llnode, &c->free_llist); + cnt = ++c->free_cnt; + } else { + /* unit_free() cannot fail. Therefore add an object to atomic + * llist. free_bulk() will drain it. Though free_llist_extra is + * a per-cpu list we have to use atomic llist_add here, since + * it also can be interrupted by bpf nmi prog that does another + * unit_free() into the same free_llist_extra. + */ + llist_add(llnode, &c->free_llist_extra); + } + local_dec(&c->active); + local_irq_restore(flags); + + if (cnt > c->high_watermark) + /* free few objects from current cpu into global kmalloc pool */ + irq_work_raise(c); +} + +/* Called from BPF program or from sys_bpf syscall. + * In both cases migration is disabled. + */ +void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) +{ + int idx; + void *ret; + + if (!size) + return ZERO_SIZE_PTR; + + idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ); + if (idx < 0) + return NULL; + + ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx); + return !ret ? NULL : ret + LLIST_NODE_SZ; +} + +void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) +{ + int idx; + + if (!ptr) + return; + + idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + if (idx < 0) + return; + + unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); +} + +void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) +{ + void *ret; + + ret = unit_alloc(this_cpu_ptr(ma->cache)); + return !ret ? NULL : ret + LLIST_NODE_SZ; +} + +void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) +{ + if (!ptr) + return; + + unit_free(this_cpu_ptr(ma->cache), ptr); +} diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index bd09290e3648..13e4efc971e6 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -372,7 +372,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); - offmap = kzalloc(sizeof(*offmap), GFP_USER); + offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE); if (!offmap) return ERR_PTR(-ENOMEM); @@ -404,7 +404,7 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) err_unlock: up_write(&bpf_devs_lock); rtnl_unlock(); - kfree(offmap); + bpf_map_area_free(offmap); return ERR_PTR(err); } @@ -428,7 +428,7 @@ void bpf_map_offload_map_free(struct bpf_map *map) up_write(&bpf_devs_lock); rtnl_unlock(); - kfree(offmap); + bpf_map_area_free(offmap); } int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 3d897de89061..b6e7f5c5b9ab 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -31,7 +31,7 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { node->next = head->first; - head->first = node; + WRITE_ONCE(head->first, node); } static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, @@ -58,23 +58,21 @@ static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, { int cpu, orig_cpu; - orig_cpu = cpu = raw_smp_processor_id(); + orig_cpu = raw_smp_processor_id(); while (1) { - struct pcpu_freelist_head *head; + for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { + struct pcpu_freelist_head *head; - head = per_cpu_ptr(s->freelist, cpu); - if (raw_spin_trylock(&head->lock)) { - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); - return; + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + pcpu_freelist_push_node(head, node); + raw_spin_unlock(&head->lock); + return; + } } - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; /* cannot lock any per cpu lock, try extralist */ - if (cpu == orig_cpu && - pcpu_freelist_try_push_extra(s, node)) + if (pcpu_freelist_try_push_extra(s, node)) return; } } @@ -125,31 +123,29 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - int orig_cpu, cpu; + int cpu; - orig_cpu = cpu = raw_smp_processor_id(); - while (1) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); + if (!READ_ONCE(head->first)) + continue; raw_spin_lock(&head->lock); node = head->first; if (node) { - head->first = node->next; + WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; - if (cpu == orig_cpu) - break; } /* per cpu lists are all empty, try extralist */ + if (!READ_ONCE(s->extralist.first)) + return NULL; raw_spin_lock(&s->extralist.lock); node = s->extralist.first; if (node) - s->extralist.first = node->next; + WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } @@ -159,33 +155,29 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - int orig_cpu, cpu; + int cpu; - orig_cpu = cpu = raw_smp_processor_id(); - while (1) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); + if (!READ_ONCE(head->first)) + continue; if (raw_spin_trylock(&head->lock)) { node = head->first; if (node) { - head->first = node->next; + WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); } - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - cpu = 0; - if (cpu == orig_cpu) - break; } /* cannot pop from per cpu lists, try extralist */ - if (!raw_spin_trylock(&s->extralist.lock)) + if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) return NULL; node = s->extralist.first; if (node) - s->extralist.first = node->next; + WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile index bfe24f8c5a20..6762b1260f2f 100644 --- a/kernel/bpf/preload/iterators/Makefile +++ b/kernel/bpf/preload/iterators/Makefile @@ -9,7 +9,7 @@ LLVM_STRIP ?= llvm-strip TOOLS_PATH := $(abspath ../../../../tools) BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool BPFTOOL_OUTPUT := $(abs_out)/bpftool -DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool +DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool BPFTOOL ?= $(DEFAULT_BPFTOOL) LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf @@ -61,9 +61,5 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OU OUTPUT=$(abspath $(dir $@))/ prefix= \ DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers -$(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT) - $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) \ - OUTPUT=$(BPFTOOL_OUTPUT)/ \ - LIBBPF_OUTPUT=$(LIBBPF_OUTPUT)/ \ - LIBBPF_DESTDIR=$(LIBBPF_DESTDIR)/ \ - prefix= DESTDIR=$(abs_out)/ install-bin +$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f9c734aaa990..8a5e060de63b 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -8,6 +8,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/capability.h> +#include <linux/btf_ids.h> #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -77,8 +78,6 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) if (!qs) return ERR_PTR(-ENOMEM); - memset(qs, 0, sizeof(*qs)); - bpf_map_init_from_attr(&qs->map, attr); qs->size = size; @@ -247,7 +246,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, return -EINVAL; } -static int queue_map_btf_id; +BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack) const struct bpf_map_ops queue_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, @@ -260,11 +259,9 @@ const struct bpf_map_ops queue_map_ops = { .map_pop_elem = queue_map_pop_elem, .map_peek_elem = queue_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, - .map_btf_name = "bpf_queue_stack", - .map_btf_id = &queue_map_btf_id, + .map_btf_id = &queue_map_btf_ids[0], }; -static int stack_map_btf_id; const struct bpf_map_ops stack_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, @@ -277,6 +274,5 @@ const struct bpf_map_ops stack_map_ops = { .map_pop_elem = stack_map_pop_elem, .map_peek_elem = stack_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, - .map_btf_name = "bpf_queue_stack", - .map_btf_id = &stack_map_btf_id, + .map_btf_id = &queue_map_btf_ids[0], }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 8251243022a2..82c61612f382 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -6,6 +6,7 @@ #include <linux/err.h> #include <linux/sock_diag.h> #include <net/sock_reuseport.h> +#include <linux/btf_ids.h> struct reuseport_array { struct bpf_map map; @@ -20,14 +21,11 @@ static struct reuseport_array *reuseport_array(struct bpf_map *map) /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { - uintptr_t sk_user_data; + struct sock __rcu **socks; write_lock_bh(&sk->sk_callback_lock); - sk_user_data = (uintptr_t)sk->sk_user_data; - if (sk_user_data & SK_USER_DATA_BPF) { - struct sock __rcu **socks; - - socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); + socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); + if (socks) { WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of @@ -337,7 +335,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, return 0; } -static int reuseport_array_map_btf_id; +BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array) const struct bpf_map_ops reuseport_array_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, @@ -346,6 +344,5 @@ const struct bpf_map_ops reuseport_array_ops = { .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, - .map_btf_name = "reuseport_array", - .map_btf_id = &reuseport_array_map_btf_id, + .map_btf_id = &reuseport_array_map_btf_ids[0], }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 710ba9de12ce..9e832acf4692 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -10,6 +10,7 @@ #include <linux/poll.h> #include <linux/kmemleak.h> #include <uapi/linux/btf.h> +#include <linux/btf_ids.h> #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) @@ -37,10 +38,43 @@ struct bpf_ringbuf { struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; - /* Consumer and producer counters are put into separate pages to allow - * mapping consumer page as r/w, but restrict producer page to r/o. - * This protects producer position from being modified by user-space - * application and ruining in-kernel position tracking. + /* For user-space producer ring buffers, an atomic_t busy bit is used + * to synchronize access to the ring buffers in the kernel, rather than + * the spinlock that is used for kernel-producer ring buffers. This is + * done because the ring buffer must hold a lock across a BPF program's + * callback: + * + * __bpf_user_ringbuf_peek() // lock acquired + * -> program callback_fn() + * -> __bpf_user_ringbuf_sample_release() // lock released + * + * It is unsafe and incorrect to hold an IRQ spinlock across what could + * be a long execution window, so we instead simply disallow concurrent + * access to the ring buffer by kernel consumers, and return -EBUSY from + * __bpf_user_ringbuf_peek() if the busy bit is held by another task. + */ + atomic_t busy ____cacheline_aligned_in_smp; + /* Consumer and producer counters are put into separate pages to + * allow each position to be mapped with different permissions. + * This prevents a user-space application from modifying the + * position and ruining in-kernel tracking. The permissions of the + * pages depend on who is producing samples: user-space or the + * kernel. + * + * Kernel-producer + * --------------- + * The producer position and data pages are mapped as r/o in + * userspace. For this approach, bits in the header of samples are + * used to signal to user-space, and to other producers, whether a + * sample is currently being written. + * + * User-space producer + * ------------------- + * Only the page containing the consumer position is mapped r/o in + * user-space. User-space producers also use bits of the header to + * communicate to the kernel, but the kernel must carefully check and + * validate each sample to ensure that they're correctly formatted, and + * fully contained within the ring buffer. */ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); @@ -115,7 +149,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) err_free_pages: for (i = 0; i < nr_pages; i++) __free_page(pages[i]); - kvfree(pages); + bpf_map_area_free(pages); return NULL; } @@ -135,6 +169,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) return NULL; spin_lock_init(&rb->spinlock); + atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -163,7 +198,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); #endif - rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT); + rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE); if (!rb_map) return ERR_PTR(-ENOMEM); @@ -171,7 +206,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); if (!rb_map->rb) { - kfree(rb_map); + bpf_map_area_free(rb_map); return ERR_PTR(-ENOMEM); } @@ -189,7 +224,7 @@ static void bpf_ringbuf_free(struct bpf_ringbuf *rb) vunmap(rb); for (i = 0; i < nr_pages; i++) __free_page(pages[i]); - kvfree(pages); + bpf_map_area_free(pages); } static void ringbuf_map_free(struct bpf_map *map) @@ -198,7 +233,7 @@ static void ringbuf_map_free(struct bpf_map *map) rb_map = container_of(map, struct bpf_ringbuf_map, map); bpf_ringbuf_free(rb_map->rb); - kfree(rb_map); + bpf_map_area_free(rb_map); } static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) @@ -223,7 +258,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) +static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; @@ -241,6 +276,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + RINGBUF_PGOFF); } +static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + if (vma->vm_flags & VM_WRITE) { + if (vma->vm_pgoff == 0) + /* Disallow writable mappings to the consumer pointer, + * and allow writable mappings to both the producer + * position, and the ring buffer data itself. + */ + return -EPERM; + } else { + vma->vm_flags &= ~VM_MAYWRITE; + } + /* remap_vmalloc_range() checks size and offset constraints */ + return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); +} + static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { unsigned long cons_pos, prod_pos; @@ -250,8 +305,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) return prod_pos - cons_pos; } -static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, - struct poll_table_struct *pts) +static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) +{ + return rb->mask + 1; +} + +static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; @@ -263,19 +323,45 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, return 0; } -static int ringbuf_map_btf_id; +static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, + struct poll_table_struct *pts) +{ + struct bpf_ringbuf_map *rb_map; + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + poll_wait(filp, &rb_map->rb->waitq, pts); + + if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb)) + return EPOLLOUT | EPOLLWRNORM; + return 0; +} + +BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, - .map_mmap = ringbuf_map_mmap, - .map_poll = ringbuf_map_poll, + .map_mmap = ringbuf_map_mmap_kern, + .map_poll = ringbuf_map_poll_kern, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, - .map_btf_name = "bpf_ringbuf_map", - .map_btf_id = &ringbuf_map_btf_id, + .map_btf_id = &ringbuf_map_btf_ids[0], +}; + +BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) +const struct bpf_map_ops user_ringbuf_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc = ringbuf_map_alloc, + .map_free = ringbuf_map_free, + .map_mmap = ringbuf_map_mmap_user, + .map_poll = ringbuf_map_poll_user, + .map_lookup_elem = ringbuf_map_lookup_elem, + .map_update_elem = ringbuf_map_update_elem, + .map_delete_elem = ringbuf_map_delete_elem, + .map_get_next_key = ringbuf_map_get_next_key, + .map_btf_id = &user_ringbuf_map_btf_ids[0], }; /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, @@ -312,7 +398,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); - if (len > rb->mask + 1) + if (len > ringbuf_total_data_sz(rb)) return NULL; cons_pos = smp_load_acquire(&rb->consumer_pos); @@ -404,7 +490,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_submit_proto = { .func = bpf_ringbuf_submit, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; @@ -417,7 +503,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_discard_proto = { .func = bpf_ringbuf_discard, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM, + .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; @@ -459,7 +545,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) case BPF_RB_AVAIL_DATA: return ringbuf_avail_data_sz(rb); case BPF_RB_RING_SIZE: - return rb->mask + 1; + return ringbuf_total_data_sz(rb); case BPF_RB_CONS_POS: return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: @@ -475,3 +561,216 @@ const struct bpf_func_proto bpf_ringbuf_query_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags, + struct bpf_dynptr_kern *, ptr) +{ + struct bpf_ringbuf_map *rb_map; + void *sample; + int err; + + if (unlikely(flags)) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + err = bpf_dynptr_check_size(size); + if (err) { + bpf_dynptr_set_null(ptr); + return err; + } + + rb_map = container_of(map, struct bpf_ringbuf_map, map); + + sample = __bpf_ringbuf_reserve(rb_map->rb, size); + if (!sample) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { + .func = bpf_ringbuf_reserve_dynptr, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, +}; + +BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + bpf_ringbuf_commit(ptr->data, flags, false /* discard */); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = { + .func = bpf_ringbuf_submit_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + bpf_ringbuf_commit(ptr->data, flags, true /* discard */); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { + .func = bpf_ringbuf_discard_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, + .arg2_type = ARG_ANYTHING, +}; + +static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size) +{ + int err; + u32 hdr_len, sample_len, total_len, flags, *hdr; + u64 cons_pos, prod_pos; + + /* Synchronizes with smp_store_release() in user-space producer. */ + prod_pos = smp_load_acquire(&rb->producer_pos); + if (prod_pos % 8) + return -EINVAL; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */ + cons_pos = smp_load_acquire(&rb->consumer_pos); + if (cons_pos >= prod_pos) + return -ENODATA; + + hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask)); + /* Synchronizes with smp_store_release() in user-space producer. */ + hdr_len = smp_load_acquire(hdr); + flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT); + sample_len = hdr_len & ~flags; + total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8); + + /* The sample must fit within the region advertised by the producer position. */ + if (total_len > prod_pos - cons_pos) + return -EINVAL; + + /* The sample must fit within the data region of the ring buffer. */ + if (total_len > ringbuf_total_data_sz(rb)) + return -E2BIG; + + /* The sample must fit into a struct bpf_dynptr. */ + err = bpf_dynptr_check_size(sample_len); + if (err) + return -E2BIG; + + if (flags & BPF_RINGBUF_DISCARD_BIT) { + /* If the discard bit is set, the sample should be skipped. + * + * Update the consumer pos, and return -EAGAIN so the caller + * knows to skip this sample and try to read the next one. + */ + smp_store_release(&rb->consumer_pos, cons_pos + total_len); + return -EAGAIN; + } + + if (flags & BPF_RINGBUF_BUSY_BIT) + return -ENODATA; + + *sample = (void *)((uintptr_t)rb->data + + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask)); + *size = sample_len; + return 0; +} + +static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags) +{ + u64 consumer_pos; + u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + + /* Using smp_load_acquire() is unnecessary here, as the busy-bit + * prevents another task from writing to consumer_pos after it was read + * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek(). + */ + consumer_pos = rb->consumer_pos; + /* Synchronizes with smp_load_acquire() in user-space producer. */ + smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size); +} + +BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map, + void *, callback_fn, void *, callback_ctx, u64, flags) +{ + struct bpf_ringbuf *rb; + long samples, discarded_samples = 0, ret = 0; + bpf_callback_t callback = (bpf_callback_t)callback_fn; + u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP; + int busy = 0; + + if (unlikely(flags & ~wakeup_flags)) + return -EINVAL; + + rb = container_of(map, struct bpf_ringbuf_map, map)->rb; + + /* If another consumer is already consuming a sample, wait for them to finish. */ + if (!atomic_try_cmpxchg(&rb->busy, &busy, 1)) + return -EBUSY; + + for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) { + int err; + u32 size; + void *sample; + struct bpf_dynptr_kern dynptr; + + err = __bpf_user_ringbuf_peek(rb, &sample, &size); + if (err) { + if (err == -ENODATA) { + break; + } else if (err == -EAGAIN) { + discarded_samples++; + continue; + } else { + ret = err; + goto schedule_work_return; + } + } + + bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size); + ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0); + __bpf_user_ringbuf_sample_release(rb, size, flags); + } + ret = samples - discarded_samples; + +schedule_work_return: + /* Prevent the clearing of the busy-bit from being reordered before the + * storing of any rb consumer or producer positions. + */ + smp_mb__before_atomic(); + atomic_set(&rb->busy, 0); + + if (flags & BPF_RB_FORCE_WAKEUP) + irq_work_queue(&rb->work); + else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0) + irq_work_queue(&rb->work); + return ret; +} + +const struct bpf_func_proto bpf_user_ringbuf_drain_proto = { + .func = bpf_user_ringbuf_drain, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 34725bfa1e97..1adbe67cdb95 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -100,13 +100,11 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); - smap->map.value_size = value_size; smap->n_buckets = n_buckets; err = get_callchain_buffers(sysctl_perf_event_max_stack); @@ -656,7 +654,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } -static int stack_trace_map_btf_id; +BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map) const struct bpf_map_ops stack_trace_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = stack_map_alloc, @@ -666,6 +664,5 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_stack_map", - .map_btf_id = &stack_trace_map_btf_id, + .map_btf_id = &stack_trace_map_btf_ids[0], }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cdaa1152436a..7b373a5e861f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6,6 +6,7 @@ #include <linux/bpf_trace.h> #include <linux/bpf_lirc.h> #include <linux/bpf_verifier.h> +#include <linux/bsearch.h> #include <linux/btf.h> #include <linux/syscalls.h> #include <linux/slab.h> @@ -29,6 +30,7 @@ #include <linux/pgtable.h> #include <linux/bpf_lsm.h> #include <linux/poll.h> +#include <linux/sort.h> #include <linux/bpf-netns.h> #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> @@ -417,35 +419,53 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) #ifdef CONFIG_MEMCG_KMEM static void bpf_map_save_memcg(struct bpf_map *map) { - map->memcg = get_mem_cgroup_from_mm(current->mm); + /* Currently if a map is created by a process belonging to the root + * memory cgroup, get_obj_cgroup_from_current() will return NULL. + * So we have to check map->objcg for being NULL each time it's + * being used. + */ + map->objcg = get_obj_cgroup_from_current(); } static void bpf_map_release_memcg(struct bpf_map *map) { - mem_cgroup_put(map->memcg); + if (map->objcg) + obj_cgroup_put(map->objcg); +} + +static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) +{ + if (map->objcg) + return get_mem_cgroup_from_objcg(map->objcg); + + return root_mem_cgroup; } void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { - struct mem_cgroup *old_memcg; + struct mem_cgroup *memcg, *old_memcg; void *ptr; - old_memcg = set_active_memcg(map->memcg); + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); set_active_memcg(old_memcg); + mem_cgroup_put(memcg); return ptr; } void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { - struct mem_cgroup *old_memcg; + struct mem_cgroup *memcg, *old_memcg; void *ptr; - old_memcg = set_active_memcg(map->memcg); + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); + mem_cgroup_put(memcg); return ptr; } @@ -453,12 +473,14 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) { - struct mem_cgroup *old_memcg; + struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; - old_memcg = set_active_memcg(map->memcg); + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); + mem_cgroup_put(memcg); return ptr; } @@ -473,14 +495,128 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif +static int bpf_map_kptr_off_cmp(const void *a, const void *b) +{ + const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b; + + if (off_desc1->offset < off_desc2->offset) + return -1; + else if (off_desc1->offset > off_desc2->offset) + return 1; + return 0; +} + +struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset) +{ + /* Since members are iterated in btf_find_field in increasing order, + * offsets appended to kptr_off_tab are in increasing order, so we can + * do bsearch to find exact match. + */ + struct bpf_map_value_off *tab; + + if (!map_value_has_kptrs(map)) + return NULL; + tab = map->kptr_off_tab; + return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp); +} + +void bpf_map_free_kptr_off_tab(struct bpf_map *map) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab; + int i; + + if (!map_value_has_kptrs(map)) + return; + for (i = 0; i < tab->nr_off; i++) { + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); + btf_put(tab->off[i].kptr.btf); + } + kfree(tab); + map->kptr_off_tab = NULL; +} + +struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab; + int size, i; + + if (!map_value_has_kptrs(map)) + return ERR_PTR(-ENOENT); + size = offsetof(struct bpf_map_value_off, off[tab->nr_off]); + new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN); + if (!new_tab) + return ERR_PTR(-ENOMEM); + /* Do a deep copy of the kptr_off_tab */ + for (i = 0; i < tab->nr_off; i++) { + btf_get(tab->off[i].kptr.btf); + if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) { + while (i--) { + if (tab->off[i].kptr.module) + module_put(tab->off[i].kptr.module); + btf_put(tab->off[i].kptr.btf); + } + kfree(new_tab); + return ERR_PTR(-ENXIO); + } + } + return new_tab; +} + +bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b) +{ + struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab; + bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b); + int size; + + if (!a_has_kptr && !b_has_kptr) + return true; + if (a_has_kptr != b_has_kptr) + return false; + if (tab_a->nr_off != tab_b->nr_off) + return false; + size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]); + return !memcmp(tab_a, tab_b, size); +} + +/* Caller must ensure map_value_has_kptrs is true. Note that this function can + * be called on a map value while the map_value is visible to BPF programs, as + * it ensures the correct synchronization, and we already enforce the same using + * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs. + */ +void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) +{ + struct bpf_map_value_off *tab = map->kptr_off_tab; + unsigned long *btf_id_ptr; + int i; + + for (i = 0; i < tab->nr_off; i++) { + struct bpf_map_value_off_desc *off_desc = &tab->off[i]; + unsigned long old_ptr; + + btf_id_ptr = map_value + off_desc->offset; + if (off_desc->type == BPF_KPTR_UNREF) { + u64 *p = (u64 *)btf_id_ptr; + + WRITE_ONCE(*p, 0); + continue; + } + old_ptr = xchg(btf_id_ptr, 0); + off_desc->kptr.dtor((void *)old_ptr); + } +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); security_bpf_map_free(map); + kfree(map->off_arr); bpf_map_release_memcg(map); - /* implementation dependent freeing */ + /* implementation dependent freeing, map_free callback also does + * bpf_map_free_kptr_off_tab, if needed. + */ map->ops->map_free(map); } @@ -502,7 +638,10 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) bpf_map_free_id(map, do_idr_lock); btf_put(map->btf); INIT_WORK(&map->work, bpf_map_free_deferred); - schedule_work(&map->work); + /* Avoid spawning kworkers, since they all might contend + * for the same mutex like slab_mutex. + */ + queue_work(system_unbound_wq, &map->work); } } @@ -640,7 +779,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) int err; if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map)) + map_value_has_timer(map) || map_value_has_kptrs(map)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -767,6 +906,84 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } +static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) +{ + const u32 a = *(const u32 *)_a; + const u32 b = *(const u32 *)_b; + + if (a < b) + return -1; + else if (a > b) + return 1; + return 0; +} + +static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) +{ + struct bpf_map *map = (struct bpf_map *)priv; + u32 *off_base = map->off_arr->field_off; + u32 *a = _a, *b = _b; + u8 *sz_a, *sz_b; + + sz_a = map->off_arr->field_sz + (a - off_base); + sz_b = map->off_arr->field_sz + (b - off_base); + + swap(*a, *b); + swap(*sz_a, *sz_b); +} + +static int bpf_map_alloc_off_arr(struct bpf_map *map) +{ + bool has_spin_lock = map_value_has_spin_lock(map); + bool has_timer = map_value_has_timer(map); + bool has_kptrs = map_value_has_kptrs(map); + struct bpf_map_off_arr *off_arr; + u32 i; + + if (!has_spin_lock && !has_timer && !has_kptrs) { + map->off_arr = NULL; + return 0; + } + + off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); + if (!off_arr) + return -ENOMEM; + map->off_arr = off_arr; + + off_arr->cnt = 0; + if (has_spin_lock) { + i = off_arr->cnt; + + off_arr->field_off[i] = map->spin_lock_off; + off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); + off_arr->cnt++; + } + if (has_timer) { + i = off_arr->cnt; + + off_arr->field_off[i] = map->timer_off; + off_arr->field_sz[i] = sizeof(struct bpf_timer); + off_arr->cnt++; + } + if (has_kptrs) { + struct bpf_map_value_off *tab = map->kptr_off_tab; + u32 *off = &off_arr->field_off[off_arr->cnt]; + u8 *sz = &off_arr->field_sz[off_arr->cnt]; + + for (i = 0; i < tab->nr_off; i++) { + *off++ = tab->off[i].offset; + *sz++ = sizeof(u64); + } + off_arr->cnt += tab->nr_off; + } + + if (off_arr->cnt == 1) + return 0; + sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), + map_off_arr_cmp, map_off_arr_swap, map); + return 0; +} + static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { @@ -820,10 +1037,35 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EOPNOTSUPP; } - if (map->ops->map_check_btf) + map->kptr_off_tab = btf_parse_kptrs(btf, value_type); + if (map_value_has_kptrs(map)) { + if (!bpf_capable()) { + ret = -EPERM; + goto free_map_tab; + } + if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { + ret = -EACCES; + goto free_map_tab; + } + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + } + + if (map->ops->map_check_btf) { ret = map->ops->map_check_btf(map, btf, key_type, value_type); + if (ret < 0) + goto free_map_tab; + } return ret; +free_map_tab: + bpf_map_free_kptr_off_tab(map); + return ret; } #define BPF_MAP_CREATE_LAST_FIELD map_extra @@ -912,10 +1154,14 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_alloc(map); + err = bpf_map_alloc_off_arr(map); if (err) goto free_map; + err = security_bpf_map_alloc(map); + if (err) + goto free_map_off_arr; + err = bpf_map_alloc_id(map); if (err) goto free_map_sec; @@ -938,6 +1184,8 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); +free_map_off_arr: + kfree(map->off_arr); free_map: btf_put(map->btf); map->ops->map_free(map); @@ -1169,19 +1417,14 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } value_size = bpf_map_value_size(map); - - err = -ENOMEM; - value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); - if (!value) + value = kvmemdup_bpfptr(uvalue, value_size); + if (IS_ERR(value)) { + err = PTR_ERR(value); goto free_key; - - err = -EFAULT; - if (copy_from_bpfptr(value, uvalue, value_size) != 0) - goto free_value; + } err = bpf_map_update_value(map, f, key, value, attr->flags); -free_value: kvfree(value); free_key: kvfree(key); @@ -1193,9 +1436,9 @@ err_put: #define BPF_MAP_DELETE_ELEM_LAST_FIELD key -static int map_delete_elem(union bpf_attr *attr) +static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) { - void __user *ukey = u64_to_user_ptr(attr->key); + bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); int ufd = attr->map_fd; struct bpf_map *map; struct fd f; @@ -1215,7 +1458,7 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } - key = __bpf_copy_key(ukey, map->key_size); + key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -1639,7 +1882,7 @@ static int map_freeze(const union bpf_attr *attr) return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map)) { + map_value_has_timer(map) || map_value_has_kptrs(map)) { fdput(f); return -ENOTSUPP; } @@ -1850,6 +2093,17 @@ struct bpf_prog_kstats { u64 misses; }; +void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) +{ + struct bpf_prog_stats *stats; + unsigned int flags; + + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->misses); + u64_stats_update_end_irqrestore(&stats->syncp, flags); +} + static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_kstats *stats) { @@ -2640,19 +2894,12 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) } EXPORT_SYMBOL(bpf_link_get_from_fd); -struct bpf_tracing_link { - struct bpf_link link; - enum bpf_attach_type attach_type; - struct bpf_trampoline *trampoline; - struct bpf_prog *tgt_prog; -}; - static void bpf_tracing_link_release(struct bpf_link *link) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); - WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog, + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, tr_link->trampoline)); bpf_trampoline_put(tr_link->trampoline); @@ -2665,7 +2912,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) static void bpf_tracing_link_dealloc(struct bpf_link *link) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } @@ -2674,7 +2921,7 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); seq_printf(seq, "attach_type:\t%d\n", @@ -2685,7 +2932,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link); + container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = tr_link->attach_type; bpf_trampoline_unpack_key(tr_link->trampoline->key, @@ -2704,7 +2951,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, - u32 btf_id) + u32 btf_id, + u64 bpf_cookie) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; @@ -2766,9 +3014,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING, + bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; + link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); @@ -2836,11 +3085,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, tgt_prog = prog->aux->dst_prog; } - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link.link, &link_primer); if (err) goto out_unlock; - err = bpf_trampoline_link_prog(prog, tr); + err = bpf_trampoline_link_prog(&link->link, tr); if (err) { bpf_link_cleanup(&link_primer); link = NULL; @@ -3030,66 +3279,45 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro } #endif /* CONFIG_PERF_EVENTS */ -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd - -static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +static int bpf_raw_tp_link_attach(struct bpf_prog *prog, + const char __user *user_tp_name) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; - struct bpf_prog *prog; const char *tp_name; char buf[128]; int err; - if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) - return -EINVAL; - - prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - switch (prog->type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_LSM: - if (attr->raw_tracepoint.name) { + if (user_tp_name) /* The attach point for this category of programs * should be specified via btf_id during program load. */ - err = -EINVAL; - goto out_put_prog; - } + return -EINVAL; if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) { tp_name = prog->aux->attach_func_name; break; } - err = bpf_tracing_prog_attach(prog, 0, 0); - if (err >= 0) - return err; - goto out_put_prog; + return bpf_tracing_prog_attach(prog, 0, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: - if (strncpy_from_user(buf, - u64_to_user_ptr(attr->raw_tracepoint.name), - sizeof(buf) - 1) < 0) { - err = -EFAULT; - goto out_put_prog; - } + if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) + return -EFAULT; buf[sizeof(buf) - 1] = 0; tp_name = buf; break; default: - err = -EINVAL; - goto out_put_prog; + return -EINVAL; } btp = bpf_get_raw_tracepoint(tp_name); - if (!btp) { - err = -ENOENT; - goto out_put_prog; - } + if (!btp) + return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER); if (!link) { @@ -3116,11 +3344,29 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) out_put_btp: bpf_put_raw_tracepoint(btp); -out_put_prog: - bpf_prog_put(prog); return err; } +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd + +static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + int fd; + + if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) + return -EINVAL; + + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); + if (fd < 0) + bpf_prog_put(prog); + return fd; +} + static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { @@ -3189,11 +3435,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: + case BPF_TRACE_RAW_TP: + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; + case BPF_LSM_MAC: + return BPF_PROG_TYPE_LSM; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: return BPF_PROG_TYPE_XDP; + case BPF_LSM_CGROUP: + return BPF_PROG_TYPE_LSM; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3247,6 +3501,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: + if (ptype == BPF_PROG_TYPE_LSM && + prog->expected_attach_type != BPF_LSM_CGROUP) + return -EINVAL; + ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; default: @@ -3284,13 +3543,14 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_LSM: return cgroup_bpf_prog_detach(attr, ptype); default: return -EINVAL; } } -#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt +#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3326,6 +3586,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_SYSCTL: case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: + case BPF_LSM_CGROUP: return cgroup_bpf_prog_query(attr, uattr); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); @@ -3635,6 +3896,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct btf *attach_btf = bpf_prog_get_target_btf(prog); struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_kstats stats; @@ -3836,6 +4098,9 @@ static int bpf_prog_get_info_by_fd(struct file *file, if (prog->aux->btf) info.btf_id = btf_obj_id(prog->aux->btf); + info.attach_btf_id = prog->aux->attach_btf_id; + if (attach_btf) + info.attach_btf_obj_id = btf_obj_id(attach_btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; @@ -3868,14 +4133,15 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { + unsigned long line_addr; __u64 __user *user_linfo; u32 i; user_linfo = u64_to_user_ptr(info.jited_line_info); ulen = min_t(u32, info.nr_jited_line_info, ulen); for (i = 0; i < ulen; i++) { - if (put_user((__u64)(long)prog->aux->jited_linfo[i], - &user_linfo[i])) + line_addr = (unsigned long)prog->aux->jited_linfo[i]; + if (put_user((__u64)line_addr, &user_linfo[i])) return -EFAULT; } } else { @@ -4139,7 +4405,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (attr->task_fd_query.flags != 0) return -EINVAL; + rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + rcu_read_unlock(); if (!task) return -ENOENT; @@ -4246,21 +4514,6 @@ err_put: return err; } -static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, - struct bpf_prog *prog) -{ - if (attr->link_create.attach_type != prog->expected_attach_type) - return -EINVAL; - - if (prog->expected_attach_type == BPF_TRACE_ITER) - return bpf_iter_link_attach(attr, uattr, prog); - else if (prog->type == BPF_PROG_TYPE_EXT) - return bpf_tracing_prog_attach(prog, - attr->link_create.target_fd, - attr->link_create.target_btf_id); - return -EINVAL; -} - #define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies static int link_create(union bpf_attr *attr, bpfptr_t uattr) { @@ -4282,15 +4535,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) switch (prog->type) { case BPF_PROG_TYPE_EXT: - ret = tracing_bpf_link_attach(attr, uattr, prog); - goto out; + break; case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: if (attr->link_create.attach_type != BPF_PERF_EVENT) { ret = -EINVAL; goto out; } - ptype = prog->type; break; case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type != BPF_PERF_EVENT && @@ -4298,7 +4549,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = -EINVAL; goto out; } - ptype = prog->type; break; default: ptype = attach_type_to_prog_type(attr->link_create.attach_type); @@ -4309,7 +4559,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) break; } - switch (ptype) { + switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: @@ -4319,8 +4569,29 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_EXT: + ret = bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); + break; + case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: - ret = tracing_bpf_link_attach(attr, uattr, prog); + if (attr->link_create.attach_type != prog->expected_attach_type) { + ret = -EINVAL; + goto out; + } + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) + ret = bpf_raw_tp_link_attach(prog, NULL); + else if (prog->expected_attach_type == BPF_TRACE_ITER) + ret = bpf_iter_link_attach(attr, uattr, prog); + else if (prog->expected_attach_type == BPF_LSM_CGROUP) + ret = cgroup_bpf_link_attach(attr, prog); + else + ret = bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id, + attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: @@ -4454,6 +4725,25 @@ struct bpf_link *bpf_link_by_id(u32 id) return link; } +struct bpf_link *bpf_link_get_curr_or_next(u32 *id) +{ + struct bpf_link *link; + + spin_lock_bh(&link_idr_lock); +again: + link = idr_get_next(&link_idr, id); + if (link) { + link = bpf_link_inc_not_zero(link); + if (IS_ERR(link)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&link_idr_lock); + + return link; +} + #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id static int bpf_link_get_fd_by_id(const union bpf_attr *attr) @@ -4621,9 +4911,21 @@ out_prog_put: static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; + bool capable; int err; - if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) + capable = bpf_capable() || !sysctl_unprivileged_bpf_disabled; + + /* Intent here is for unprivileged_bpf_disabled to block key object + * creation commands for unprivileged users; other actions depend + * of fd availability and access to bpffs, so are dependent on + * object creation success. Capabilities are later verified for + * operations such as load and map create, so even with unprivileged + * BPF disabled, capability checks are still carried out for these + * and other operations. + */ + if (!capable && + (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD)) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); @@ -4651,7 +4953,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) err = map_update_elem(&attr, uattr); break; case BPF_MAP_DELETE_ELEM: - err = map_delete_elem(&attr); + err = map_delete_elem(&attr, uattr); break; case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); @@ -4781,17 +5083,37 @@ static bool syscall_prog_is_valid_access(int off, int size, BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { - struct bpf_prog * __maybe_unused prog; - switch (cmd) { case BPF_MAP_CREATE: + case BPF_MAP_DELETE_ELEM: case BPF_MAP_UPDATE_ELEM: case BPF_MAP_FREEZE: + case BPF_MAP_GET_FD_BY_ID: case BPF_PROG_LOAD: case BPF_BTF_LOAD: case BPF_LINK_CREATE: case BPF_RAW_TRACEPOINT_OPEN: break; + default: + return -EINVAL; + } + return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); +} + + +/* To shut up -Wmissing-prototypes. + * This function is used by the kernel light skeleton + * to load bpf programs when modules are loaded or during kernel boot. + * See tools/lib/bpf/skel_internal.h + */ +int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); + +int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) +{ + struct bpf_prog * __maybe_unused prog; + struct bpf_tramp_run_ctx __maybe_unused run_ctx; + + switch (cmd) { #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ case BPF_PROG_TEST_RUN: if (attr->test.data_in || attr->test.data_out || @@ -4809,22 +5131,23 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) return -EINVAL; } - if (!__bpf_prog_enter_sleepable(prog)) { + run_ctx.bpf_cookie = 0; + run_ctx.saved_run_ctx = NULL; + if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) { /* recursion detected */ bpf_prog_put(prog); return -EBUSY; } attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); - __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */); + __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); bpf_prog_put(prog); return 0; #endif default: - return -EINVAL; + return ____bpf_sys_bpf(cmd, attr, size); } - return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } -EXPORT_SYMBOL(bpf_sys_bpf); +EXPORT_SYMBOL(kern_sys_bpf); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, @@ -4873,7 +5196,7 @@ BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flag return *res ? 0 : -ENOENT; } -const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { +static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .func = bpf_kallsyms_lookup_name, .gpl_only = false, .ret_type = RET_INTEGER, @@ -4888,7 +5211,7 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: - return &bpf_sys_bpf_proto; + return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: @@ -4908,3 +5231,90 @@ const struct bpf_verifier_ops bpf_syscall_verifier_ops = { const struct bpf_prog_ops bpf_syscall_prog_ops = { .test_run = bpf_prog_test_run_syscall, }; + +#ifdef CONFIG_SYSCTL +static int bpf_stats_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static int saved_val; + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&bpf_stats_enabled_mutex); + val = saved_val; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret && val != saved_val) { + if (val) + static_key_slow_inc(key); + else + static_key_slow_dec(key); + saved_val = val; + } + mutex_unlock(&bpf_stats_enabled_mutex); + return ret; +} + +void __weak unpriv_ebpf_notify(int new_state) +{ +} + +static int bpf_unpriv_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, unpriv_enable = *(int *)table->data; + bool locked_state = unpriv_enable == 1; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &unpriv_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (locked_state && unpriv_enable != 1) + return -EPERM; + *(int *)table->data = unpriv_enable; + } + + unpriv_ebpf_notify(unpriv_enable); + + return ret; +} + +static struct ctl_table bpf_syscall_table[] = { + { + .procname = "unprivileged_bpf_disabled", + .data = &sysctl_unprivileged_bpf_disabled, + .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), + .mode = 0644, + .proc_handler = bpf_unpriv_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "bpf_stats_enabled", + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), + .mode = 0644, + .proc_handler = bpf_stats_handler, + }, + { } +}; + +static int __init bpf_syscall_sysctl_init(void) +{ + register_sysctl_init("kernel", bpf_syscall_table); + return 0; +} +late_initcall(bpf_syscall_sysctl_init); +#endif /* CONFIG_SYSCTL */ diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index d94696198ef8..67e03e1833ba 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -10,8 +10,17 @@ #include <linux/btf_ids.h> #include "mmap_unlock_work.h" +static const char * const iter_task_type_names[] = { + "ALL", + "TID", + "PID", +}; + struct bpf_iter_seq_task_common { struct pid_namespace *ns; + enum bpf_iter_task_type type; + u32 pid; + u32 pid_visiting; }; struct bpf_iter_seq_task_info { @@ -22,18 +31,115 @@ struct bpf_iter_seq_task_info { u32 tid; }; -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, +static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common, + u32 *tid, + bool skip_if_dup_files) +{ + struct task_struct *task, *next_task; + struct pid *pid; + u32 saved_tid; + + if (!*tid) { + /* The first time, the iterator calls this function. */ + pid = find_pid_ns(common->pid, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_TGID); + if (!task) + return NULL; + + *tid = common->pid; + common->pid_visiting = common->pid; + + return task; + } + + /* If the control returns to user space and comes back to the + * kernel again, *tid and common->pid_visiting should be the + * same for task_seq_start() to pick up the correct task. + */ + if (*tid == common->pid_visiting) { + pid = find_pid_ns(common->pid_visiting, common->ns); + task = get_pid_task(pid, PIDTYPE_PID); + + return task; + } + + pid = find_pid_ns(common->pid_visiting, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return NULL; + +retry: + if (!pid_alive(task)) { + put_task_struct(task); + return NULL; + } + + next_task = next_thread(task); + put_task_struct(task); + if (!next_task) + return NULL; + + saved_tid = *tid; + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); + if (!*tid || *tid == common->pid) { + /* Run out of tasks of a process. The tasks of a + * thread_group are linked as circular linked list. + */ + *tid = saved_tid; + return NULL; + } + + get_task_struct(next_task); + common->pid_visiting = *tid; + + if (skip_if_dup_files && task->files == task->group_leader->files) { + task = next_task; + goto retry; + } + + return next_task; +} + +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, u32 *tid, bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; + if (common->type == BPF_TASK_ITER_TID) { + if (*tid && *tid != common->pid) + return NULL; + rcu_read_lock(); + pid = find_pid_ns(common->pid, common->ns); + if (pid) { + task = get_pid_task(pid, PIDTYPE_TGID); + *tid = common->pid; + } + rcu_read_unlock(); + + return task; + } + + if (common->type == BPF_TASK_ITER_TGID) { + rcu_read_lock(); + task = task_group_seq_get_next(common, tid, skip_if_dup_files); + rcu_read_unlock(); + + return task; + } + rcu_read_lock(); retry: - pid = find_ge_pid(*tid, ns); + pid = find_ge_pid(*tid, common->ns); if (pid) { - *tid = pid_nr_ns(pid, ns); + *tid = pid_nr_ns(pid, common->ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; @@ -56,7 +162,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -73,7 +179,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -99,7 +205,6 @@ static int __task_seq_show(struct seq_file *seq, struct task_struct *task, if (!prog) return 0; - meta.seq = seq; ctx.meta = &meta; ctx.task = task; return bpf_iter_run_prog(prog, &ctx); @@ -118,6 +223,41 @@ static void task_seq_stop(struct seq_file *seq, void *v) put_task_struct((struct task_struct *)v); } +static int bpf_iter_attach_task(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + unsigned int flags; + struct pid *pid; + pid_t tgid; + + if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1) + return -EINVAL; + + aux->task.type = BPF_TASK_ITER_ALL; + if (linfo->task.tid != 0) { + aux->task.type = BPF_TASK_ITER_TID; + aux->task.pid = linfo->task.tid; + } + if (linfo->task.pid != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + aux->task.pid = linfo->task.pid; + } + if (linfo->task.pid_fd != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + tgid = pid_nr_ns(pid, task_active_pid_ns(current)); + aux->task.pid = tgid; + put_pid(pid); + } + + return 0; +} + static const struct seq_operations task_seq_ops = { .start = task_seq_start, .next = task_seq_next, @@ -138,8 +278,7 @@ struct bpf_iter_seq_task_file_info { static struct file * task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { - struct pid_namespace *ns = info->common.ns; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; struct task_struct *curr_task; unsigned int curr_fd = info->fd; @@ -152,21 +291,18 @@ again: curr_task = info->task; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { info->task = NULL; - info->tid = curr_tid; return NULL; } - /* set info->task and info->tid */ + /* set info->task */ info->task = curr_task; - if (curr_tid == info->tid) { + if (saved_tid == info->tid) curr_fd = info->fd; - } else { - info->tid = curr_tid; + else curr_fd = 0; - } } rcu_read_lock(); @@ -187,9 +323,15 @@ again: /* the current task is done, go to the next task */ rcu_read_unlock(); put_task_struct(curr_task); + + if (info->common.type == BPF_TASK_ITER_TID) { + info->task = NULL; + return NULL; + } + info->task = NULL; info->fd = 0; - curr_tid = ++(info->tid); + saved_tid = ++(info->tid); goto again; } @@ -270,6 +412,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) struct bpf_iter_seq_task_common *common = priv_data; common->ns = get_pid_ns(task_active_pid_ns(current)); + common->type = aux->task.type; + common->pid = aux->task.pid; + return 0; } @@ -308,11 +453,10 @@ enum bpf_task_vma_iter_find_op { static struct vm_area_struct * task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) { - struct pid_namespace *ns = info->common.ns; enum bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to * the task_struct, and holds read lock on vma->mm->mmap_lock. @@ -372,14 +516,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) } } else { again: - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { - info->tid = curr_tid + 1; + info->tid++; goto finish; } - if (curr_tid != info->tid) { - info->tid = curr_tid; + if (saved_tid != info->tid) { /* new task, process the first vma */ op = task_vma_iter_first_vma; } else { @@ -431,9 +574,12 @@ again: return curr_vma; next_task: + if (info->common.type == BPF_TASK_ITER_TID) + goto finish; + put_task_struct(curr_task); info->task = NULL; - curr_tid++; + info->tid++; goto again; finish: @@ -532,8 +678,33 @@ static const struct bpf_iter_seq_info task_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), }; +static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info) +{ + switch (aux->task.type) { + case BPF_TASK_ITER_TID: + info->iter.task.tid = aux->task.pid; + break; + case BPF_TASK_ITER_TGID: + info->iter.task.pid = aux->task.pid; + break; + default: + break; + } + return 0; +} + +static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq) +{ + seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]); + if (aux->task.type == BPF_TASK_ITER_TID) + seq_printf(seq, "tid:\t%u\n", aux->task.pid); + else if (aux->task.type == BPF_TASK_ITER_TGID) + seq_printf(seq, "pid:\t%u\n", aux->task.pid); +} + static struct bpf_iter_reg task_reg_info = { .target = "task", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { @@ -541,6 +712,8 @@ static struct bpf_iter_reg task_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_seq_info, + .fill_link_info = bpf_iter_fill_link_info, + .show_fdinfo = bpf_iter_task_show_fdinfo, }; static const struct bpf_iter_seq_info task_file_seq_info = { @@ -552,6 +725,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { @@ -561,6 +735,8 @@ static struct bpf_iter_reg task_file_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_file_seq_info, + .fill_link_info = bpf_iter_fill_link_info, + .show_fdinfo = bpf_iter_task_show_fdinfo, }; static const struct bpf_iter_seq_info task_vma_seq_info = { @@ -572,6 +748,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = { static struct bpf_iter_reg task_vma_reg_info = { .target = "task_vma", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { @@ -581,6 +758,8 @@ static struct bpf_iter_reg task_vma_reg_info = { PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &task_vma_seq_info, + .fill_link_info = bpf_iter_fill_link_info, + .show_fdinfo = bpf_iter_task_show_fdinfo, }; BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index ada97751ae1b..bf0906e1e2b9 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -11,6 +11,9 @@ #include <linux/rcupdate_wait.h> #include <linux/module.h> #include <linux/static_call.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf_lsm.h> +#include <linux/delay.h> /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -27,28 +30,90 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); -bool bpf_prog_has_trampoline(const struct bpf_prog *prog) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); + +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) { - enum bpf_attach_type eatype = prog->expected_attach_type; + struct bpf_trampoline *tr = ops->private; + int ret = 0; - return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN; -} + if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { + /* This is called inside register_ftrace_direct_multi(), so + * tr->mutex is already locked. + */ + lockdep_assert_held_once(&tr->mutex); -void *bpf_jit_alloc_exec_page(void) -{ - void *image; + /* Instead of updating the trampoline here, we propagate + * -EAGAIN to register_ftrace_direct_multi(). Then we can + * retry register_ftrace_direct_multi() after updating the + * trampoline. + */ + if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && + !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) { + if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY)) + return -EBUSY; - image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!image) - return NULL; + tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; + return -EAGAIN; + } - set_vm_flush_reset_perms(image); - /* Keep image as writeable. The alternative is to keep flipping ro/rw - * every time new program is attached or detached. + return 0; + } + + /* The normal locking order is + * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) + * + * The following two commands are called from + * + * prepare_direct_functions_for_ipmodify + * cleanup_direct_functions_after_ipmodify + * + * In both cases, direct_mutex is already locked. Use + * mutex_trylock(&tr->mutex) to avoid deadlock in race condition + * (something else is making changes to this same trampoline). */ - set_memory_x((long)image, 1); - return image; + if (!mutex_trylock(&tr->mutex)) { + /* sleep 1 ms to make sure whatever holding tr->mutex makes + * some progress. + */ + msleep(1); + return -EAGAIN; + } + + switch (cmd) { + case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER: + tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; + + if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && + !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + break; + case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER: + tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY; + + if (tr->flags & BPF_TRAMP_F_ORIG_STACK) + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + break; + default: + ret = -EINVAL; + break; + } + + mutex_unlock(&tr->mutex); + return ret; +} +#endif + +bool bpf_prog_has_trampoline(const struct bpf_prog *prog) +{ + enum bpf_attach_type eatype = prog->expected_attach_type; + enum bpf_prog_type ptype = prog->type; + + return (ptype == BPF_PROG_TYPE_TRACING && + (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN)) || + (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) @@ -84,6 +149,16 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) goto out; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); + if (!tr->fops) { + kfree(tr); + tr = NULL; + goto out; + } + tr->fops->private = tr; + tr->fops->ops_func = bpf_tramp_ftrace_ops_func; +#endif tr->key = key; INIT_HLIST_NODE(&tr->hlist); @@ -123,7 +198,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) int ret; if (tr->func.ftrace_managed) - ret = unregister_ftrace_direct((long)ip, (long)old_addr); + ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr); else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); @@ -132,15 +207,20 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) return ret; } -static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr) +static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, + bool lock_direct_mutex) { void *ip = tr->func.addr; int ret; - if (tr->func.ftrace_managed) - ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr); - else + if (tr->func.ftrace_managed) { + if (lock_direct_mutex) + ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr); + else + ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr); + } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); + } return ret; } @@ -152,46 +232,51 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) int ret; faddr = ftrace_location((unsigned long)ip); - if (faddr) + if (faddr) { + if (!tr->fops) + return -ENOTSUPP; tr->func.ftrace_managed = true; + } if (bpf_trampoline_module_get(tr)) return -ENOENT; - if (tr->func.ftrace_managed) - ret = register_ftrace_direct((long)ip, (long)new_addr); - else + if (tr->func.ftrace_managed) { + ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); + ret = register_ftrace_direct_multi(tr->fops, (long)new_addr); + } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); + } if (ret) bpf_trampoline_module_put(tr); return ret; } -static struct bpf_tramp_progs * +static struct bpf_tramp_links * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { - const struct bpf_prog_aux *aux; - struct bpf_tramp_progs *tprogs; - struct bpf_prog **progs; + struct bpf_tramp_link *link; + struct bpf_tramp_links *tlinks; + struct bpf_tramp_link **links; int kind; *total = 0; - tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); - if (!tprogs) + tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); + if (!tlinks) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - tprogs[kind].nr_progs = tr->progs_cnt[kind]; + tlinks[kind].nr_links = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; - progs = tprogs[kind].progs; + links = tlinks[kind].links; - hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) { - *ip_arg |= aux->prog->call_get_func_ip; - *progs++ = aux->prog; + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + *ip_arg |= link->link.prog->call_get_func_ip; + *links++ = link; } } - return tprogs; + return tlinks; } static void __bpf_tramp_image_put_deferred(struct work_struct *work) @@ -303,9 +388,10 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) goto out_free_im; err = -ENOMEM; - im->image = image = bpf_jit_alloc_exec_page(); + im->image = image = bpf_jit_alloc_exec(PAGE_SIZE); if (!image) goto out_uncharge; + set_vm_flush_reset_perms(image); err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); if (err) @@ -327,17 +413,17 @@ out: return ERR_PTR(err); } -static int bpf_trampoline_update(struct bpf_trampoline *tr) +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) { struct bpf_tramp_image *im; - struct bpf_tramp_progs *tprogs; - u32 flags = BPF_TRAMP_F_RESTORE_REGS; + struct bpf_tramp_links *tlinks; + u32 orig_flags = tr->flags; bool ip_arg = false; int err, total; - tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg); - if (IS_ERR(tprogs)) - return PTR_ERR(tprogs); + tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); + if (IS_ERR(tlinks)) + return PTR_ERR(tlinks); if (total == 0) { err = unregister_fentry(tr, tr->cur_image->image); @@ -353,35 +439,71 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; } - if (tprogs[BPF_TRAMP_FEXIT].nr_progs || - tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) - flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + /* clear all bits except SHARE_IPMODIFY */ + tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY; + + if (tlinks[BPF_TRAMP_FEXIT].nr_links || + tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { + /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME + * should not be set together. + */ + tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + } else { + tr->flags |= BPF_TRAMP_F_RESTORE_REGS; + } if (ip_arg) - flags |= BPF_TRAMP_F_IP_ARG; + tr->flags |= BPF_TRAMP_F_IP_ARG; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +again: + if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) && + (tr->flags & BPF_TRAMP_F_CALL_ORIG)) + tr->flags |= BPF_TRAMP_F_ORIG_STACK; +#endif err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, - &tr->func.model, flags, tprogs, + &tr->func.model, tr->flags, tlinks, tr->func.addr); if (err < 0) goto out; + set_memory_ro((long)im->image, 1); + set_memory_x((long)im->image, 1); + WARN_ON(tr->cur_image && tr->selector == 0); WARN_ON(!tr->cur_image && tr->selector); if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, tr->cur_image->image, im->image); + err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); else /* first time registering */ err = register_fentry(tr, im->image); + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + if (err == -EAGAIN) { + /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now + * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the + * trampoline again, and retry register. + */ + /* reset fops->func and fops->trampoline for re-register */ + tr->fops->func = NULL; + tr->fops->trampoline = 0; + goto again; + } +#endif if (err) goto out; + if (tr->cur_image) bpf_tramp_image_put(tr->cur_image); tr->cur_image = im; tr->selector++; out: - kfree(tprogs); + /* If any error happens, restore previous flags */ + if (err) + tr->flags = orig_flags; + kfree(tlinks); return err; } @@ -407,77 +529,264 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) +static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; + struct bpf_tramp_link *link_exiting; int err = 0; - int cnt; + int cnt = 0, i; - kind = bpf_attach_type_to_tramp(prog); - mutex_lock(&tr->mutex); - if (tr->extension_prog) { + kind = bpf_attach_type_to_tramp(link->link.prog); + if (tr->extension_prog) /* cannot attach fentry/fexit if extension prog is attached. * cannot overwrite extension prog either. */ - err = -EBUSY; - goto out; - } - cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]; + return -EBUSY; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + cnt += tr->progs_cnt[i]; + if (kind == BPF_TRAMP_REPLACE) { /* Cannot attach extension if fentry/fexit are in use. */ - if (cnt) { - err = -EBUSY; - goto out; - } - tr->extension_prog = prog; - err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, - prog->bpf_func); - goto out; - } - if (cnt >= BPF_MAX_TRAMP_PROGS) { - err = -E2BIG; - goto out; + if (cnt) + return -EBUSY; + tr->extension_prog = link->link.prog; + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, + link->link.prog->bpf_func); } - if (!hlist_unhashed(&prog->aux->tramp_hlist)) { + if (cnt >= BPF_MAX_TRAMP_LINKS) + return -E2BIG; + if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ - err = -EBUSY; - goto out; + return -EBUSY; + hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + if (link_exiting->link.prog != link->link.prog) + continue; + /* prog already linked */ + return -EBUSY; } - hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); + + hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; - err = bpf_trampoline_update(tr); + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { - hlist_del_init(&prog->aux->tramp_hlist); + hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } -out: + return err; +} + +int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +{ + int err; + + mutex_lock(&tr->mutex); + err = __bpf_trampoline_link_prog(link, tr); mutex_unlock(&tr->mutex); return err; } -/* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; int err; - kind = bpf_attach_type_to_tramp(prog); - mutex_lock(&tr->mutex); + kind = bpf_attach_type_to_tramp(link->link.prog); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; - goto out; + return err; } - hlist_del_init(&prog->aux->tramp_hlist); + hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; - err = bpf_trampoline_update(tr); -out: + return bpf_trampoline_update(tr, true /* lock_direct_mutex */); +} + +/* bpf_trampoline_unlink_prog() should never fail. */ +int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) +{ + int err; + + mutex_lock(&tr->mutex); + err = __bpf_trampoline_unlink_prog(link, tr); mutex_unlock(&tr->mutex); return err; } +#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) +static void bpf_shim_tramp_link_release(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */ + if (!shim_link->trampoline) + return; + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); + bpf_trampoline_put(shim_link->trampoline); +} + +static void bpf_shim_tramp_link_dealloc(struct bpf_link *link) +{ + struct bpf_shim_tramp_link *shim_link = + container_of(link, struct bpf_shim_tramp_link, link.link); + + kfree(shim_link); +} + +static const struct bpf_link_ops bpf_shim_tramp_link_lops = { + .release = bpf_shim_tramp_link_release, + .dealloc = bpf_shim_tramp_link_dealloc, +}; + +static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, + bpf_func_t bpf_func, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_prog *p; + + shim_link = kzalloc(sizeof(*shim_link), GFP_USER); + if (!shim_link) + return NULL; + + p = bpf_prog_alloc(1, 0); + if (!p) { + kfree(shim_link); + return NULL; + } + + p->jited = false; + p->bpf_func = bpf_func; + + p->aux->cgroup_atype = cgroup_atype; + p->aux->attach_func_proto = prog->aux->attach_func_proto; + p->aux->attach_btf_id = prog->aux->attach_btf_id; + p->aux->attach_btf = prog->aux->attach_btf; + btf_get(p->aux->attach_btf); + p->type = BPF_PROG_TYPE_LSM; + p->expected_attach_type = BPF_LSM_MAC; + bpf_prog_inc(p); + bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, + &bpf_shim_tramp_link_lops, p); + bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); + + return shim_link; +} + +static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, + bpf_func_t bpf_func) +{ + struct bpf_tramp_link *link; + int kind; + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { + struct bpf_prog *p = link->link.prog; + + if (p->bpf_func == bpf_func) + return container_of(link, struct bpf_shim_tramp_link, link); + } + } + + return NULL; +} + +int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, + int cgroup_atype) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_attach_target_info tgt_info = {}; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + int err; + + err = bpf_check_attach_target(NULL, prog, NULL, + prog->aux->attach_btf_id, + &tgt_info); + if (err) + return err; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) + return -ENOMEM; + + mutex_lock(&tr->mutex); + + shim_link = cgroup_shim_find(tr, bpf_func); + if (shim_link) { + /* Reusing existing shim attached by the other program. */ + bpf_link_inc(&shim_link->link.link); + + mutex_unlock(&tr->mutex); + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + return 0; + } + + /* Allocate and install new shim. */ + + shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); + if (!shim_link) { + err = -ENOMEM; + goto err; + } + + err = __bpf_trampoline_link_prog(&shim_link->link, tr); + if (err) + goto err; + + shim_link->trampoline = tr; + /* note, we're still holding tr refcnt from above */ + + mutex_unlock(&tr->mutex); + + return 0; +err: + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + /* have to release tr while _not_ holding its mutex */ + bpf_trampoline_put(tr); /* bpf_trampoline_get above */ + + return err; +} + +void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) +{ + struct bpf_shim_tramp_link *shim_link = NULL; + struct bpf_trampoline *tr; + bpf_func_t bpf_func; + u64 key; + + key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, + prog->aux->attach_btf_id); + + bpf_lsm_find_cgroup_shim(prog, &bpf_func); + tr = bpf_trampoline_lookup(key); + if (WARN_ON_ONCE(!tr)) + return; + + mutex_lock(&tr->mutex); + shim_link = cgroup_shim_find(tr, bpf_func); + mutex_unlock(&tr->mutex); + + if (shim_link) + bpf_link_put(&shim_link->link.link); + + bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */ +} +#endif + struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { @@ -500,16 +809,19 @@ out: void bpf_trampoline_put(struct bpf_trampoline *tr) { + int i; + if (!tr) return; mutex_lock(&trampoline_mutex); if (!refcount_dec_and_test(&tr->refcnt)) goto out; WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); - if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY]))) - goto out; - if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) - goto out; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) + goto out; + /* This code will be executed even when the last bpf_tramp_image * is alive. All progs are detached from the trampoline and the * trampoline image is patched with jmp into epilogue to skip @@ -517,6 +829,10 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) * multiple rcu callbacks. */ hlist_del(&tr->hlist); + if (tr->fops) { + ftrace_free_filter(tr->fops); + kfree(tr->fops); + } kfree(tr); out: mutex_unlock(&trampoline_mutex); @@ -535,17 +851,6 @@ static __always_inline u64 notrace bpf_prog_start_time(void) return start; } -static void notrace inc_misses_counter(struct bpf_prog *prog) -{ - struct bpf_prog_stats *stats; - unsigned int flags; - - stats = this_cpu_ptr(prog->stats); - flags = u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->misses); - u64_stats_update_end_irqrestore(&stats->syncp, flags); -} - /* The logic is similar to bpf_prog_run(), but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into @@ -559,13 +864,16 @@ static void notrace inc_misses_counter(struct bpf_prog *prog) * [2..MAX_U64] - execute bpf prog and record execution time. * This is start time. */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog) +u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); migrate_disable(); - if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { - inc_misses_counter(prog); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + bpf_prog_inc_misses_counter(prog); return 0; } return bpf_prog_start_time(); @@ -593,35 +901,92 @@ static void notrace update_prog_stats(struct bpf_prog *prog, } } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + update_prog_stats(prog, start); - __this_cpu_dec(*(prog->active)); + this_cpu_dec(*(prog->active)); + migrate_enable(); + rcu_read_unlock(); +} + +u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) + __acquires(RCU) +{ + /* Runtime stats are exported via actual BPF_LSM_CGROUP + * programs, not the shims. + */ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return NO_START_TIME; +} + +void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + migrate_enable(); rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); might_fault(); - if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { - inc_misses_counter(prog); + + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + bpf_prog_inc_misses_counter(prog); return 0; } + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) { + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + update_prog_stats(prog, start); - __this_cpu_dec(*(prog->active)); + this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock_trace(); } +u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) + __acquires(RCU) +{ + rcu_read_lock(); + migrate_disable(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return bpf_prog_start_time(); +} + +void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) + __releases(RCU) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + update_prog_stats(prog, start); + migrate_enable(); + rcu_read_unlock(); +} + void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) { percpu_ref_get(&tr->pcref); @@ -635,7 +1000,7 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_progs *tprogs, + struct bpf_tramp_links *tlinks, void *orig_call) { return -ENOTSUPP; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d175b70067b3..6f6d2d511c06 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,6 +23,7 @@ #include <linux/error-injection.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> +#include <linux/poison.h> #include "disasm.h" @@ -187,6 +188,9 @@ struct bpf_verifier_stack_elem { POISON_POINTER_DELTA)) #define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) +static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); +static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); + static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON; @@ -245,6 +249,7 @@ struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; bool pkt_access; + u8 release_regno; int regno; int access_size; int mem_size; @@ -257,6 +262,8 @@ struct bpf_call_arg_meta { struct btf *ret_btf; u32 ret_btf_id; u32 subprogno; + struct bpf_map_value_off_desc *kptr_off_desc; + u8 uninit_dynptr_regno; }; struct btf *btf_vmlinux; @@ -364,6 +371,7 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, bpf_verifier_vlog(log, fmt, args); va_end(args); } +EXPORT_SYMBOL_GPL(bpf_log); static const char *ltrim(const char *s) { @@ -421,6 +429,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, static bool type_is_pkt_pointer(enum bpf_reg_type type) { + type = base_type(type); return type == PTR_TO_PACKET || type == PTR_TO_PACKET_META; } @@ -450,10 +459,9 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { - return base_type(type) == PTR_TO_SOCKET || - base_type(type) == PTR_TO_TCP_SOCK || - base_type(type) == PTR_TO_MEM || - base_type(type) == PTR_TO_BTF_ID; + type = base_type(type); + return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || + type == PTR_TO_MEM || type == PTR_TO_BTF_ID; } static bool type_is_rdonly_mem(u32 type) @@ -461,36 +469,11 @@ static bool type_is_rdonly_mem(u32 type) return type & MEM_RDONLY; } -static bool arg_type_may_be_refcounted(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_SOCK_COMMON; -} - static bool type_may_be_null(u32 type) { return type & PTR_MAYBE_NULL; } -/* Determine whether the function releases some resources allocated by another - * function call. The first reference type argument will be assumed to be - * released by release_reference(). - */ -static bool is_release_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_sk_release || - func_id == BPF_FUNC_ringbuf_submit || - func_id == BPF_FUNC_ringbuf_discard; -} - -static bool may_be_acquire_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_map_lookup_elem || - func_id == BPF_FUNC_ringbuf_reserve; -} - static bool is_acquire_function(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -499,7 +482,8 @@ static bool is_acquire_function(enum bpf_func_id func_id, if (func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || func_id == BPF_FUNC_skc_lookup_tcp || - func_id == BPF_FUNC_ringbuf_reserve) + func_id == BPF_FUNC_ringbuf_reserve || + func_id == BPF_FUNC_kptr_xchg) return true; if (func_id == BPF_FUNC_map_lookup_elem && @@ -517,10 +501,31 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_sock || func_id == BPF_FUNC_skc_to_tcp6_sock || func_id == BPF_FUNC_skc_to_udp6_sock || + func_id == BPF_FUNC_skc_to_mptcp_sock || func_id == BPF_FUNC_skc_to_tcp_timewait_sock || func_id == BPF_FUNC_skc_to_tcp_request_sock; } +static bool is_dynptr_ref_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_dynptr_data; +} + +static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, + const struct bpf_map *map) +{ + int ref_obj_uses = 0; + + if (is_ptr_cast_function(func_id)) + ref_obj_uses++; + if (is_acquire_function(func_id, map)) + ref_obj_uses++; + if (is_dynptr_ref_function(func_id)) + ref_obj_uses++; + + return ref_obj_uses > 1; +} + static bool is_cmpxchg_insn(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_STX && @@ -558,6 +563,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", + [PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -575,6 +581,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(prefix, "user_", 32); if (type & MEM_PERCPU) strncpy(prefix, "percpu_", 32); + if (type & PTR_UNTRUSTED) + strncpy(prefix, "untrusted_", 32); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -586,6 +594,7 @@ static char slot_type_char[] = { [STACK_SPILL] = 'r', [STACK_MISC] = 'm', [STACK_ZERO] = '0', + [STACK_DYNPTR] = 'd', }; static void print_liveness(struct bpf_verifier_env *env, @@ -601,6 +610,25 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "D"); } +static int get_spi(s32 off) +{ + return (-off - 1) / BPF_REG_SIZE; +} + +static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) +{ + int allocated_slots = state->allocated_stack / BPF_REG_SIZE; + + /* We need to check that slots between [spi - nr_slots + 1, spi] are + * within [0, allocated_stack). + * + * Please note that the spi grows downwards. For example, a dynptr + * takes the size of two stack slots; the first slot will be at + * spi and the second slot will be at spi - 1. + */ + return spi - nr_slots + 1 >= 0 && spi < allocated_slots; +} + static struct bpf_func_state *func(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { @@ -652,6 +680,145 @@ static void mark_verifier_state_scratched(struct bpf_verifier_env *env) env->scratched_stack_slots = ~0ULL; } +static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) +{ + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + return BPF_DYNPTR_TYPE_LOCAL; + case DYNPTR_TYPE_RINGBUF: + return BPF_DYNPTR_TYPE_RINGBUF; + default: + return BPF_DYNPTR_TYPE_INVALID; + } +} + +static bool dynptr_type_refcounted(enum bpf_dynptr_type type) +{ + return type == BPF_DYNPTR_TYPE_RINGBUF; +} + +static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type, int insn_idx) +{ + struct bpf_func_state *state = func(env, reg); + enum bpf_dynptr_type type; + int spi, i, id; + + spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return -EINVAL; + + for (i = 0; i < BPF_REG_SIZE; i++) { + state->stack[spi].slot_type[i] = STACK_DYNPTR; + state->stack[spi - 1].slot_type[i] = STACK_DYNPTR; + } + + type = arg_to_dynptr_type(arg_type); + if (type == BPF_DYNPTR_TYPE_INVALID) + return -EINVAL; + + state->stack[spi].spilled_ptr.dynptr.first_slot = true; + state->stack[spi].spilled_ptr.dynptr.type = type; + state->stack[spi - 1].spilled_ptr.dynptr.type = type; + + if (dynptr_type_refcounted(type)) { + /* The id is used to track proper releasing */ + id = acquire_reference_state(env, insn_idx); + if (id < 0) + return id; + + state->stack[spi].spilled_ptr.id = id; + state->stack[spi - 1].spilled_ptr.id = id; + } + + return 0; +} + +static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i; + + spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return -EINVAL; + + for (i = 0; i < BPF_REG_SIZE; i++) { + state->stack[spi].slot_type[i] = STACK_INVALID; + state->stack[spi - 1].slot_type[i] = STACK_INVALID; + } + + /* Invalidate any slices associated with this dynptr */ + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { + release_reference(env, state->stack[spi].spilled_ptr.id); + state->stack[spi].spilled_ptr.id = 0; + state->stack[spi - 1].spilled_ptr.id = 0; + } + + state->stack[spi].spilled_ptr.dynptr.first_slot = false; + state->stack[spi].spilled_ptr.dynptr.type = 0; + state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + + return 0; +} + +static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + int i; + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) + return true; + + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] == STACK_DYNPTR || + state->stack[spi - 1].slot_type[i] == STACK_DYNPTR) + return false; + } + + return true; +} + +bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + int i; + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.dynptr.first_slot) + return false; + + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] != STACK_DYNPTR || + state->stack[spi - 1].slot_type[i] != STACK_DYNPTR) + return false; + } + + return true; +} + +bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) +{ + struct bpf_func_state *state = func(env, reg); + enum bpf_dynptr_type dynptr_type; + int spi = get_spi(reg->off); + + /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ + if (arg_type == ARG_PTR_TO_DYNPTR) + return true; + + dynptr_type = arg_to_dynptr_type(arg_type); + + return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; +} + /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. */ @@ -941,6 +1108,7 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) id = ++env->id_gen; state->refs[new_ofs].id = id; state->refs[new_ofs].insn_idx = insn_idx; + state->refs[new_ofs].callback_ref = state->in_callback_fn ? state->frameno : 0; return id; } @@ -953,6 +1121,9 @@ static int release_reference_state(struct bpf_func_state *state, int ptr_id) last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id == ptr_id) { + /* Cannot release caller references in callbacks */ + if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + return -EINVAL; if (last_idx && i != last_idx) memcpy(&state->refs[i], &state->refs[last_idx], sizeof(*state->refs)); @@ -1417,6 +1588,21 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } +static void reg_bounds_sync(struct bpf_reg_state *reg) +{ + /* We might have learned new bounds from the var_off. */ + __update_reg_bounds(reg); + /* We might have learned something about the sign bit. */ + __reg_deduce_bounds(reg); + /* We might have learned some bits from the bounds. */ + __reg_bound_offset(reg); + /* Intersecting with the old var_off might have improved our bounds + * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. + */ + __update_reg_bounds(reg); +} + static bool __reg32_bound_s64(s32 a) { return a >= 0 && a <= S32_MAX; @@ -1458,16 +1644,8 @@ static void __reg_combine_32_into_64(struct bpf_reg_state *reg) * so they do not impact tnum bounds calculation. */ __mark_reg64_unbounded(reg); - __update_reg_bounds(reg); } - - /* Intersecting with the old var_off might have improved our bounds - * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), - * then new var_off is (0; 0x7f...fc) which improves our umax. - */ - __reg_deduce_bounds(reg); - __reg_bound_offset(reg); - __update_reg_bounds(reg); + reg_bounds_sync(reg); } static bool __reg64_bound_s32(s64 a) @@ -1483,7 +1661,6 @@ static bool __reg64_bound_u32(u64 a) static void __reg_combine_64_into_32(struct bpf_reg_state *reg) { __mark_reg32_unbounded(reg); - if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) { reg->s32_min_value = (s32)reg->smin_value; reg->s32_max_value = (s32)reg->smax_value; @@ -1492,14 +1669,7 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg) reg->u32_min_value = (u32)reg->umin_value; reg->u32_max_value = (u32)reg->umax_value; } - - /* Intersecting with the old var_off might have improved our bounds - * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), - * then new var_off is (0; 0x7f...fc) which improves our umax. - */ - __reg_deduce_bounds(reg); - __reg_bound_offset(reg); - __update_reg_bounds(reg); + reg_bounds_sync(reg); } /* Mark a register as having a completely unknown (scalar) value. */ @@ -1595,6 +1765,7 @@ static void init_func_state(struct bpf_verifier_env *env, state->callsite = callsite; state->frameno = frameno; state->subprogno = subprogno; + state->callback_ret_range = tnum_range(0, 0); init_reg_state(env, state); mark_verifier_state_scratched(env); } @@ -1821,8 +1992,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) kfree(tab); } -static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, - u32 func_id, s16 offset) +static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) { if (offset) { if (offset < 0) { @@ -1897,7 +2067,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, func_id, offset); + desc_btf = find_kfunc_desc_btf(env, offset); if (IS_ERR(desc_btf)) { verbose(env, "failed to find BTF for kernel function\n"); return PTR_ERR(desc_btf); @@ -2366,7 +2536,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) return NULL; - desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off); + desc_btf = find_kfunc_desc_btf(data, insn->off); if (IS_ERR(desc_btf)) return "<error>"; @@ -2755,7 +2925,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, return 0; } -static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +int mark_chain_precision(struct bpf_verifier_env *env, int regno) { return __mark_chain_precision(env, regno, -1); } @@ -3211,7 +3381,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return 0; } -enum stack_access_src { +enum bpf_access_src { ACCESS_DIRECT = 1, /* the access is performed by an instruction */ ACCESS_HELPER = 2, /* the access is performed by a helper */ }; @@ -3219,7 +3389,7 @@ enum stack_access_src { static int check_stack_range_initialized(struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, - enum stack_access_src type, + enum bpf_access_src type, struct bpf_call_arg_meta *meta); static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) @@ -3469,9 +3639,175 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, return 0; } +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) +{ + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. + */ + + if (reg->off < 0) { + verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); + return -EACCES; + } + + if (!fixed_off_ok && reg->off) { + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); + return -EACCES; + } + + return 0; +} + +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + +static int map_kptr_match_type(struct bpf_verifier_env *env, + struct bpf_map_value_off_desc *off_desc, + struct bpf_reg_state *reg, u32 regno) +{ + const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id); + int perm_flags = PTR_MAYBE_NULL; + const char *reg_name = ""; + + /* Only unreferenced case accepts untrusted pointers */ + if (off_desc->type == BPF_KPTR_UNREF) + perm_flags |= PTR_UNTRUSTED; + + if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) + goto bad_type; + + if (!btf_is_kernel(reg->btf)) { + verbose(env, "R%d must point to kernel BTF\n", regno); + return -EINVAL; + } + /* We need to verify reg->type and reg->btf, before accessing reg->btf */ + reg_name = kernel_type_name(reg->btf, reg->btf_id); + + /* For ref_ptr case, release function check should ensure we get one + * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the + * normal store of unreferenced kptr, we must ensure var_off is zero. + * Since ref_ptr cannot be accessed directly by BPF insns, checks for + * reg->off and reg->ref_obj_id are not needed here. + */ + if (__check_ptr_off_reg(env, reg, regno, true)) + return -EACCES; + + /* A full type match is needed, as BTF can be vmlinux or module BTF, and + * we also need to take into account the reg->off. + * + * We want to support cases like: + * + * struct foo { + * struct bar br; + * struct baz bz; + * }; + * + * struct foo *v; + * v = func(); // PTR_TO_BTF_ID + * val->foo = v; // reg->off is zero, btf and btf_id match type + * val->bar = &v->br; // reg->off is still zero, but we need to retry with + * // first member type of struct after comparison fails + * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked + * // to match type + * + * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off + * is zero. We must also ensure that btf_struct_ids_match does not walk + * the struct to match type against first member of struct, i.e. reject + * second case from above. Hence, when type is BPF_KPTR_REF, we set + * strict mode to true for type match. + */ + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + off_desc->kptr.btf, off_desc->kptr.btf_id, + off_desc->type == BPF_KPTR_REF)) + goto bad_type; + return 0; +bad_type: + verbose(env, "invalid kptr access, R%d type=%s%s ", regno, + reg_type_str(env, reg->type), reg_name); + verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); + if (off_desc->type == BPF_KPTR_UNREF) + verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED), + targ_name); + else + verbose(env, "\n"); + return -EINVAL; +} + +static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, + int value_regno, int insn_idx, + struct bpf_map_value_off_desc *off_desc) +{ + struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; + int class = BPF_CLASS(insn->code); + struct bpf_reg_state *val_reg; + + /* Things we already checked for in check_map_access and caller: + * - Reject cases where variable offset may touch kptr + * - size of access (must be BPF_DW) + * - tnum_is_const(reg->var_off) + * - off_desc->offset == off + reg->var_off.value + */ + /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */ + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n"); + return -EACCES; + } + + /* We only allow loading referenced kptr, since it will be marked as + * untrusted, similar to unreferenced kptr. + */ + if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) { + verbose(env, "store to referenced kptr disallowed\n"); + return -EACCES; + } + + if (class == BPF_LDX) { + val_reg = reg_state(env, value_regno); + /* We can simply mark the value_regno receiving the pointer + * value from map as PTR_TO_BTF_ID, with the correct type. + */ + mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf, + off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + /* For mark_ptr_or_null_reg */ + val_reg->id = ++env->id_gen; + } else if (class == BPF_STX) { + val_reg = reg_state(env, value_regno); + if (!register_is_null(val_reg) && + map_kptr_match_type(env, off_desc, val_reg, value_regno)) + return -EACCES; + } else if (class == BPF_ST) { + if (insn->imm) { + verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n", + off_desc->offset); + return -EACCES; + } + } else { + verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n"); + return -EACCES; + } + return 0; +} + /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size, bool zero_size_allowed) + int off, int size, bool zero_size_allowed, + enum bpf_access_src src) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@ -3507,6 +3843,36 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } } + if (map_value_has_kptrs(map)) { + struct bpf_map_value_off *tab = map->kptr_off_tab; + int i; + + for (i = 0; i < tab->nr_off; i++) { + u32 p = tab->off[i].offset; + + if (reg->smin_value + off < p + sizeof(u64) && + p < reg->umax_value + off + size) { + if (src != ACCESS_DIRECT) { + verbose(env, "kptr cannot be accessed indirectly by helper\n"); + return -EACCES; + } + if (!tnum_is_const(reg->var_off)) { + verbose(env, "kptr access cannot have variable offset\n"); + return -EACCES; + } + if (p != off + reg->var_off.value) { + verbose(env, "kptr access misaligned expected=%u off=%llu\n", + p, off + reg->var_off.value); + return -EACCES; + } + if (size != bpf_size_to_bytes(BPF_DW)) { + verbose(env, "kptr access size must be BPF_DW\n"); + return -EACCES; + } + break; + } + } + } return err; } @@ -3980,44 +4346,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -static int __check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, - bool fixed_off_ok) -{ - /* Access to this pointer-typed register or passing it to a helper - * is only allowed in its original, unmodified form. - */ - - if (reg->off < 0) { - verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); - return -EACCES; - } - - if (!fixed_off_ok && reg->off) { - verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); - return -EACCES; - } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable %s access var_off=%s disallowed\n", - reg_type_str(env, reg->type), tn_buf); - return -EACCES; - } - - return 0; -} - -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) -{ - return __check_ptr_off_reg(env, reg, regno, false); -} - static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -4224,6 +4552,12 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; + /* If this is an untrusted pointer, all pointers formed by walking it + * also inherit the untrusted flag. + */ + if (type_flag(reg->type) & PTR_UNTRUSTED) + flag |= PTR_UNTRUSTED; + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); @@ -4316,7 +4650,7 @@ static int check_stack_slot_within_bounds(int off, static int check_stack_access_within_bounds( struct bpf_verifier_env *env, int regno, int off, int access_size, - enum stack_access_src src, enum bpf_access_type type) + enum bpf_access_src src, enum bpf_access_type type) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; @@ -4412,6 +4746,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_MAP_VALUE) { + struct bpf_map_value_off_desc *kptr_off_desc = NULL; + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4420,8 +4756,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_map_access_type(env, regno, off, size, t); if (err) return err; - err = check_map_access(env, regno, off, size, false); - if (!err && t == BPF_READ && value_regno >= 0) { + err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT); + if (err) + return err; + if (tnum_is_const(reg->var_off)) + kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr, + off + reg->var_off.value); + if (kptr_off_desc) { + err = check_map_kptr_access(env, regno, value_regno, insn_idx, + kptr_off_desc); + } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; /* if map is read-only, track its contents as scalars */ @@ -4724,7 +5068,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i static int check_stack_range_initialized( struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, - enum stack_access_src type, struct bpf_call_arg_meta *meta) + enum bpf_access_src type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); @@ -4861,6 +5205,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } return check_mem_region_access(env, regno, reg->off, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: @@ -4869,15 +5218,25 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, BPF_READ)) return -EACCES; return check_map_access(env, regno, reg->off, access_size, - zero_size_allowed); + zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + } return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); return -EACCES; + } max_access = &env->prog->aux->max_rdonly_access; } else { @@ -4891,6 +5250,25 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER, meta); + case PTR_TO_CTX: + /* in case the function doesn't know how to access the context, + * (because we are in a program of type SYSCALL for example), we + * can not statically check its size. + * Dynamically check it now. + */ + if (!env->ops->convert_ctx_access) { + enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ; + int offset = access_size - 1; + + /* Allow zero-byte read from PTR_TO_CTX */ + if (access_size == 0) + return zero_size_allowed ? 0 : -EACCES; + + return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, + atype, -1, false); + } + + fallthrough; default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && @@ -4919,8 +5297,7 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed. */ - if (meta) - meta->msize_max_value = reg->umax_value; + meta->msize_max_value = reg->umax_value; /* The register is SCALAR_VALUE; the access check * happens using its boundaries. @@ -4963,24 +5340,33 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size) { + bool may_be_null = type_may_be_null(reg->type); + struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; + int err; + if (register_is_null(reg)) return 0; - if (type_may_be_null(reg->type)) { - /* Assuming that the register contains a value check if the memory - * access is safe. Temporarily save and restore the register's state as - * the conversion shouldn't be visible to a caller. - */ - const struct bpf_reg_state saved_reg = *reg; - int rv; - + memset(&meta, 0, sizeof(meta)); + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + if (may_be_null) { + saved_reg = *reg; mark_ptr_not_null_reg(reg); - rv = check_helper_mem_access(env, regno, mem_size, true, NULL); - *reg = saved_reg; - return rv; } - return check_helper_mem_access(env, regno, mem_size, true, NULL); + err = check_helper_mem_access(env, regno, mem_size, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + + if (may_be_null) + *reg = saved_reg; + + return err; } int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, @@ -4989,16 +5375,22 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; int err; WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); + memset(&meta, 0, sizeof(meta)); + if (may_be_null) { saved_reg = *mem_reg; mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, true, NULL); + err = check_mem_size_reg(env, reg, regno, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); if (may_be_null) *mem_reg = saved_reg; @@ -5134,10 +5526,51 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return 0; } -static bool arg_type_is_mem_ptr(enum bpf_arg_type type) +static int process_kptr_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) { - return base_type(type) == ARG_PTR_TO_MEM || - base_type(type) == ARG_PTR_TO_UNINIT_MEM; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map_value_off_desc *off_desc; + struct bpf_map *map_ptr = reg->map_ptr; + u32 kptr_off; + int ret; + + if (!tnum_is_const(reg->var_off)) { + verbose(env, + "R%d doesn't have constant offset. kptr has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map_ptr->btf) { + verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n", + map_ptr->name); + return -EINVAL; + } + if (!map_value_has_kptrs(map_ptr)) { + ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab); + if (ret == -E2BIG) + verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, + BPF_MAP_VALUE_OFF_MAX); + else if (ret == -EEXIST) + verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name); + else + verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + return -EINVAL; + } + + meta->map_ptr = map_ptr; + kptr_off = reg->off + reg->var_off.value; + off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off); + if (!off_desc) { + verbose(env, "off=%d doesn't point to kptr\n", kptr_off); + return -EACCES; + } + if (off_desc->type != BPF_KPTR_REF) { + verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); + return -EACCES; + } + meta->kptr_off_desc = off_desc; + return 0; } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -5146,15 +5579,14 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } -static bool arg_type_is_alloc_size(enum bpf_arg_type type) +static bool arg_type_is_release(enum bpf_arg_type type) { - return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; + return type & OBJ_RELEASE; } -static bool arg_type_is_int_ptr(enum bpf_arg_type type) +static bool arg_type_is_dynptr(enum bpf_arg_type type) { - return type == ARG_PTR_TO_INT || - type == ARG_PTR_TO_LONG; + return base_type(type) == ARG_PTR_TO_DYNPTR; } static int int_ptr_type_to_size(enum bpf_arg_type type) @@ -5269,11 +5701,17 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types dynptr_types = { + .types = { + PTR_TO_STACK, + PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + } +}; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, - [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, @@ -5287,7 +5725,6 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_UNINIT_MEM] = &mem_types, [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, @@ -5296,11 +5733,14 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, + [ARG_PTR_TO_KPTR] = &kptr_types, + [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, - const u32 *arg_btf_id) + const u32 *arg_btf_id, + struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected, type = reg->type; @@ -5345,6 +5785,13 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, found: if (reg->type == PTR_TO_BTF_ID) { + /* For bpf_sk_release, it needs to match against first member + * 'struct sock_common', hence make an exception for it. This + * allows bpf_sk_release to work for multiple socket types. + */ + bool strict_type_match = arg_type_is_release(arg_type) && + meta->func_id != BPF_FUNC_sk_release; + if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); @@ -5353,12 +5800,25 @@ found: arg_btf_id = compatible->btf_id; } - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id)) { - verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf, reg->btf_id), - kernel_type_name(btf_vmlinux, *arg_btf_id)); - return -EACCES; + if (meta->func_id == BPF_FUNC_kptr_xchg) { + if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) + return -EACCES; + } else { + if (arg_btf_id == BPF_PTR_POISON) { + verbose(env, "verifier internal error:"); + verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n", + regno); + return -EACCES; + } + + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id, + strict_type_match)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf, reg->btf_id), + kernel_type_name(btf_vmlinux, *arg_btf_id)); + return -EACCES; + } } } @@ -5367,15 +5827,19 @@ found: int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type, - bool is_release_func) + enum bpf_arg_type arg_type) { - bool fixed_off_ok = false, release_reg; enum bpf_reg_type type = reg->type; + bool fixed_off_ok = false; switch ((u32)type) { - case SCALAR_VALUE: /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_STACK: + if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { + verbose(env, "cannot pass in dynptr at an offset\n"); + return -EINVAL; + } + fallthrough; case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_MAP_KEY: @@ -5385,11 +5849,11 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_MEM | MEM_ALLOC: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: - case PTR_TO_STACK: + case SCALAR_VALUE: /* Some of the argument types nevertheless require a * zero register offset. */ - if (arg_type != ARG_PTR_TO_ALLOC_MEM) + if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM) return 0; break; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows @@ -5397,19 +5861,17 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, */ case PTR_TO_BTF_ID: /* When referenced PTR_TO_BTF_ID is passed to release function, - * it's fixed offset must be 0. We rely on the property that - * only one referenced register can be passed to BPF helpers and - * kfuncs. In the other cases, fixed offset can be non-zero. + * it's fixed offset must be 0. In the other cases, fixed offset + * can be non-zero. */ - release_reg = is_release_func && reg->ref_obj_id; - if (release_reg && reg->off) { + if (arg_type_is_release(arg_type) && reg->off) { verbose(env, "R%d must have zero offset when passed to release func\n", regno); return -EINVAL; } - /* For release_reg == true, fixed_off_ok must be false, but we - * already checked and rejected reg->off != 0 above, so set to - * true to allow fixed offset for all other cases. + /* For arg is release pointer, fixed_off_ok must be false, but + * we already checked and rejected reg->off != 0 above, so set + * to true to allow fixed offset for all other cases. */ fixed_off_ok = true; break; @@ -5419,6 +5881,14 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } +static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + + return state->stack[spi].spilled_ptr.id; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) @@ -5427,6 +5897,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_arg_type arg_type = fn->arg_type[arg]; enum bpf_reg_type type = reg->type; + u32 *arg_btf_id = NULL; int err = 0; if (arg_type == ARG_DONTCARE) @@ -5451,8 +5922,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || - base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; @@ -5464,18 +5934,41 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, */ goto skip_type_check; - err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]); + /* arg_btf_id and arg_size are in a union. */ + if (base_type(arg_type) == ARG_PTR_TO_BTF_ID) + arg_btf_id = fn->arg_btf_id[arg]; + + err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); if (err) return err; - err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id)); + err = check_func_arg_reg_off(env, reg, regno, arg_type); if (err) return err; skip_type_check: - /* check_func_arg_reg_off relies on only one referenced register being - * allowed for BPF helpers. - */ + if (arg_type_is_release(arg_type)) { + if (arg_type_is_dynptr(arg_type)) { + struct bpf_func_state *state = func(env, reg); + int spi = get_spi(reg->off); + + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else if (!reg->ref_obj_id && !register_is_null(reg)) { + verbose(env, "R%d must be referenced when passed to release function\n", + regno); + return -EINVAL; + } + if (meta->release_regno) { + verbose(env, "verifier internal error: more than one release argument\n"); + return -EFAULT; + } + meta->release_regno = regno; + } + if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -5486,7 +5979,8 @@ skip_type_check: meta->ref_obj_id = reg->ref_obj_id; } - if (arg_type == ARG_CONST_MAP_PTR) { + switch (base_type(arg_type)) { + case ARG_CONST_MAP_PTR: /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ if (meta->map_ptr) { /* Use map_uid (which is unique id of inner map) to reject: @@ -5511,7 +6005,8 @@ skip_type_check: } meta->map_ptr = reg->map_ptr; meta->map_uid = reg->map_uid; - } else if (arg_type == ARG_PTR_TO_MAP_KEY) { + break; + case ARG_PTR_TO_MAP_KEY: /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized @@ -5528,8 +6023,8 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || - base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + break; + case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) return 0; @@ -5541,18 +6036,20 @@ skip_type_check: verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } - meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE); + meta->raw_mode = arg_type & MEM_UNINIT; err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); - } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) { + break; + case ARG_PTR_TO_PERCPU_BTF_ID: if (!reg->btf_id) { verbose(env, "Helper has invalid btf_id in R%d\n", regno); return -EACCES; } meta->ret_btf = reg->btf; meta->ret_btf_id = reg->btf_id; - } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + break; + case ARG_PTR_TO_SPIN_LOCK: if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) return -EACCES; @@ -5563,35 +6060,103 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } - } else if (arg_type == ARG_PTR_TO_TIMER) { + break; + case ARG_PTR_TO_TIMER: if (process_timer_func(env, regno, meta)) return -EACCES; - } else if (arg_type == ARG_PTR_TO_FUNC) { + break; + case ARG_PTR_TO_FUNC: meta->subprogno = reg->subprogno; - } else if (arg_type_is_mem_ptr(arg_type)) { + break; + case ARG_PTR_TO_MEM: /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. */ - meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM); - } else if (arg_type_is_mem_size(arg_type)) { - bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); + meta->raw_mode = arg_type & MEM_UNINIT; + if (arg_type & MEM_FIXED_SIZE) { + err = check_helper_mem_access(env, regno, + fn->arg_size[arg], false, + meta); + } + break; + case ARG_CONST_SIZE: + err = check_mem_size_reg(env, reg, regno, false, meta); + break; + case ARG_CONST_SIZE_OR_ZERO: + err = check_mem_size_reg(env, reg, regno, true, meta); + break; + case ARG_PTR_TO_DYNPTR: + /* We only need to check for initialized / uninitialized helper + * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the + * assumption is that if it is, that a helper function + * initialized the dynptr on behalf of the BPF program. + */ + if (base_type(reg->type) == PTR_TO_DYNPTR) + break; + if (arg_type & MEM_UNINIT) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { + verbose(env, "Dynptr has to be an uninitialized dynptr\n"); + return -EINVAL; + } - err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta); - } else if (arg_type_is_alloc_size(arg_type)) { + /* We only support one dynptr being uninitialized at the moment, + * which is sufficient for the helper functions we have right now. + */ + if (meta->uninit_dynptr_regno) { + verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); + return -EFAULT; + } + + meta->uninit_dynptr_regno = regno; + } else if (!is_dynptr_reg_valid_init(env, reg)) { + verbose(env, + "Expected an initialized dynptr as arg #%d\n", + arg + 1); + return -EINVAL; + } else if (!is_dynptr_type_expected(env, reg, arg_type)) { + const char *err_extra = ""; + + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + err_extra = "local"; + break; + case DYNPTR_TYPE_RINGBUF: + err_extra = "ringbuf"; + break; + default: + err_extra = "<unknown>"; + break; + } + verbose(env, + "Expected a dynptr of type %s as arg #%d\n", + err_extra, arg + 1); + return -EINVAL; + } + break; + case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", regno); return -EACCES; } meta->mem_size = reg->var_off.value; - } else if (arg_type_is_int_ptr(arg_type)) { + err = mark_chain_precision(env, regno); + if (err) + return err; + break; + case ARG_PTR_TO_INT: + case ARG_PTR_TO_LONG: + { int size = int_ptr_type_to_size(arg_type); err = check_helper_mem_access(env, regno, size, false, meta); if (err) return err; err = check_ptr_alignment(env, reg, 0, size, true); - } else if (arg_type == ARG_PTR_TO_CONST_STR) { + break; + } + case ARG_PTR_TO_CONST_STR: + { struct bpf_map *map = reg->map_ptr; int map_off; u64 map_addr; @@ -5613,7 +6178,8 @@ skip_type_check: } err = check_map_access(env, regno, reg->off, - map->value_size - reg->off, false); + map->value_size - reg->off, false, + ACCESS_HELPER); if (err) return err; @@ -5629,6 +6195,12 @@ skip_type_check: verbose(env, "string is not zero-terminated\n"); return -EINVAL; } + break; + } + case ARG_PTR_TO_KPTR: + if (process_kptr_func(env, regno, meta)) + return -EACCES; + break; } return err; @@ -5668,7 +6240,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) { - return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64); + return env->prog->jit_requested && + bpf_jit_supports_subprog_tailcalls(); } static int check_map_func_compatibility(struct bpf_verifier_env *env, @@ -5694,7 +6267,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_RINGBUF: if (func_id != BPF_FUNC_ringbuf_output && func_id != BPF_FUNC_ringbuf_reserve && - func_id != BPF_FUNC_ringbuf_query) + func_id != BPF_FUNC_ringbuf_query && + func_id != BPF_FUNC_ringbuf_reserve_dynptr && + func_id != BPF_FUNC_ringbuf_submit_dynptr && + func_id != BPF_FUNC_ringbuf_discard_dynptr) + goto error; + break; + case BPF_MAP_TYPE_USER_RINGBUF: + if (func_id != BPF_FUNC_user_ringbuf_drain) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -5810,9 +6390,16 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_ringbuf_output: case BPF_FUNC_ringbuf_reserve: case BPF_FUNC_ringbuf_query: + case BPF_FUNC_ringbuf_reserve_dynptr: + case BPF_FUNC_ringbuf_submit_dynptr: + case BPF_FUNC_ringbuf_discard_dynptr: if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; + case BPF_FUNC_user_ringbuf_drain: + if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF) + goto error; + break; case BPF_FUNC_get_stackid: if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; @@ -5864,6 +6451,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_BLOOM_FILTER) goto error; break; + case BPF_FUNC_map_lookup_percpu_elem: + if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH) + goto error; + break; case BPF_FUNC_sk_storage_get: case BPF_FUNC_sk_storage_delete: if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) @@ -5912,13 +6505,19 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn) return count <= 1; } -static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, - enum bpf_arg_type arg_next) +static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg) { - return (arg_type_is_mem_ptr(arg_curr) && - !arg_type_is_mem_size(arg_next)) || - (!arg_type_is_mem_ptr(arg_curr) && - arg_type_is_mem_size(arg_next)); + bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE; + bool has_size = fn->arg_size[arg] != 0; + bool is_next_size = false; + + if (arg + 1 < ARRAY_SIZE(fn->arg_type)) + is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]); + + if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM) + return is_next_size; + + return has_size == is_next_size || is_next_size == is_fixed; } static bool check_arg_pair_ok(const struct bpf_func_proto *fn) @@ -5929,52 +6528,28 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) * helper function specification. */ if (arg_type_is_mem_size(fn->arg1_type) || - arg_type_is_mem_ptr(fn->arg5_type) || - check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || - check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || - check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || - check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) + check_args_pair_invalid(fn, 0) || + check_args_pair_invalid(fn, 1) || + check_args_pair_invalid(fn, 2) || + check_args_pair_invalid(fn, 3) || + check_args_pair_invalid(fn, 4)) return false; return true; } -static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) -{ - int count = 0; - - if (arg_type_may_be_refcounted(fn->arg1_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg2_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg3_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg4_type)) - count++; - if (arg_type_may_be_refcounted(fn->arg5_type)) - count++; - - /* A reference acquiring function cannot acquire - * another refcounted ptr. - */ - if (may_be_acquire_function(func_id) && count) - return false; - - /* We only support one arg being unreferenced at the moment, - * which is sufficient for the helper functions we have right now. - */ - return count <= 1; -} - static bool check_btf_id_ok(const struct bpf_func_proto *fn) { int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) return false; - if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] && + /* arg_btf_id and arg_size are in a union. */ + (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM || + !(fn->arg_type[i] & MEM_FIXED_SIZE))) return false; } @@ -5985,38 +6560,21 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && - check_btf_id_ok(fn) && - check_refcount_ok(fn, func_id) ? 0 : -EINVAL; + check_btf_id_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. */ -static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, - struct bpf_func_state *state) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_reg_state *regs = state->regs, *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - if (reg_is_pkt_pointer_any(®s[i])) - mark_reg_unknown(env, regs, i); + struct bpf_func_state *state; + struct bpf_reg_state *reg; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(env, reg); - } -} - -static void clear_all_pkt_pointers(struct bpf_verifier_env *env) -{ - struct bpf_verifier_state *vstate = env->cur_state; - int i; - - for (i = 0; i <= vstate->curframe; i++) - __clear_all_pkt_pointers(env, vstate->frame[i]); + })); } enum { @@ -6045,41 +6603,24 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range reg->range = AT_PKT_END; } -static void release_reg_references(struct bpf_verifier_env *env, - struct bpf_func_state *state, - int ref_obj_id) -{ - struct bpf_reg_state *regs = state->regs, *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].ref_obj_id == ref_obj_id) - mark_reg_unknown(env, regs, i); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->ref_obj_id == ref_obj_id) - __mark_reg_unknown(env, reg); - } -} - /* The pointer with the specified id has released its reference to kernel * resources. Identify all copies of the same pointer and clear the reference. */ static int release_reference(struct bpf_verifier_env *env, int ref_obj_id) { - struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state; + struct bpf_reg_state *reg; int err; - int i; err = release_reference_state(cur_func(env), ref_obj_id); if (err) return err; - for (i = 0; i <= vstate->curframe; i++) - release_reg_references(env, vstate->frame[i], ref_obj_id); + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->ref_obj_id == ref_obj_id) + __mark_reg_unknown(env, reg); + })); return 0; } @@ -6127,7 +6668,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_info_aux = env->prog->aux->func_info_aux; if (func_info_aux) is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_subprog_arg_match(env, subprog, caller->regs); + err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; if (is_global) { @@ -6301,6 +6842,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return err; callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6322,6 +6864,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6351,6 +6894,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); return 0; } @@ -6378,6 +6922,30 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); + return 0; +} + +static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void + * callback_ctx, u64 flags); + * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); + callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + + callee->in_callback_fn = true; return 0; } @@ -6405,7 +6973,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller = state->frame[state->curframe]; if (callee->in_callback_fn) { /* enforce R0 return value range [0, 1]. */ - struct tnum range = tnum_range(0, 1); + struct tnum range = callee->callback_ret_range; if (r0->type != SCALAR_VALUE) { verbose(env, "R0 not a scalar value\n"); @@ -6420,10 +6988,17 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller->regs[BPF_REG_0] = *r0; } - /* Transfer references to the caller */ - err = copy_reference_state(caller, callee); - if (err) - return err; + /* callback_fn frame should have released its own additions to parent's + * reference state at this point, or check_reference_leak would + * complain, hence it must be the same as the caller. There is no need + * to copy it back. + */ + if (!callee->in_callback_fn) { + /* Transfer references to the caller */ + err = copy_reference_state(caller, callee); + if (err) + return err; + } *insn_idx = callee->callsite + 1; if (env->log.level & BPF_LOG_LEVEL) { @@ -6456,9 +7031,7 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, ret_reg->s32_max_value = meta->msize_max_value; ret_reg->smin_value = -MAX_ERRNO; ret_reg->s32_min_value = -MAX_ERRNO; - __reg_deduce_bounds(ret_reg); - __reg_bound_offset(ret_reg); - __update_reg_bounds(ret_reg); + reg_bounds_sync(ret_reg); } static int @@ -6476,7 +7049,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_for_each_map_elem && - func_id != BPF_FUNC_redirect_map) + func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_percpu_elem) return 0; if (map == NULL) { @@ -6513,8 +7087,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; struct bpf_reg_state *regs = cur_regs(env), *reg; struct bpf_map *map = meta->map_ptr; - struct tnum range; - u64 val; + u64 val, max; int err; if (func_id != BPF_FUNC_tail_call) @@ -6524,10 +7097,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EINVAL; } - range = tnum_range(0, map->max_entries - 1); reg = ®s[BPF_REG_3]; + val = reg->var_off.value; + max = map->max_entries; - if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) { + if (!(register_is_const(reg) && val < max)) { bpf_map_key_store(aux, BPF_MAP_KEY_POISON); return 0; } @@ -6535,8 +7109,6 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, err = mark_chain_precision(env, BPF_REG_3); if (err) return err; - - val = reg->var_off.value; if (bpf_map_key_unseen(aux)) bpf_map_key_store(aux, val); else if (!bpf_map_key_poisoned(aux) && @@ -6548,13 +7120,20 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, static int check_reference_leak(struct bpf_verifier_env *env) { struct bpf_func_state *state = cur_func(env); + bool refs_lingering = false; int i; + if (state->frameno && !state->in_callback_fn) + return 0; + for (i = 0; i < state->acquired_refs; i++) { + if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); + refs_lingering = true; } - return state->acquired_refs ? -EINVAL : 0; + return refs_lingering ? -EINVAL : 0; } static int check_bpf_snprintf_call(struct bpf_verifier_env *env, @@ -6615,9 +7194,45 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static bool loop_flag_is_zero(struct bpf_verifier_env *env) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = ®s[BPF_REG_4]; + bool reg_is_null = register_is_null(reg); + + if (reg_is_null) + mark_chain_precision(env, BPF_REG_4); + + return reg_is_null; +} + +static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno) +{ + struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state; + + if (!state->initialized) { + state->initialized = 1; + state->fit_for_inline = loop_flag_is_zero(env); + state->callback_subprogno = subprogno; + return; + } + + if (!state->fit_for_inline) + return; + + state->fit_for_inline = (loop_flag_is_zero(env) && + state->callback_subprogno == subprogno); +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); const struct bpf_func_proto *fn = NULL; enum bpf_return_type ret_type; enum bpf_type_flag ret_flag; @@ -6698,8 +7313,35 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (is_release_function(func_id)) { - err = release_reference(env, meta.ref_obj_id); + regs = cur_regs(env); + + if (meta.uninit_dynptr_regno) { + /* we write BPF_DW bits (8 bytes) at a time */ + for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { + err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, + i, BPF_DW, BPF_WRITE, -1, false); + if (err) + return err; + } + + err = mark_stack_slots_dynptr(env, ®s[meta.uninit_dynptr_regno], + fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1], + insn_idx); + if (err) + return err; + } + + if (meta.release_regno) { + err = -EINVAL; + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); + else if (meta.ref_obj_id) + err = release_reference(env, meta.ref_obj_id); + /* meta.ref_obj_id can only be 0 if register that is meant to be + * released is NULL, which must be > R0. + */ + else if (register_is_null(®s[meta.release_regno])) + err = 0; if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -6707,8 +7349,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } - regs = cur_regs(env); - switch (func_id) { case BPF_FUNC_tail_call: err = check_reference_leak(env); @@ -6742,9 +7382,56 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = check_bpf_snprintf_call(env, regs); break; case BPF_FUNC_loop: + update_loop_inline_state(env, meta.subprogno); err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_loop_callback_state); break; + case BPF_FUNC_dynptr_from_mem: + if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { + verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n", + reg_type_str(env, regs[BPF_REG_1].type)); + return -EACCES; + } + break; + case BPF_FUNC_set_retval: + if (prog_type == BPF_PROG_TYPE_LSM && + env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); + return -EINVAL; + } + } + break; + case BPF_FUNC_dynptr_data: + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { + if (arg_type_is_dynptr(fn->arg_type[i])) { + struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; + + if (meta.ref_obj_id) { + verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); + return -EFAULT; + } + + if (base_type(reg->type) != PTR_TO_DYNPTR) + /* Find the id of the dynptr we're + * tracking the reference of + */ + meta.ref_obj_id = stack_slot_get_id(env, reg); + break; + } + } + if (i == MAX_BPF_FUNC_REG_ARGS) { + verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n"); + return -EFAULT; + } + break; + case BPF_FUNC_user_ringbuf_drain: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_user_ringbuf_callback_state); + break; } if (err) @@ -6761,13 +7448,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* update return register (already marked as written above) */ ret_type = fn->ret_type; - ret_flag = type_flag(fn->ret_type); - if (ret_type == RET_INTEGER) { + ret_flag = type_flag(ret_type); + + switch (base_type(ret_type)) { + case RET_INTEGER: /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); - } else if (ret_type == RET_VOID) { + break; + case RET_VOID: regs[BPF_REG_0].type = NOT_INIT; - } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { + break; + case RET_PTR_TO_MAP_VALUE: /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -6786,20 +7477,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn map_value_has_spin_lock(meta.map_ptr)) { regs[BPF_REG_0].id = ++env->id_gen; } - } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { + break; + case RET_PTR_TO_SOCKET: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; - } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { + break; + case RET_PTR_TO_SOCK_COMMON: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; - } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { + break; + case RET_PTR_TO_TCP_SOCK: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; - } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { + break; + case RET_PTR_TO_ALLOC_MEM: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { + break; + case RET_PTR_TO_MEM_OR_BTF_ID: + { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -6831,24 +7528,39 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { + break; + } + case RET_PTR_TO_BTF_ID: + { + struct btf *ret_btf; int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; - ret_btf_id = *fn->ret_btf_id; + if (func_id == BPF_FUNC_kptr_xchg) { + ret_btf = meta.kptr_off_desc->kptr.btf; + ret_btf_id = meta.kptr_off_desc->kptr.btf_id; + } else { + if (fn->ret_btf_id == BPF_PTR_POISON) { + verbose(env, "verifier internal error:"); + verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n", + func_id_name(func_id)); + return -EINVAL; + } + ret_btf = btf_vmlinux; + ret_btf_id = *fn->ret_btf_id; + } if (ret_btf_id == 0) { verbose(env, "invalid return type %u of func %s#%d\n", base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } - /* current BPF helper definitions are only coming from - * built-in code with type IDs from vmlinux BTF - */ - regs[BPF_REG_0].btf = btf_vmlinux; + regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; - } else { + break; + } + default: verbose(env, "unknown return type %u of func %s#%d\n", base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; @@ -6857,7 +7569,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; - if (is_ptr_cast_function(func_id)) { + if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) { + verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n", + func_id_name(func_id), func_id); + return -EFAULT; + } + + if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; } else if (is_acquire_function(func_id, meta.map_ptr)) { @@ -6940,18 +7658,20 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); + struct bpf_kfunc_arg_meta meta = { 0 }; const char *func_name, *ptr_type_name; u32 i, nargs, func_id, ptr_type_id; int err, insn_idx = *insn_idx_p; const struct btf_param *args; struct btf *desc_btf; + u32 *kfunc_flags; bool acq; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) return 0; - desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off); + desc_btf = find_kfunc_desc_btf(env, insn->off); if (IS_ERR(desc_btf)) return PTR_ERR(desc_btf); @@ -6960,18 +7680,23 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_name = btf_name_by_offset(desc_btf, func->name_off); func_proto = btf_type_by_id(desc_btf, func->type); - if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_CHECK, func_id)) { + kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id); + if (!kfunc_flags) { verbose(env, "calling kernel function %s is not allowed\n", func_name); return -EACCES; } + if (*kfunc_flags & KF_DESTRUCTIVE && !capable(CAP_SYS_BOOT)) { + verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capabilities\n"); + return -EACCES; + } + + acq = *kfunc_flags & KF_ACQUIRE; - acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_ACQUIRE, func_id); + meta.flags = *kfunc_flags; /* Check the arguments */ - err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs); + err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta); if (err < 0) return err; /* In case of release function, we get register number of refcounted @@ -6992,7 +7717,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); - if (acq && !btf_type_is_ptr(t)) { + if (acq && !btf_type_is_struct_ptr(desc_btf, t)) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; } @@ -7004,19 +7729,34 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); if (!btf_type_is_struct(ptr_type)) { - ptr_type_name = btf_name_by_offset(desc_btf, - ptr_type->name_off); - verbose(env, "kernel function %s returns pointer type %s %s is not supported\n", - func_name, btf_type_str(ptr_type), - ptr_type_name); - return -EINVAL; + if (!meta.r0_size) { + ptr_type_name = btf_name_by_offset(desc_btf, + ptr_type->name_off); + verbose(env, + "kernel function %s returns pointer type %s %s is not supported\n", + func_name, + btf_type_str(ptr_type), + ptr_type_name); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM; + regs[BPF_REG_0].mem_size = meta.r0_size; + + if (meta.r0_rdonly) + regs[BPF_REG_0].type |= MEM_RDONLY; + + /* Ensures we don't access the memory after a release_reference() */ + if (meta.ref_obj_id) + regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].type = PTR_TO_BTF_ID; + regs[BPF_REG_0].btf_id = ptr_type_id; } - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].type = PTR_TO_BTF_ID; - regs[BPF_REG_0].btf_id = ptr_type_id; - if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_RET_NULL, func_id)) { + if (*kfunc_flags & KF_RET_NULL) { regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ regs[BPF_REG_0].id = ++env->id_gen; @@ -7123,11 +7863,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) -{ - return &env->insn_aux_data[env->insn_idx]; -} - enum { REASON_BOUNDS = -1, REASON_TYPE = -2, @@ -7433,7 +8168,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, dst_reg->off, 1, false)) { + if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -7664,11 +8399,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) return -EINVAL; - - __update_reg_bounds(dst_reg); - __reg_deduce_bounds(dst_reg); - __reg_bound_offset(dst_reg); - + reg_bounds_sync(dst_reg); if (sanitize_check_bounds(env, insn, dst_reg) < 0) return -EACCES; if (sanitize_needed(opcode)) { @@ -8406,10 +9137,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* ALU32 ops are zero extended into 64bit register */ if (alu32) zext_32_to_64(dst_reg); - - __update_reg_bounds(dst_reg); - __reg_deduce_bounds(dst_reg); - __reg_bound_offset(dst_reg); + reg_bounds_sync(dst_reg); return 0; } @@ -8505,7 +9233,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (opcode == BPF_END || opcode == BPF_NEG) { if (opcode == BPF_NEG) { - if (BPF_SRC(insn->code) != 0 || + if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { verbose(env, "BPF_NEG uses reserved fields\n"); @@ -8598,10 +9326,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->dst_reg); } zext_32_to_64(dst_reg); - - __update_reg_bounds(dst_reg); - __reg_deduce_bounds(dst_reg); - __reg_bound_offset(dst_reg); + reg_bounds_sync(dst_reg); } } else { /* case: R = imm @@ -8673,34 +9398,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static void __find_good_pkt_pointers(struct bpf_func_state *state, - struct bpf_reg_state *dst_reg, - enum bpf_reg_type type, int new_range) -{ - struct bpf_reg_state *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) { - reg = &state->regs[i]; - if (reg->type == type && reg->id == dst_reg->id) - /* keep the maximum range already checked */ - reg->range = max(reg->range, new_range); - } - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); - } -} - static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { - int new_range, i; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int new_range; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -8765,9 +9470,11 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * the range won't allow anything. * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. */ - for (i = 0; i <= vstate->curframe; i++) - __find_good_pkt_pointers(vstate->frame[i], dst_reg, type, - new_range); + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + if (reg->type == type && reg->id == dst_reg->id) + /* keep the maximum range already checked */ + reg->range = max(reg->range, new_range); + })); } static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) @@ -9039,26 +9746,33 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, return; switch (opcode) { + /* JEQ/JNE comparison doesn't change the register equivalence. + * + * r1 = r2; + * if (r1 == 42) goto label; + * ... + * label: // here both r1 and r2 are known to be 42. + * + * Hence when marking register as known preserve it's ID. + */ case BPF_JEQ: + if (is_jmp32) { + __mark_reg32_known(true_reg, val32); + true_32off = tnum_subreg(true_reg->var_off); + } else { + ___mark_reg_known(true_reg, val); + true_64off = true_reg->var_off; + } + break; case BPF_JNE: - { - struct bpf_reg_state *reg = - opcode == BPF_JEQ ? true_reg : false_reg; - - /* JEQ/JNE comparison doesn't change the register equivalence. - * r1 = r2; - * if (r1 == 42) goto label; - * ... - * label: // here both r1 and r2 are known to be 42. - * - * Hence when marking register as known preserve it's ID. - */ - if (is_jmp32) - __mark_reg32_known(reg, val32); - else - ___mark_reg_known(reg, val); + if (is_jmp32) { + __mark_reg32_known(false_reg, val32); + false_32off = tnum_subreg(false_reg->var_off); + } else { + ___mark_reg_known(false_reg, val); + false_64off = false_reg->var_off; + } break; - } case BPF_JSET: if (is_jmp32) { false_32off = tnum_and(false_32off, tnum_const(~val32)); @@ -9197,21 +9911,8 @@ static void __reg_combine_min_max(struct bpf_reg_state *src_reg, dst_reg->smax_value); src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off, dst_reg->var_off); - /* We might have learned new bounds from the var_off. */ - __update_reg_bounds(src_reg); - __update_reg_bounds(dst_reg); - /* We might have learned something about the sign bit. */ - __reg_deduce_bounds(src_reg); - __reg_deduce_bounds(dst_reg); - /* We might have learned some bits from the bounds. */ - __reg_bound_offset(src_reg); - __reg_bound_offset(dst_reg); - /* Intersecting with the old var_off might have improved our bounds - * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), - * then new var_off is (0; 0x7f...fc) which improves our umax. - */ - __update_reg_bounds(src_reg); - __update_reg_bounds(dst_reg); + reg_bounds_sync(src_reg); + reg_bounds_sync(dst_reg); } static void reg_combine_min_max(struct bpf_reg_state *true_src, @@ -9262,7 +9963,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset - * in release_reg_references(). + * in release_reference(). * * reg->id is still used by spin_lock ptr. Other * than spin_lock ptr type, reg->id can be reset. @@ -9272,22 +9973,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } -static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id, - bool is_null) -{ - struct bpf_reg_state *reg; - int i; - - for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(state, &state->regs[i], id, is_null); - - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - mark_ptr_or_null_reg(state, reg, id, is_null); - } -} - /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ @@ -9295,10 +9980,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs = state->regs, *reg; u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - int i; if (ref_obj_id && ref_obj_id == id && is_null) /* regs[regno] is in the " == NULL" branch. @@ -9307,8 +9991,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, */ WARN_ON_ONCE(release_reference_state(state, id)); - for (i = 0; i <= vstate->curframe; i++) - __mark_ptr_or_null_regs(vstate->frame[i], id, is_null); + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + mark_ptr_or_null_reg(state, reg, id, is_null); + })); } static bool try_match_pkt_pointers(const struct bpf_insn *insn, @@ -9421,23 +10106,11 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, { struct bpf_func_state *state; struct bpf_reg_state *reg; - int i, j; - for (i = 0; i <= vstate->curframe; i++) { - state = vstate->frame[i]; - for (j = 0; j < MAX_BPF_REG; j++) { - reg = &state->regs[j]; - if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; - } - - bpf_for_each_spilled_reg(j, state, reg) { - if (!reg) - continue; - if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; - } - } + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + })); } static int check_cond_jmp_op(struct bpf_verifier_env *env, @@ -9841,11 +10514,21 @@ static int check_return_code(struct bpf_verifier_env *env) const bool is_subprog = frame->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if (!is_subprog && - (prog_type == BPF_PROG_TYPE_STRUCT_OPS || - prog_type == BPF_PROG_TYPE_LSM) && - !prog->aux->attach_func_proto->type) - return 0; + if (!is_subprog) { + switch (prog_type) { + case BPF_PROG_TYPE_LSM: + if (prog->expected_attach_type == BPF_LSM_CGROUP) + /* See below, can be 0 or 0-1 depending on hook. */ + break; + fallthrough; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!prog->aux->attach_func_proto->type) + return 0; + break; + default: + break; + } + } /* eBPF calling convention is such that R0 is used * to return the value from eBPF program. @@ -9936,6 +10619,22 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_SK_LOOKUP: range = tnum_range(SK_DROP, SK_PASS); break; + + case BPF_PROG_TYPE_LSM: + if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { + /* Regular BPF_PROG_TYPE_LSM programs can return + * any value. + */ + return 0; + } + if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + range = tnum_range(1, 1); + } + break; + case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. @@ -9952,6 +10651,10 @@ static int check_return_code(struct bpf_verifier_env *env) if (!tnum_in(range, reg->var_off)) { verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); + if (prog->expected_attach_type == BPF_LSM_CGROUP && + prog_type == BPF_PROG_TYPE_LSM && + !prog->aux->attach_func_proto->type) + verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } @@ -10363,7 +11066,7 @@ static int check_btf_func(struct bpf_verifier_env *env, goto err_free; ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); scalar_return = - btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type); + btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); goto err_free; @@ -11708,6 +12411,16 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + /* We must do check_reference_leak here before + * prepare_func_exit to handle the case when + * state->curframe > 0, it may be a callback + * function, for which reference_state must + * match caller reference state when it exits. + */ + err = check_reference_leak(env); + if (err) + return err; + if (state->curframe) { /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); @@ -11717,10 +12430,6 @@ static int do_check(struct bpf_verifier_env *env) continue; } - err = check_reference_leak(env); - if (err) - return err; - err = check_return_code(env); if (err) return err; @@ -11933,14 +12642,6 @@ err_put: return err; } -static int check_map_prealloc(struct bpf_map *map) -{ - return (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_PERCPU_HASH && - map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || - !(map->map_flags & BPF_F_NO_PREALLOC); -} - static bool is_tracing_prog_type(enum bpf_prog_type type) { switch (type) { @@ -11948,56 +12649,19 @@ static bool is_tracing_prog_type(enum bpf_prog_type type) case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: return true; default: return false; } } -static bool is_preallocated_map(struct bpf_map *map) -{ - if (!check_map_prealloc(map)) - return false; - if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) - return false; - return true; -} - static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); - /* - * Validate that trace type programs use preallocated hash maps. - * - * For programs attached to PERF events this is mandatory as the - * perf NMI can hit any arbitrary code sequence. - * - * All other trace types using preallocated hash maps are unsafe as - * well because tracepoint or kprobes can be inside locked regions - * of the memory allocator or at a place where a recursion into the - * memory allocator would see inconsistent state. - * - * On RT enabled kernels run-time allocation of all trace type - * programs is strictly prohibited due to lock type constraints. On - * !RT kernels it is allowed for backwards compatibility reasons for - * now, but warnings are emitted so developers are made aware of - * the unsafety and can fix their programs before this is enforced. - */ - if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) { - if (prog_type == BPF_PROG_TYPE_PERF_EVENT) { - verbose(env, "perf_event programs can only use preallocated hash map\n"); - return -EINVAL; - } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - verbose(env, "trace type programs can only use preallocated hash map\n"); - return -EINVAL; - } - WARN_ONCE(1, "trace type BPF program uses run-time allocation\n"); - verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); - } if (map_value_has_spin_lock(map)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { @@ -12044,13 +12708,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: - if (!is_preallocated_map(map)) { - verbose(env, - "Sleepable programs can only use preallocated maps\n"); - return -EINVAL; - } - break; case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: @@ -12822,7 +13481,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (!ctx_access) continue; - switch (env->insn_aux_data[i + delta].ptr_type) { + switch ((int)env->insn_aux_data[i + delta].ptr_type) { case PTR_TO_CTX: if (!ops->convert_ctx_access) continue; @@ -12839,13 +13498,11 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = bpf_xdp_sock_convert_ctx_access; break; case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | PTR_UNTRUSTED: if (type == BPF_READ) { insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); env->prog->aux->num_exentries++; - } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) { - verbose(env, "Writes through BTF pointers are not allowed\n"); - return -EINVAL; } continue; default: @@ -13005,6 +13662,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; func[i]->aux->poke_tab = prog->aux->poke_tab; func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; @@ -13017,9 +13675,6 @@ static int jit_subprogs(struct bpf_verifier_env *env) poke->aux = func[i]->aux; } - /* Use bpf_prog_F_tag to indicate functions in stack traces. - * Long term would need debug info to populate names - */ func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; @@ -13524,7 +14179,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_pop_elem || insn->imm == BPF_FUNC_map_peek_elem || insn->imm == BPF_FUNC_redirect_map || - insn->imm == BPF_FUNC_for_each_map_elem)) { + insn->imm == BPF_FUNC_for_each_map_elem || + insn->imm == BPF_FUNC_map_lookup_percpu_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -13573,6 +14229,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) bpf_callback_t callback_fn, void *callback_ctx, u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem, + (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL)); patch_map_ops_generic: switch (insn->imm) { @@ -13600,6 +14258,9 @@ patch_map_ops_generic: case BPF_FUNC_for_each_map_elem: insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); continue; + case BPF_FUNC_map_lookup_percpu_elem: + insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); + continue; } goto patch_call_imm; @@ -13749,6 +14410,142 @@ patch_call_imm: return 0; } +static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, + int position, + s32 stack_base, + u32 callback_subprogno, + u32 *cnt) +{ + s32 r6_offset = stack_base + 0 * BPF_REG_SIZE; + s32 r7_offset = stack_base + 1 * BPF_REG_SIZE; + s32 r8_offset = stack_base + 2 * BPF_REG_SIZE; + int reg_loop_max = BPF_REG_6; + int reg_loop_cnt = BPF_REG_7; + int reg_loop_ctx = BPF_REG_8; + + struct bpf_prog *new_prog; + u32 callback_start; + u32 call_insn_offset; + s32 callback_offset; + + /* This represents an inlined version of bpf_iter.c:bpf_loop, + * be careful to modify this code in sync. + */ + struct bpf_insn insn_buf[] = { + /* Return error and jump to the end of the patch if + * expected number of iterations is too big. + */ + BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2), + BPF_MOV32_IMM(BPF_REG_0, -E2BIG), + BPF_JMP_IMM(BPF_JA, 0, 0, 16), + /* spill R6, R7, R8 to use these as loop vars */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset), + /* initialize loop vars */ + BPF_MOV64_REG(reg_loop_max, BPF_REG_1), + BPF_MOV32_IMM(reg_loop_cnt, 0), + BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3), + /* loop header, + * if reg_loop_cnt >= reg_loop_max skip the loop body + */ + BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5), + /* callback call, + * correct callback offset would be set after patching + */ + BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt), + BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx), + BPF_CALL_REL(0), + /* increment loop counter */ + BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1), + /* jump to loop header if callback returned 0 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6), + /* return value of bpf_loop, + * set R0 to the number of iterations + */ + BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt), + /* restore original values of R6, R7, R8 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset), + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset), + BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset), + }; + + *cnt = ARRAY_SIZE(insn_buf); + new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt); + if (!new_prog) + return new_prog; + + /* callback start is known only after patching */ + callback_start = env->subprog_info[callback_subprogno].start; + /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ + call_insn_offset = position + 12; + callback_offset = callback_start - call_insn_offset - 1; + new_prog->insnsi[call_insn_offset].imm = callback_offset; + + return new_prog; +} + +static bool is_bpf_loop_call(struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && + insn->imm == BPF_FUNC_loop; +} + +/* For all sub-programs in the program (including main) check + * insn_aux_data to see if there are bpf_loop calls that require + * inlining. If such calls are found the calls are replaced with a + * sequence of instructions produced by `inline_bpf_loop` function and + * subprog stack_depth is increased by the size of 3 registers. + * This stack space is used to spill values of the R6, R7, R8. These + * registers are used to store the loop bound, counter and context + * variables. + */ +static int optimize_bpf_loop(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprogs = env->subprog_info; + int i, cur_subprog = 0, cnt, delta = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + u16 stack_depth = subprogs[cur_subprog].stack_depth; + u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + u16 stack_depth_extra = 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + struct bpf_loop_inline_state *inline_state = + &env->insn_aux_data[i + delta].loop_inline_state; + + if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { + struct bpf_prog *new_prog; + + stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup; + new_prog = inline_bpf_loop(env, + i + delta, + -(stack_depth + stack_depth_extra), + inline_state->callback_subprogno, + &cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + } + + if (subprogs[cur_subprog + 1].start == i + delta + 1) { + subprogs[cur_subprog].stack_depth += stack_depth_extra; + cur_subprog++; + stack_depth = subprogs[cur_subprog].stack_depth; + stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + stack_depth_extra = 0; + } + } + + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; + + return 0; +} + static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl, *sln; @@ -14168,6 +14965,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, fallthrough; case BPF_MODIFY_RETURN: case BPF_LSM_MAC: + case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { @@ -14284,8 +15082,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_LSM) { - verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) { + verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n"); return -EINVAL; } @@ -14486,6 +15284,9 @@ skip_full_check: ret = check_max_stack_depth(env); /* instruction rewrites happen after this point */ + if (ret == 0) + ret = optimize_bpf_loop(env); + if (is_priv) { if (ret == 0) opt_hard_wire_dead_code_branches(env); diff --git a/kernel/cfi.c b/kernel/cfi.c index 9594cfd1cf2c..08caad776717 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -1,329 +1,101 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Clang Control Flow Integrity (CFI) error and slowpath handling. + * Clang Control Flow Integrity (CFI) error handling. * - * Copyright (C) 2021 Google LLC + * Copyright (C) 2022 Google LLC */ -#include <linux/hardirq.h> -#include <linux/kallsyms.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/printk.h> -#include <linux/ratelimit.h> -#include <linux/rcupdate.h> -#include <linux/vmalloc.h> -#include <asm/cacheflush.h> -#include <asm/set_memory.h> +#include <linux/cfi.h> -/* Compiler-defined handler names */ -#ifdef CONFIG_CFI_PERMISSIVE -#define cfi_failure_handler __ubsan_handle_cfi_check_fail -#else -#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort -#endif - -static inline void handle_cfi_failure(void *ptr) +enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, + unsigned long *target, u32 type) { - if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) - WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr); + if (target) + pr_err("CFI failure at %pS (target: %pS; expected type: 0x%08x)\n", + (void *)addr, (void *)*target, type); else - panic("CFI failure (target: %pS)\n", ptr); -} - -#ifdef CONFIG_MODULES -#ifdef CONFIG_CFI_CLANG_SHADOW -/* - * Index type. A 16-bit index can address at most (2^16)-2 pages (taking - * into account SHADOW_INVALID), i.e. ~256M with 4k pages. - */ -typedef u16 shadow_t; -#define SHADOW_INVALID ((shadow_t)~0UL) - -struct cfi_shadow { - /* Page index for the beginning of the shadow */ - unsigned long base; - /* An array of __cfi_check locations (as indices to the shadow) */ - shadow_t shadow[1]; -} __packed; - -/* - * The shadow covers ~128M from the beginning of the module region. If - * the region is larger, we fall back to __module_address for the rest. - */ -#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT) - -/* The in-memory size of struct cfi_shadow, always at least one page */ -#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT) -#define SHADOW_PAGES max(1UL, __SHADOW_PAGES) -#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT) - -/* The actual size of the shadow array, minus metadata */ -#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow)) -#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t)) - -static DEFINE_MUTEX(shadow_update_lock); -static struct cfi_shadow __rcu *cfi_shadow __read_mostly; - -/* Returns the index in the shadow for the given address */ -static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr) -{ - unsigned long index; - unsigned long page = ptr >> PAGE_SHIFT; - - if (unlikely(page < s->base)) - return -1; /* Outside of module area */ - - index = page - s->base; - - if (index >= SHADOW_ARR_SLOTS) - return -1; /* Cannot be addressed with shadow */ - - return (int)index; -} - -/* Returns the page address for an index in the shadow */ -static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s, - int index) -{ - if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS)) - return 0; - - return (s->base + index) << PAGE_SHIFT; -} + pr_err("CFI failure at %pS (no target information)\n", + (void *)addr); -/* Returns the __cfi_check function address for the given shadow location */ -static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s, - int index) -{ - if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS)) - return 0; - - if (unlikely(s->shadow[index] == SHADOW_INVALID)) - return 0; - - /* __cfi_check is always page aligned */ - return (s->base + s->shadow[index]) << PAGE_SHIFT; -} - -static void prepare_next_shadow(const struct cfi_shadow __rcu *prev, - struct cfi_shadow *next) -{ - int i, index, check; - - /* Mark everything invalid */ - memset(next->shadow, 0xFF, SHADOW_ARR_SIZE); - - if (!prev) - return; /* No previous shadow */ - - /* If the base address didn't change, an update is not needed */ - if (prev->base == next->base) { - memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE); - return; + if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) { + __warn(NULL, 0, (void *)addr, 0, regs, NULL); + return BUG_TRAP_TYPE_WARN; } - /* Convert the previous shadow to the new address range */ - for (i = 0; i < SHADOW_ARR_SLOTS; ++i) { - if (prev->shadow[i] == SHADOW_INVALID) - continue; - - index = ptr_to_shadow(next, shadow_to_ptr(prev, i)); - if (index < 0) - continue; - - check = ptr_to_shadow(next, - shadow_to_check_fn(prev, prev->shadow[i])); - if (check < 0) - continue; - - next->shadow[index] = (shadow_t)check; - } + return BUG_TRAP_TYPE_BUG; } -static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod, - unsigned long min_addr, unsigned long max_addr) +#ifdef CONFIG_ARCH_USES_CFI_TRAPS +static inline unsigned long trap_address(s32 *p) { - int check_index; - unsigned long check = (unsigned long)mod->cfi_check; - unsigned long ptr; - - if (unlikely(!PAGE_ALIGNED(check))) { - pr_warn("cfi: not using shadow for module %s\n", mod->name); - return; - } - - check_index = ptr_to_shadow(s, check); - if (check_index < 0) - return; /* Module not addressable with shadow */ - - /* For each page, store the check function index in the shadow */ - for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) { - int index = ptr_to_shadow(s, ptr); - - if (index >= 0) { - /* Each page must only contain one module */ - WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID); - s->shadow[index] = (shadow_t)check_index; - } - } + return (unsigned long)((long)p + (long)*p); } -static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod, - unsigned long min_addr, unsigned long max_addr) +static bool is_trap(unsigned long addr, s32 *start, s32 *end) { - unsigned long ptr; - - for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) { - int index = ptr_to_shadow(s, ptr); + s32 *p; - if (index >= 0) - s->shadow[index] = SHADOW_INVALID; + for (p = start; p < end; ++p) { + if (trap_address(p) == addr) + return true; } -} -typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *, - unsigned long min_addr, unsigned long max_addr); - -static void update_shadow(struct module *mod, unsigned long base_addr, - update_shadow_fn fn) -{ - struct cfi_shadow *prev; - struct cfi_shadow *next; - unsigned long min_addr, max_addr; - - next = vmalloc(SHADOW_SIZE); - - mutex_lock(&shadow_update_lock); - prev = rcu_dereference_protected(cfi_shadow, - mutex_is_locked(&shadow_update_lock)); - - if (next) { - next->base = base_addr >> PAGE_SHIFT; - prepare_next_shadow(prev, next); - - min_addr = (unsigned long)mod->core_layout.base; - max_addr = min_addr + mod->core_layout.text_size; - fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK); - - set_memory_ro((unsigned long)next, SHADOW_PAGES); - } - - rcu_assign_pointer(cfi_shadow, next); - mutex_unlock(&shadow_update_lock); - synchronize_rcu(); - - if (prev) { - set_memory_rw((unsigned long)prev, SHADOW_PAGES); - vfree(prev); - } -} - -void cfi_module_add(struct module *mod, unsigned long base_addr) -{ - update_shadow(mod, base_addr, add_module_to_shadow); -} - -void cfi_module_remove(struct module *mod, unsigned long base_addr) -{ - update_shadow(mod, base_addr, remove_module_from_shadow); + return false; } -static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s, - unsigned long ptr) +#ifdef CONFIG_MODULES +/* Populates `kcfi_trap(_end)?` fields in `struct module`. */ +void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + struct module *mod) { - int index; + char *secstrings; + unsigned int i; - if (unlikely(!s)) - return NULL; /* No shadow available */ + mod->kcfi_traps = NULL; + mod->kcfi_traps_end = NULL; - index = ptr_to_shadow(s, ptr); - if (index < 0) - return NULL; /* Cannot be addressed with shadow */ + secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - return (cfi_check_fn)shadow_to_check_fn(s, index); -} - -static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr) -{ - cfi_check_fn fn; - - rcu_read_lock_sched_notrace(); - fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr); - rcu_read_unlock_sched_notrace(); - - return fn; -} - -#else /* !CONFIG_CFI_CLANG_SHADOW */ + for (i = 1; i < hdr->e_shnum; i++) { + if (strcmp(secstrings + sechdrs[i].sh_name, "__kcfi_traps")) + continue; -static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr) -{ - return NULL; + mod->kcfi_traps = (s32 *)sechdrs[i].sh_addr; + mod->kcfi_traps_end = (s32 *)(sechdrs[i].sh_addr + sechdrs[i].sh_size); + break; + } } -#endif /* CONFIG_CFI_CLANG_SHADOW */ - -static inline cfi_check_fn find_module_check_fn(unsigned long ptr) +static bool is_module_cfi_trap(unsigned long addr) { - cfi_check_fn fn = NULL; struct module *mod; + bool found = false; rcu_read_lock_sched_notrace(); - mod = __module_address(ptr); - if (mod) - fn = mod->cfi_check; - rcu_read_unlock_sched_notrace(); - return fn; -} - -static inline cfi_check_fn find_check_fn(unsigned long ptr) -{ - cfi_check_fn fn = NULL; - - if (is_kernel_text(ptr)) - return __cfi_check; - - /* - * Indirect call checks can happen when RCU is not watching. Both - * the shadow and __module_address use RCU, so we need to wake it - * up if necessary. - */ - RCU_NONIDLE({ - if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) - fn = find_shadow_check_fn(ptr); + mod = __module_address(addr); + if (mod) + found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end); - if (!fn) - fn = find_module_check_fn(ptr); - }); + rcu_read_unlock_sched_notrace(); - return fn; + return found; } - -void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) +#else /* CONFIG_MODULES */ +static inline bool is_module_cfi_trap(unsigned long addr) { - cfi_check_fn fn = find_check_fn((unsigned long)ptr); - - if (likely(fn)) - fn(id, ptr, diag); - else /* Don't allow unchecked modules */ - handle_cfi_failure(ptr); + return false; } -EXPORT_SYMBOL(__cfi_slowpath_diag); +#endif /* CONFIG_MODULES */ -#else /* !CONFIG_MODULES */ +extern s32 __start___kcfi_traps[]; +extern s32 __stop___kcfi_traps[]; -void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag) +bool is_cfi_trap(unsigned long addr) { - handle_cfi_failure(ptr); /* No modules */ -} -EXPORT_SYMBOL(__cfi_slowpath_diag); + if (is_trap(addr, __start___kcfi_traps, __stop___kcfi_traps)) + return true; -#endif /* CONFIG_MODULES */ - -void cfi_failure_handler(void *data, void *ptr, void *vtable) -{ - handle_cfi_failure(ptr); + return is_module_cfi_trap(addr); } -EXPORT_SYMBOL(cfi_failure_handler); +#endif /* CONFIG_ARCH_USES_CFI_TRAPS */ diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 6e36e854b512..36b740cb3d59 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -12,7 +12,6 @@ #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; -extern bool cgroup_debug; extern void __init enable_debug_cgroup(void); /* @@ -234,6 +233,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn); int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +void cgroup_favor_dynmods(struct cgroup_root *root, bool favor); void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_fs_context *ctx); int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index afc6c0e9c966..ff6a8099eb2a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -59,6 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); + cpus_read_lock(); percpu_down_write(&cgroup_threadgroup_rwsem); for_each_root(root) { struct cgroup *from_cgrp; @@ -72,6 +73,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) break; } percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); mutex_unlock(&cgroup_mutex); return retval; @@ -875,6 +877,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo seq_puts(seq, ",xattr"); if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) seq_puts(seq, ",cpuset_v2_mode"); + if (root->flags & CGRP_ROOT_FAVOR_DYNMODS) + seq_puts(seq, ",favordynmods"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) @@ -898,6 +902,8 @@ enum cgroup1_param { Opt_noprefix, Opt_release_agent, Opt_xattr, + Opt_favordynmods, + Opt_nofavordynmods, }; const struct fs_parameter_spec cgroup1_fs_parameters[] = { @@ -909,6 +915,8 @@ const struct fs_parameter_spec cgroup1_fs_parameters[] = { fsparam_flag ("noprefix", Opt_noprefix), fsparam_string("release_agent", Opt_release_agent), fsparam_flag ("xattr", Opt_xattr), + fsparam_flag ("favordynmods", Opt_favordynmods), + fsparam_flag ("nofavordynmods", Opt_nofavordynmods), {} }; @@ -960,6 +968,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_xattr: ctx->flags |= CGRP_ROOT_XATTR; break; + case Opt_favordynmods: + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + break; + case Opt_nofavordynmods: + ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; + break; case Opt_release_agent: /* Specifying two release agents is forbidden */ if (ctx->release_agent) @@ -1211,8 +1225,11 @@ static int cgroup1_root_to_use(struct fs_context *fc) init_cgroup_root(ctx); ret = cgroup_setup_root(root, ctx->subsys_mask); - if (ret) + if (!ret) + cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS); + else cgroup_free_root(root); + return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index adb820e98f24..b6e3110b3ea7 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(css_set_lock); DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; -bool cgroup_debug __read_mostly; +static bool cgroup_debug __read_mostly; /* * Protects cgroup_idr and css_idr so that IDs can be released without @@ -279,8 +279,6 @@ bool cgroup_ssid_enabled(int ssid) * * - When mounting an existing superblock, mount options should match. * - * - Remount is disallowed. - * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use @@ -765,7 +763,8 @@ struct css_set init_css_set = { .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), - .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), + .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* @@ -1240,7 +1239,8 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); - INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_src_preload_node); + INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in @@ -1307,6 +1307,20 @@ struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) return root_cgrp->root; } +void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) +{ + bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; + + /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ + if (favor && !favoring) { + rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); + root->flags |= CGRP_ROOT_FAVOR_DYNMODS; + } else if (!favor && favoring) { + rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); + root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; + } +} + static int cgroup_init_root_id(struct cgroup_root *root) { int id; @@ -1367,6 +1381,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_root_count--; } + cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); @@ -1376,6 +1391,31 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) +{ + struct cgroup *res_cgroup = NULL; + + if (cset == &init_css_set) { + res_cgroup = &root->cgrp; + } else if (root == &cgrp_dfl_root) { + res_cgroup = cset->dfl_cgrp; + } else { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == root) { + res_cgroup = c; + break; + } + } + } + + return res_cgroup; +} + /* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy @@ -1391,22 +1431,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) rcu_read_lock(); cset = current->nsproxy->cgroup_ns->root_cset; - if (cset == &init_css_set) { - res = &root->cgrp; - } else if (root == &cgrp_dfl_root) { - res = cset->dfl_cgrp; - } else { - struct cgrp_cset_link *link; - - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; + res = __cset_cgroup_from_root(cset, root); - if (c->root == root) { - res = c; - break; - } - } - } rcu_read_unlock(); BUG_ON(!res); @@ -1422,22 +1448,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); - if (cset == &init_css_set) { - res = &root->cgrp; - } else if (root == &cgrp_dfl_root) { - res = cset->dfl_cgrp; - } else { - struct cgrp_cset_link *link; - - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - if (c->root == root) { - res = c; - break; - } - } - } + res = __cset_cgroup_from_root(cset, root); BUG_ON(!res); return res; @@ -1809,6 +1820,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) if (ss->css_rstat_flush) { list_del_rcu(&css->rstat_css_node); + synchronize_rcu(); list_add_rcu(&css->rstat_css_node, &dcgrp->rstat_css_list); } @@ -1864,6 +1876,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, enum cgroup2_param { Opt_nsdelegate, + Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, nr__cgroup2_params @@ -1871,6 +1884,7 @@ enum cgroup2_param { static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), + fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), {} @@ -1890,6 +1904,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; + case Opt_favordynmods: + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + return 0; case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; @@ -1908,6 +1925,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags) else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + cgroup_favor_dynmods(&cgrp_dfl_root, + root_flags & CGRP_ROOT_FAVOR_DYNMODS); + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; else @@ -1924,6 +1944,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS) + seq_puts(seq, ",favordynmods"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) @@ -1974,7 +1996,8 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) cgrp->root = root; init_cgroup_housekeeping(cgrp); - root->flags = ctx->flags; + /* DYNMODS must be modified through cgroup_favor_dynmods() */ + root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent) strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if (ctx->name) @@ -2196,6 +2219,10 @@ static int cgroup_init_fs_context(struct fs_context *fc) put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; + +#ifdef CONFIG_CGROUP_FAVOR_DYNMODS + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; +#endif return 0; } @@ -2344,6 +2371,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) EXPORT_SYMBOL_GPL(task_cgroup_path); /** + * cgroup_attach_lock - Lock for ->attach() + * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * + * cgroup migration sometimes needs to stabilize threadgroups against forks and + * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() + * implementations (e.g. cpuset), also need to disable CPU hotplug. + * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can + * lead to deadlocks. + * + * Bringing up a CPU may involve creating and destroying tasks which requires + * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside + * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while + * write-locking threadgroup_rwsem, the locking order is reversed and we end up + * waiting for an on-going CPU hotplug operation which in turn is waiting for + * the threadgroup_rwsem to be released to create new tasks. For more details: + * + * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu + * + * Resolve the situation by always acquiring cpus_read_lock() before optionally + * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that + * CPU hotplug is disabled on entry. + */ +static void cgroup_attach_lock(bool lock_threadgroup) +{ + cpus_read_lock(); + if (lock_threadgroup) + percpu_down_write(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_attach_unlock - Undo cgroup_attach_lock() + * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + */ +static void cgroup_attach_unlock(bool lock_threadgroup) +{ + if (lock_threadgroup) + percpu_up_write(&cgroup_threadgroup_rwsem); + cpus_read_unlock(); +} + +/** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task * @mgctx: target migration context @@ -2570,10 +2638,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) return -EOPNOTSUPP; - /* mixables don't care */ - if (cgroup_is_mixable(dst_cgrp)) - return 0; - /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. @@ -2597,21 +2661,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { - LIST_HEAD(preloaded); struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); - list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, + mg_src_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_src_preload_node); + put_css_set_locked(cset); + } - list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, + mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; - list_del_init(&cset->mg_preload_node); + list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } @@ -2651,7 +2721,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - if (!list_empty(&src_cset->mg_preload_node)) + if (!list_empty(&src_cset->mg_src_preload_node)) return; src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); @@ -2664,7 +2734,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); + list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** @@ -2689,7 +2759,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, - mg_preload_node) { + mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; @@ -2708,7 +2778,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; - list_del_init(&src_cset->mg_preload_node); + list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; @@ -2716,8 +2786,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) src_cset->mg_dst_cset = dst_cset; - if (list_empty(&dst_cset->mg_preload_node)) - list_add_tail(&dst_cset->mg_preload_node, + if (list_empty(&dst_cset->mg_dst_preload_node)) + list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); @@ -2813,8 +2883,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) - __acquires(&cgroup_threadgroup_rwsem) + bool *threadgroup_locked) { struct task_struct *tsk; pid_t pid; @@ -2831,12 +2900,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); - if (pid || threadgroup) { - percpu_down_write(&cgroup_threadgroup_rwsem); - *locked = true; - } else { - *locked = false; - } + *threadgroup_locked = pid || threadgroup; + cgroup_attach_lock(*threadgroup_locked); rcu_read_lock(); if (pid) { @@ -2867,17 +2932,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, goto out_unlock_rcu; out_unlock_threadgroup: - if (*locked) { - percpu_up_write(&cgroup_threadgroup_rwsem); - *locked = false; - } + cgroup_attach_unlock(*threadgroup_locked); + *threadgroup_locked = false; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool locked) - __releases(&cgroup_threadgroup_rwsem) +void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) { struct cgroup_subsys *ss; int ssid; @@ -2885,8 +2947,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - if (locked) - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(threadgroup_locked); + for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -2941,29 +3003,47 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + bool has_tasks; int ret; lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; + /* + * As cgroup_update_dfl_csses() is only called by + * cgroup_apply_control(). The csses associated with the + * given cgrp will not be affected by changes made to + * its subtree_control file. We can skip them. + */ + if (dsct == cgrp) + continue; + list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); + /* + * We need to write-lock threadgroup_rwsem while migrating tasks. + * However, if there are no source csets for @cgrp, changing its + * controllers isn't gonna produce any task migrations and the + * write-locking can be skipped safely. + */ + has_tasks = !list_empty(&mgctx.preloaded_src_csets); + cgroup_attach_lock(has_tasks); + /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, + mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ @@ -2975,7 +3055,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - percpu_up_write(&cgroup_threadgroup_rwsem); + cgroup_attach_unlock(has_tasks); return ret; } @@ -3609,21 +3689,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -3649,8 +3729,8 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return -EBUSY; } - psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; - new = psi_trigger_create(psi, buf, nbytes, res); + psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + new = psi_trigger_create(psi, buf, res); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); @@ -4292,6 +4372,26 @@ void cgroup_file_notify(struct cgroup_file *cfile) } /** + * cgroup_file_show - show or hide a hidden cgroup file + * @cfile: target cgroup_file obtained by setting cftype->file_offset + * @show: whether to show or hide + */ +void cgroup_file_show(struct cgroup_file *cfile, bool show) +{ + struct kernfs_node *kn; + + spin_lock_irq(&cgroup_file_kn_lock); + kn = cfile->kn; + kernfs_get(kn); + spin_unlock_irq(&cgroup_file_kn_lock); + + if (kn) + kernfs_show(kn, show); + + kernfs_put(kn); +} + +/** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk @@ -4923,13 +5023,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool locked; + bool threadgroup_locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4955,7 +5055,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, threadgroup_locked); out_unlock: cgroup_kn_unlock(of->kn); @@ -5685,7 +5785,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); - if (parent && cgroup_is_threaded(cgrp)) + if (cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); @@ -5842,12 +5942,6 @@ int __init cgroup_init(void) cgroup_rstat_boot(); - /* - * The latency of the synchronize_rcu() is too high for cgroups, - * avoid it at the cost of forcing all readers into the slow path. - */ - rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); - get_user_ns(init_cgroup_ns.user_ns); mutex_lock(&cgroup_mutex); @@ -5975,6 +6069,9 @@ struct cgroup *cgroup_get_from_id(u64 id) if (!kn) goto out; + if (kernfs_type(kn) != KERNFS_DIR) + goto put; + rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); @@ -5982,7 +6079,7 @@ struct cgroup *cgroup_get_from_id(u64 id) cgrp = NULL; rcu_read_unlock(); - +put: kernfs_put(kn); out: return cgrp; @@ -6090,11 +6187,6 @@ static struct cgroup *cgroup_get_from_file(struct file *f) return ERR_CAST(css); cgrp = css->cgroup; - if (!cgroup_on_dfl(cgrp)) { - cgroup_put(cgrp); - return ERR_PTR(-EBADF); - } - return cgrp; } @@ -6759,6 +6851,7 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, { return snprintf(buf, PAGE_SIZE, "nsdelegate\n" + "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n"); } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 9390bfd9f1cd..1f3a55297f39 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2239,7 +2239,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) goto out_unlock; cgroup_taskset_for_each(task, css, tset) { - ret = task_can_attach(task, cs->cpus_allowed); + ret = task_can_attach(task, cs->effective_cpus); if (ret) goto out_unlock; ret = security_task_setscheduler(task); @@ -2289,7 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); - cpus_read_lock(); + lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ percpu_down_write(&cpuset_rwsem); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2343,7 +2343,6 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); - cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ @@ -3390,8 +3389,11 @@ static struct notifier_block cpuset_track_online_nodes_nb = { */ void __init cpuset_init_smp(void) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - top_cpuset.mems_allowed = node_states[N_MEMORY]; + /* + * cpus_allowd/mems_allowed set to v2 values in the initial + * cpuset_bind() call will be reset to v1 values in another + * cpuset_bind() call when v1 cpuset is mounted. + */ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 24b5c2ab5598..793ecff29038 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -3,6 +3,10 @@ #include <linux/sched/cputime.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> + static DEFINE_SPINLOCK(cgroup_rstat_lock); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); @@ -141,6 +145,31 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, return pos; } +/* + * A hook for bpf stat collectors to attach to and flush their stats. + * Together with providing bpf kfuncs for cgroup_rstat_updated() and + * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that + * collect cgroup stats can integrate with rstat for efficient flushing. + * + * A static noinline declaration here could cause the compiler to optimize away + * the function. A global noinline declaration will keep the definition, but may + * optimize away the callsite. Therefore, __weak is needed to ensure that the + * call is still emitted, by telling the compiler that we don't know what the + * function might eventually be. + * + * __diag_* below are needed to dismiss the missing prototype warning. + */ +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "kfuncs which will be used in BPF programs"); + +__weak noinline void bpf_rstat_flush(struct cgroup *cgrp, + struct cgroup *parent, int cpu) +{ +} + +__diag_pop(); + /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) @@ -168,6 +197,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) struct cgroup_subsys_state *css; cgroup_base_stat_flush(pos, cpu); + bpf_rstat_flush(pos, cgroup_parent(pos), cpu); rcu_read_lock(); list_for_each_entry_rcu(css, &pos->rstat_css_list, @@ -310,6 +340,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; +#ifdef CONFIG_SCHED_CORE + dst_bstat->forceidle_sum += src_bstat->forceidle_sum; +#endif } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, @@ -318,6 +351,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, dst_bstat->cputime.utime -= src_bstat->cputime.utime; dst_bstat->cputime.stime -= src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; +#ifdef CONFIG_SCHED_CORE + dst_bstat->forceidle_sum -= src_bstat->forceidle_sum; +#endif } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) @@ -398,6 +434,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, case CPUTIME_SOFTIRQ: rstatc->bstat.cputime.stime += delta_exec; break; +#ifdef CONFIG_SCHED_CORE + case CPUTIME_FORCEIDLE: + rstatc->bstat.forceidle_sum += delta_exec; + break; +#endif default: break; } @@ -411,8 +452,9 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, * with how it is done by __cgroup_account_cputime_field for each bit of * cpu time attributed to a cgroup. */ -static void root_cgroup_cputime(struct task_cputime *cputime) +static void root_cgroup_cputime(struct cgroup_base_stat *bstat) { + struct task_cputime *cputime = &bstat->cputime; int i; cputime->stime = 0; @@ -438,6 +480,10 @@ static void root_cgroup_cputime(struct task_cputime *cputime) cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; + +#ifdef CONFIG_SCHED_CORE + bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; +#endif } } @@ -445,27 +491,61 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; - struct task_cputime cputime; + struct cgroup_base_stat bstat; +#ifdef CONFIG_SCHED_CORE + u64 forceidle_time; +#endif if (cgroup_parent(cgrp)) { cgroup_rstat_flush_hold(cgrp); usage = cgrp->bstat.cputime.sum_exec_runtime; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); +#ifdef CONFIG_SCHED_CORE + forceidle_time = cgrp->bstat.forceidle_sum; +#endif cgroup_rstat_flush_release(); } else { - root_cgroup_cputime(&cputime); - usage = cputime.sum_exec_runtime; - utime = cputime.utime; - stime = cputime.stime; + root_cgroup_cputime(&bstat); + usage = bstat.cputime.sum_exec_runtime; + utime = bstat.cputime.utime; + stime = bstat.cputime.stime; +#ifdef CONFIG_SCHED_CORE + forceidle_time = bstat.forceidle_sum; +#endif } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); do_div(stime, NSEC_PER_USEC); +#ifdef CONFIG_SCHED_CORE + do_div(forceidle_time, NSEC_PER_USEC); +#endif seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n", usage, utime, stime); + +#ifdef CONFIG_SCHED_CORE + seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time); +#endif +} + +/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ +BTF_SET8_START(bpf_rstat_kfunc_ids) +BTF_ID_FLAGS(func, cgroup_rstat_updated) +BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) +BTF_SET8_END(bpf_rstat_kfunc_ids) + +static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_rstat_kfunc_ids, +}; + +static int __init bpf_rstat_kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &bpf_rstat_kfunc_set); } +late_initcall(bpf_rstat_kfunc_init); diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index eb701b2ac72f..44b0f0146a3f 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -7,7 +7,6 @@ # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set -CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder CONFIG_ANDROID_LOW_MEMORY_KILLER=y diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config new file mode 100644 index 000000000000..38a7c5362c9c --- /dev/null +++ b/kernel/configs/rust.config @@ -0,0 +1 @@ +CONFIG_RUST=y diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config new file mode 100644 index 000000000000..6fac5b405334 --- /dev/null +++ b/kernel/configs/x86_debug.config @@ -0,0 +1,17 @@ +CONFIG_X86_DEBUG_FPU=y +CONFIG_LOCK_STAT=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_VM_VMACACHE=y +CONFIG_DEBUG_VM_RB=y +CONFIG_DEBUG_SLAB=y +CONFIG_DEBUG_KMEMLEAK=y +CONFIG_DEBUG_PAGEALLOC=y +CONFIG_SLUB_DEBUG_ON=y +CONFIG_DEBUG_OBJECTS=y +CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1 +CONFIG_GCOV_KERNEL=y +CONFIG_LOCKDEP=y +CONFIG_PROVE_LOCKING=y +CONFIG_SCHEDSTATS=y +CONFIG_NOINSTR_VALIDATION=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config index ff756221f112..436f806aa1ed 100644 --- a/kernel/configs/xen.config +++ b/kernel/configs/xen.config @@ -34,7 +34,6 @@ CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m CONFIG_XEN_SCSI_FRONTEND=m # others CONFIG_XEN_BALLOON=y -CONFIG_XEN_SCRUB_PAGES=y CONFIG_XEN_DEV_EVTCHN=m CONFIG_XEN_BLKDEV_FRONTEND=m CONFIG_XEN_NETDEV_FRONTEND=m diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 36a98c48aedc..77978e372377 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,18 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Context tracking: Probe on high level context boundaries such as kernel - * and userspace. This includes syscalls and exceptions entry/exit. + * Context tracking: Probe on high level context boundaries such as kernel, + * userspace, guest or idle. * * This is used by RCU to remove its dependency on the timer tick while a CPU - * runs in userspace. + * runs in idle, userspace or guest mode. * - * Started by Frederic Weisbecker: + * User/guest tracking started by Frederic Weisbecker: * - * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com> + * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker * * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton, * Steven Rostedt, Peter Zijlstra for suggestions and improvements. * + * RCU extended quiescent state bits imported from kernel/rcu/tree.c + * where the relevant authorship may be found. */ #include <linux/context_tracking.h> @@ -21,6 +23,411 @@ #include <linux/hardirq.h> #include <linux/export.h> #include <linux/kprobes.h> +#include <trace/events/rcu.h> + + +DEFINE_PER_CPU(struct context_tracking, context_tracking) = { +#ifdef CONFIG_CONTEXT_TRACKING_IDLE + .dynticks_nesting = 1, + .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, +#endif + .state = ATOMIC_INIT(RCU_DYNTICKS_IDX), +}; +EXPORT_SYMBOL_GPL(context_tracking); + +#ifdef CONFIG_CONTEXT_TRACKING_IDLE +#define TPS(x) tracepoint_string(x) + +/* Record the current task on dyntick-idle entry. */ +static __always_inline void rcu_dynticks_task_enter(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ +} + +/* Record no current task on dyntick-idle exit. */ +static __always_inline void rcu_dynticks_task_exit(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ +} + +/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ +static __always_inline void rcu_dynticks_task_trace_enter(void) +{ +#ifdef CONFIG_TASKS_TRACE_RCU + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = true; +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ +} + +/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ +static __always_inline void rcu_dynticks_task_trace_exit(void) +{ +#ifdef CONFIG_TASKS_TRACE_RCU + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + current->trc_reader_special.b.need_mb = false; +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ +} + +/* + * Record entry into an extended quiescent state. This is only to be + * called when not already in an extended quiescent state, that is, + * RCU is watching prior to the call to this function and is no longer + * watching upon return. + */ +static noinstr void ct_kernel_exit_state(int offset) +{ + int seq; + + /* + * CPUs seeing atomic_add_return() must see prior RCU read-side + * critical sections, and we also must force ordering with the + * next idle sojourn. + */ + rcu_dynticks_task_trace_enter(); // Before ->dynticks update! + seq = ct_state_inc(offset); + // RCU is no longer watching. Better be in extended quiescent state! + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX)); +} + +/* + * Record exit from an extended quiescent state. This is only to be + * called from an extended quiescent state, that is, RCU is not watching + * prior to the call to this function and is watching upon return. + */ +static noinstr void ct_kernel_enter_state(int offset) +{ + int seq; + + /* + * CPUs seeing atomic_add_return() must see prior idle sojourns, + * and we also must force ordering with the next RCU read-side + * critical section. + */ + seq = ct_state_inc(offset); + // RCU is now watching. Better not be in an extended quiescent state! + rcu_dynticks_task_trace_exit(); // After ->dynticks update! + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX)); +} + +/* + * Enter an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. + * + * We crowbar the ->dynticks_nmi_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. + */ +static void noinstr ct_kernel_exit(bool user, int offset) +{ + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + + WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE); + WRITE_ONCE(ct->dynticks_nmi_nesting, 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + ct_dynticks_nesting() == 0); + if (ct_dynticks_nesting() != 1) { + // RCU will still be watching, so just do accounting and leave. + ct->dynticks_nesting--; + return; + } + + instrumentation_begin(); + lockdep_assert_irqs_disabled(); + trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks()); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); + rcu_preempt_deferred_qs(current); + + // instrumentation for the noinstr ct_kernel_exit_state() + instrument_atomic_write(&ct->state, sizeof(ct->state)); + + instrumentation_end(); + WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + // RCU is watching here ... + ct_kernel_exit_state(offset); + // ... but is no longer watching here. + rcu_dynticks_task_enter(); +} + +/* + * Exit an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. + * + * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to + * allow for the possibility of usermode upcalls messing up our count of + * interrupt nesting level during the busy period that is just now starting. + */ +static void noinstr ct_kernel_enter(bool user, int offset) +{ + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + long oldval; + + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); + oldval = ct_dynticks_nesting(); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); + if (oldval) { + // RCU was already watching, so just do accounting and leave. + ct->dynticks_nesting++; + return; + } + rcu_dynticks_task_exit(); + // RCU is not watching here ... + ct_kernel_enter_state(offset); + // ... but is watching here. + instrumentation_begin(); + + // instrumentation for the noinstr ct_kernel_enter_state() + instrument_atomic_write(&ct->state, sizeof(ct->state)); + + trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks()); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); + WRITE_ONCE(ct->dynticks_nesting, 1); + WARN_ON_ONCE(ct_dynticks_nmi_nesting()); + WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + instrumentation_end(); +} + +/** + * ct_nmi_exit - inform RCU of exit from NMI context + * + * If we are returning from the outermost NMI handler that interrupted an + * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting + * to let the RCU grace-period handling know that the CPU is back to + * being RCU-idle. + * + * If you add or remove a call to ct_nmi_exit(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void noinstr ct_nmi_exit(void) +{ + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + + instrumentation_begin(); + /* + * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * (We are exiting an NMI handler, so RCU better be paying attention + * to us!) + */ + WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0); + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); + + /* + * If the nesting level is not 1, the CPU wasn't RCU-idle, so + * leave it in non-RCU-idle state. + */ + if (ct_dynticks_nmi_nesting() != 1) { + trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2, + ct_dynticks()); + WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */ + ct_dynticks_nmi_nesting() - 2); + instrumentation_end(); + return; + } + + /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks()); + WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + + // instrumentation for the noinstr ct_kernel_exit_state() + instrument_atomic_write(&ct->state, sizeof(ct->state)); + instrumentation_end(); + + // RCU is watching here ... + ct_kernel_exit_state(RCU_DYNTICKS_IDX); + // ... but is no longer watching here. + + if (!in_nmi()) + rcu_dynticks_task_enter(); +} + +/** + * ct_nmi_enter - inform RCU of entry to NMI context + * + * If the CPU was idle from RCU's viewpoint, update ct->state and + * ct->dynticks_nmi_nesting to let the RCU grace-period handling know + * that the CPU is active. This implementation permits nested NMIs, as + * long as the nesting level does not overflow an int. (You will probably + * run out of stack space first.) + * + * If you add or remove a call to ct_nmi_enter(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void noinstr ct_nmi_enter(void) +{ + long incby = 2; + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + + /* Complain about underflow. */ + WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0); + + /* + * If idle from RCU viewpoint, atomically increment ->dynticks + * to mark non-idle and increment ->dynticks_nmi_nesting by one. + * Otherwise, increment ->dynticks_nmi_nesting by two. This means + * if ->dynticks_nmi_nesting is equal to one, we are guaranteed + * to be in the outermost NMI handler that interrupted an RCU-idle + * period (observation due to Andy Lutomirski). + */ + if (rcu_dynticks_curr_cpu_in_eqs()) { + + if (!in_nmi()) + rcu_dynticks_task_exit(); + + // RCU is not watching here ... + ct_kernel_enter_state(RCU_DYNTICKS_IDX); + // ... but is watching here. + + instrumentation_begin(); + // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() + instrument_atomic_read(&ct->state, sizeof(ct->state)); + // instrumentation for the noinstr ct_kernel_enter_state() + instrument_atomic_write(&ct->state, sizeof(ct->state)); + + incby = 1; + } else if (!in_nmi()) { + instrumentation_begin(); + rcu_irq_enter_check_tick(); + } else { + instrumentation_begin(); + } + + trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), + ct_dynticks_nmi_nesting(), + ct_dynticks_nmi_nesting() + incby, ct_dynticks()); + instrumentation_end(); + WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */ + ct_dynticks_nmi_nesting() + incby); + barrier(); +} + +/** + * ct_idle_enter - inform RCU that current CPU is entering idle + * + * Enter idle mode, in other words, -leave- the mode in which RCU + * read-side critical sections can occur. (Though RCU read-side + * critical sections can occur in irq handlers in idle, a possibility + * handled by irq_enter() and irq_exit().) + * + * If you add or remove a call to ct_idle_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. + */ +void noinstr ct_idle_enter(void) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); + ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE); +} +EXPORT_SYMBOL_GPL(ct_idle_enter); + +/** + * ct_idle_exit - inform RCU that current CPU is leaving idle + * + * Exit idle mode, in other words, -enter- the mode in which RCU + * read-side critical sections can occur. + * + * If you add or remove a call to ct_idle_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. + */ +void noinstr ct_idle_exit(void) +{ + unsigned long flags; + + raw_local_irq_save(flags); + ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ct_idle_exit); + +/** + * ct_irq_enter - inform RCU that current CPU is entering irq away from idle + * + * Enter an interrupt handler, which might possibly result in exiting + * idle mode, in other words, entering the mode in which read-side critical + * sections can occur. The caller must have disabled interrupts. + * + * Note that the Linux kernel is fully capable of entering an interrupt + * handler that it never exits, for example when doing upcalls to user mode! + * This code assumes that the idle loop never does upcalls to user mode. + * If your architecture's idle loop does do upcalls to user mode (or does + * anything else that results in unbalanced calls to the irq_enter() and + * irq_exit() functions), RCU will give you what you deserve, good and hard. + * But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. + * + * If you add or remove a call to ct_irq_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. + */ +noinstr void ct_irq_enter(void) +{ + lockdep_assert_irqs_disabled(); + ct_nmi_enter(); +} + +/** + * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle + * + * Exit from an interrupt handler, which might possibly result in entering + * idle mode, in other words, leaving the mode in which read-side critical + * sections can occur. The caller must have disabled interrupts. + * + * This code assumes that the idle loop never does anything that might + * result in unbalanced calls to irq_enter() and irq_exit(). If your + * architecture's idle loop violates this assumption, RCU will give you what + * you deserve, good and hard. But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. + * + * If you add or remove a call to ct_irq_exit(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. + */ +noinstr void ct_irq_exit(void) +{ + lockdep_assert_irqs_disabled(); + ct_nmi_exit(); +} + +/* + * Wrapper for ct_irq_enter() where interrupts are enabled. + * + * If you add or remove a call to ct_irq_enter_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void ct_irq_enter_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + ct_irq_enter(); + local_irq_restore(flags); +} + +/* + * Wrapper for ct_irq_exit() where interrupts are enabled. + * + * If you add or remove a call to ct_irq_exit_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void ct_irq_exit_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + ct_irq_exit(); + local_irq_restore(flags); +} +#else +static __always_inline void ct_kernel_exit(bool user, int offset) { } +static __always_inline void ct_kernel_enter(bool user, int offset) { } +#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ + +#ifdef CONFIG_CONTEXT_TRACKING_USER #define CREATE_TRACE_POINTS #include <trace/events/context_tracking.h> @@ -28,9 +435,6 @@ DEFINE_STATIC_KEY_FALSE(context_tracking_key); EXPORT_SYMBOL_GPL(context_tracking_key); -DEFINE_PER_CPU(struct context_tracking, context_tracking); -EXPORT_SYMBOL_GPL(context_tracking); - static noinstr bool context_tracking_recursion_enter(void) { int recursion; @@ -51,29 +455,32 @@ static __always_inline void context_tracking_recursion_exit(void) } /** - * context_tracking_enter - Inform the context tracking that the CPU is going - * enter user or guest space mode. + * __ct_user_enter - Inform the context tracking that the CPU is going + * to enter user or guest space mode. * * This function must be called right before we switch from the kernel * to user or guest space, when it's guaranteed the remaining kernel * instructions to execute won't use any RCU read side critical section * because this function sets RCU in extended quiescent state. */ -void noinstr __context_tracking_enter(enum ctx_state state) +void noinstr __ct_user_enter(enum ctx_state state) { + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + lockdep_assert_irqs_disabled(); + /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); if (!context_tracking_recursion_enter()) return; - if ( __this_cpu_read(context_tracking.state) != state) { - if (__this_cpu_read(context_tracking.active)) { + if (__ct_state() != state) { + if (ct->active) { /* * At this stage, only low level arch entry code remains and * then we'll run in userspace. We can assume there won't be * any RCU read-side critical section until the next call to - * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency + * user_exit() or ct_irq_enter(). Let's remove RCU's dependency * on the tick. */ if (state == CONTEXT_USER) { @@ -82,35 +489,77 @@ void noinstr __context_tracking_enter(enum ctx_state state) vtime_user_enter(current); instrumentation_end(); } - rcu_user_enter(); + /* + * Other than generic entry implementation, we may be past the last + * rescheduling opportunity in the entry code. Trigger a self IPI + * that will fire and reschedule once we resume in user/guest mode. + */ + rcu_irq_work_resched(); + + /* + * Enter RCU idle mode right before resuming userspace. No use of RCU + * is permitted between this call and rcu_eqs_exit(). This way the + * CPU doesn't need to maintain the tick for RCU maintenance purposes + * when the CPU runs in userspace. + */ + ct_kernel_exit(true, RCU_DYNTICKS_IDX + state); + + /* + * Special case if we only track user <-> kernel transitions for tickless + * cputime accounting but we don't support RCU extended quiescent state. + * In this we case we don't care about any concurrency/ordering. + */ + if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) + atomic_set(&ct->state, state); + } else { + /* + * Even if context tracking is disabled on this CPU, because it's outside + * the full dynticks mask for example, we still have to keep track of the + * context transitions and states to prevent inconsistency on those of + * other CPUs. + * If a task triggers an exception in userspace, sleep on the exception + * handler and then migrate to another CPU, that new CPU must know where + * the exception returns by the time we call exception_exit(). + * This information can only be provided by the previous CPU when it called + * exception_enter(). + * OTOH we can spare the calls to vtime and RCU when context_tracking.active + * is false because we know that CPU is not tickless. + */ + if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { + /* Tracking for vtime only, no concurrent RCU EQS accounting */ + atomic_set(&ct->state, state); + } else { + /* + * Tracking for vtime and RCU EQS. Make sure we don't race + * with NMIs. OTOH we don't care about ordering here since + * RCU only requires RCU_DYNTICKS_IDX increments to be fully + * ordered. + */ + atomic_add(state, &ct->state); + } } - /* - * Even if context tracking is disabled on this CPU, because it's outside - * the full dynticks mask for example, we still have to keep track of the - * context transitions and states to prevent inconsistency on those of - * other CPUs. - * If a task triggers an exception in userspace, sleep on the exception - * handler and then migrate to another CPU, that new CPU must know where - * the exception returns by the time we call exception_exit(). - * This information can only be provided by the previous CPU when it called - * exception_enter(). - * OTOH we can spare the calls to vtime and RCU when context_tracking.active - * is false because we know that CPU is not tickless. - */ - __this_cpu_write(context_tracking.state, state); } context_tracking_recursion_exit(); } -EXPORT_SYMBOL_GPL(__context_tracking_enter); +EXPORT_SYMBOL_GPL(__ct_user_enter); -void context_tracking_enter(enum ctx_state state) +/* + * OBSOLETE: + * This function should be noinstr but the below local_irq_restore() is + * unsafe because it involves illegal RCU uses through tracing and lockdep. + * This is unlikely to be fixed as this function is obsolete. The preferred + * way is to call __context_tracking_enter() through user_enter_irqoff() + * or context_tracking_guest_enter(). It should be the arch entry code + * responsibility to call into context tracking with IRQs disabled. + */ +void ct_user_enter(enum ctx_state state) { unsigned long flags; /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit() * This would mess up the dyntick_nesting count though. And rcu_irq_*() * helpers are enough to protect RCU uses inside the exception. So * just return immediately if we detect we are in an IRQ. @@ -119,21 +568,32 @@ void context_tracking_enter(enum ctx_state state) return; local_irq_save(flags); - __context_tracking_enter(state); + __ct_user_enter(state); local_irq_restore(flags); } -NOKPROBE_SYMBOL(context_tracking_enter); -EXPORT_SYMBOL_GPL(context_tracking_enter); +NOKPROBE_SYMBOL(ct_user_enter); +EXPORT_SYMBOL_GPL(ct_user_enter); -void context_tracking_user_enter(void) +/** + * user_enter_callable() - Unfortunate ASM callable version of user_enter() for + * archs that didn't manage to check the context tracking + * static key from low level code. + * + * This OBSOLETE function should be noinstr but it unsafely calls + * local_irq_restore(), involving illegal RCU uses through tracing and lockdep. + * This is unlikely to be fixed as this function is obsolete. The preferred + * way is to call user_enter_irqoff(). It should be the arch entry code + * responsibility to call into context tracking with IRQs disabled. + */ +void user_enter_callable(void) { user_enter(); } -NOKPROBE_SYMBOL(context_tracking_user_enter); +NOKPROBE_SYMBOL(user_enter_callable); /** - * context_tracking_exit - Inform the context tracking that the CPU is - * exiting user or guest mode and entering the kernel. + * __ct_user_exit - Inform the context tracking that the CPU is + * exiting user or guest mode and entering the kernel. * * This function must be called after we entered the kernel from user or * guest space before any use of RCU read side critical section. This @@ -143,32 +603,64 @@ NOKPROBE_SYMBOL(context_tracking_user_enter); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void noinstr __context_tracking_exit(enum ctx_state state) +void noinstr __ct_user_exit(enum ctx_state state) { + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + if (!context_tracking_recursion_enter()) return; - if (__this_cpu_read(context_tracking.state) == state) { - if (__this_cpu_read(context_tracking.active)) { + if (__ct_state() == state) { + if (ct->active) { /* - * We are going to run code that may use RCU. Inform - * RCU core about that (ie: we may need the tick again). + * Exit RCU idle mode while entering the kernel because it can + * run a RCU read side critical section anytime. */ - rcu_user_exit(); + ct_kernel_enter(true, RCU_DYNTICKS_IDX - state); if (state == CONTEXT_USER) { instrumentation_begin(); vtime_user_exit(current); trace_user_exit(0); instrumentation_end(); } + + /* + * Special case if we only track user <-> kernel transitions for tickless + * cputime accounting but we don't support RCU extended quiescent state. + * In this we case we don't care about any concurrency/ordering. + */ + if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) + atomic_set(&ct->state, CONTEXT_KERNEL); + + } else { + if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { + /* Tracking for vtime only, no concurrent RCU EQS accounting */ + atomic_set(&ct->state, CONTEXT_KERNEL); + } else { + /* + * Tracking for vtime and RCU EQS. Make sure we don't race + * with NMIs. OTOH we don't care about ordering here since + * RCU only requires RCU_DYNTICKS_IDX increments to be fully + * ordered. + */ + atomic_sub(state, &ct->state); + } } - __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } context_tracking_recursion_exit(); } -EXPORT_SYMBOL_GPL(__context_tracking_exit); +EXPORT_SYMBOL_GPL(__ct_user_exit); -void context_tracking_exit(enum ctx_state state) +/* + * OBSOLETE: + * This function should be noinstr but the below local_irq_save() is + * unsafe because it involves illegal RCU uses through tracing and lockdep. + * This is unlikely to be fixed as this function is obsolete. The preferred + * way is to call __context_tracking_exit() through user_exit_irqoff() + * or context_tracking_guest_exit(). It should be the arch entry code + * responsibility to call into context tracking with IRQs disabled. + */ +void ct_user_exit(enum ctx_state state) { unsigned long flags; @@ -176,19 +668,30 @@ void context_tracking_exit(enum ctx_state state) return; local_irq_save(flags); - __context_tracking_exit(state); + __ct_user_exit(state); local_irq_restore(flags); } -NOKPROBE_SYMBOL(context_tracking_exit); -EXPORT_SYMBOL_GPL(context_tracking_exit); +NOKPROBE_SYMBOL(ct_user_exit); +EXPORT_SYMBOL_GPL(ct_user_exit); -void context_tracking_user_exit(void) +/** + * user_exit_callable() - Unfortunate ASM callable version of user_exit() for + * archs that didn't manage to check the context tracking + * static key from low level code. + * + * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(), + * involving illegal RCU uses through tracing and lockdep. This is unlikely + * to be fixed as this function is obsolete. The preferred way is to call + * user_exit_irqoff(). It should be the arch entry code responsibility to + * call into context tracking with IRQs disabled. + */ +void user_exit_callable(void) { user_exit(); } -NOKPROBE_SYMBOL(context_tracking_user_exit); +NOKPROBE_SYMBOL(user_exit_callable); -void __init context_tracking_cpu_set(int cpu) +void __init ct_cpu_track_user(int cpu) { static __initdata bool initialized = false; @@ -212,12 +715,14 @@ void __init context_tracking_cpu_set(int cpu) initialized = true; } -#ifdef CONFIG_CONTEXT_TRACKING_FORCE +#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE void __init context_tracking_init(void) { int cpu; for_each_possible_cpu(cpu) - context_tracking_cpu_set(cpu); + ct_cpu_track_user(cpu); } #endif + +#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 5797c2a7a93f..bbad5e375d3b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -35,6 +35,7 @@ #include <linux/percpu-rwsem.h> #include <linux/cpuset.h> #include <linux/random.h> +#include <linux/cc_platform.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS @@ -71,7 +72,6 @@ struct cpuhp_cpu_state { bool rollback; bool single; bool bringup; - int cpu; struct hlist_node *node; struct hlist_node *last; enum cpuhp_state cb_state; @@ -475,7 +475,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } #endif static inline enum cpuhp_state -cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) +cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; bool bringup = st->state < target; @@ -486,14 +486,15 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) st->target = target; st->single = false; st->bringup = bringup; - if (cpu_dying(st->cpu) != !bringup) - set_cpu_dying(st->cpu, !bringup); + if (cpu_dying(cpu) != !bringup) + set_cpu_dying(cpu, !bringup); return prev_state; } static inline void -cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) +cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state prev_state) { bool bringup = !st->bringup; @@ -520,8 +521,8 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) } st->bringup = bringup; - if (cpu_dying(st->cpu) != !bringup) - set_cpu_dying(st->cpu, !bringup); + if (cpu_dying(cpu) != !bringup) + set_cpu_dying(cpu, !bringup); } /* Regular hotplug invocation of the AP hotplug thread */ @@ -541,15 +542,16 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) wait_for_ap_thread(st, st->bringup); } -static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) +static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state target) { enum cpuhp_state prev_state; int ret; - prev_state = cpuhp_set_state(st, target); + prev_state = cpuhp_set_state(cpu, st, target); __cpuhp_kick_ap(st); if ((ret = st->result)) { - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } @@ -581,7 +583,7 @@ static int bringup_wait_for_ap(unsigned int cpu) if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; - return cpuhp_kick_ap(st, st->target); + return cpuhp_kick_ap(cpu, st, st->target); } static int bringup_cpu(unsigned int cpu) @@ -704,7 +706,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret, cpu, cpuhp_get_step(st->state)->name, st->state); - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); if (can_rollback_cpu(st)) WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, prev_state)); @@ -715,15 +717,6 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, /* * The cpu hotplug threads manage the bringup and teardown of the cpus */ -static void cpuhp_create(unsigned int cpu) -{ - struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - - init_completion(&st->done_up); - init_completion(&st->done_down); - st->cpu = cpu; -} - static int cpuhp_should_run(unsigned int cpu) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); @@ -875,7 +868,7 @@ static int cpuhp_kick_ap_work(unsigned int cpu) cpuhp_lock_release(true); trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); - ret = cpuhp_kick_ap(st, st->target); + ret = cpuhp_kick_ap(cpu, st, st->target); trace_cpuhp_exit(cpu, st->state, prev_state, ret); return ret; @@ -883,15 +876,27 @@ static int cpuhp_kick_ap_work(unsigned int cpu) static struct smp_hotplug_thread cpuhp_threads = { .store = &cpuhp_state.thread, - .create = &cpuhp_create, .thread_should_run = cpuhp_should_run, .thread_fn = cpuhp_thread_fun, .thread_comm = "cpuhp/%u", .selfparking = true, }; +static __init void cpuhp_init_state(void) +{ + struct cpuhp_cpu_state *st; + int cpu; + + for_each_possible_cpu(cpu) { + st = per_cpu_ptr(&cpuhp_state, cpu); + init_completion(&st->done_up); + init_completion(&st->done_down); + } +} + void __init cpuhp_threads_init(void) { + cpuhp_init_state(); BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); kthread_unpark(this_cpu_read(cpuhp_state.thread)); } @@ -1107,7 +1112,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret, cpu, cpuhp_get_step(st->state)->name, st->state); - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); if (st->state < prev_state) WARN_ON(cpuhp_invoke_callback_range(true, cpu, st, @@ -1134,7 +1139,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, cpuhp_tasks_frozen = tasks_frozen; - prev_state = cpuhp_set_state(st, target); + prev_state = cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. @@ -1165,7 +1170,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state < prev_state) { if (st->state == CPUHP_TEARDOWN_CPU) { - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } else { WARN(1, "DEAD callback error for CPU%d", cpu); @@ -1186,6 +1191,12 @@ out: static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) { + /* + * If the platform does not support hotplug, report it explicitly to + * differentiate it from a transient offlining failure. + */ + if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED)) + return -EOPNOTSUPP; if (cpu_hotplug_disabled) return -EBUSY; return _cpu_down(cpu, 0, target); @@ -1352,7 +1363,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - cpuhp_set_state(st, target); + cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 246efc74e3f3..ba4ba71facf9 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -35,11 +35,11 @@ static int cpu_pm_notify(enum cpu_pm_event event) * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know * this. */ - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); rcu_read_lock(); ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL); rcu_read_unlock(); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); return notifier_to_errno(ret); } @@ -49,11 +49,11 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev unsigned long flags; int ret; - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL); raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); return notifier_to_errno(ret); } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 256cf6db573c..a0eb4d5cf557 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -9,12 +9,15 @@ #include <linux/init.h> #include <linux/utsname.h> #include <linux/vmalloc.h> +#include <linux/sizes.h> #include <asm/page.h> #include <asm/sections.h> #include <crypto/sha1.h> +#include "kallsyms_internal.h" + /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; @@ -43,6 +46,15 @@ static int __init parse_crashkernel_mem(char *cmdline, unsigned long long *crash_base) { char *cur = cmdline, *tmp; + unsigned long long total_mem = system_ram; + + /* + * Firmware sometimes reserves some memory regions for its own use, + * so the system memory size is less than the actual physical memory + * size. Work around this by rounding up the total size to 128M, + * which is enough for most test cases. + */ + total_mem = roundup(total_mem, SZ_128M); /* for each entry of the comma-separated list */ do { @@ -87,13 +99,13 @@ static int __init parse_crashkernel_mem(char *cmdline, return -EINVAL; } cur = tmp; - if (size >= system_ram) { + if (size >= total_mem) { pr_warn("crashkernel: invalid size\n"); return -EINVAL; } /* match ? */ - if (system_ram >= start && system_ram < end) { + if (total_mem >= start && total_mem < end) { *crash_size = size; break; } @@ -222,9 +234,6 @@ next: p = strstr(p+1, name); } - if (!ck_cmdline) - return NULL; - return ck_cmdline; } @@ -243,9 +252,8 @@ static int __init __parse_crashkernel(char *cmdline, *crash_base = 0; ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - if (!ck_cmdline) - return -EINVAL; + return -ENOENT; ck_cmdline += strlen(name); @@ -484,6 +492,19 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif +#ifdef CONFIG_KALLSYMS + VMCOREINFO_SYMBOL(kallsyms_names); + VMCOREINFO_SYMBOL(kallsyms_num_syms); + VMCOREINFO_SYMBOL(kallsyms_token_table); + VMCOREINFO_SYMBOL(kallsyms_token_index); +#ifdef CONFIG_KALLSYMS_BASE_RELATIVE + VMCOREINFO_SYMBOL(kallsyms_offsets); + VMCOREINFO_SYMBOL(kallsyms_relative_base); +#else + VMCOREINFO_SYMBOL(kallsyms_addresses); +#endif /* CONFIG_KALLSYMS_BASE_RELATIVE */ +#endif /* CONFIG_KALLSYMS */ + arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index da06a5553835..7beceb447211 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -53,6 +53,7 @@ #include <linux/vmacache.h> #include <linux/rcupdate.h> #include <linux/irq.h> +#include <linux/security.h> #include <asm/cacheflush.h> #include <asm/byteorder.h> @@ -752,6 +753,29 @@ cpu_master_loop: continue; kgdb_connected = 0; } else { + /* + * This is a brutal way to interfere with the debugger + * and prevent gdb being used to poke at kernel memory. + * This could cause trouble if lockdown is applied when + * there is already an active gdb session. For now the + * answer is simply "don't do that". Typically lockdown + * *will* be applied before the debug core gets started + * so only developers using kgdb for fairly advanced + * early kernel debug can be biten by this. Hopefully + * they are sophisticated enough to take care of + * themselves, especially with help from the lockdown + * message printed on the console! + */ + if (security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL)) { + if (IS_ENABLED(CONFIG_KGDB_KDB)) { + /* Switch back to kdb if possible... */ + dbg_kdb_mode = 1; + continue; + } else { + /* ... otherwise just bail */ + break; + } + } error = gdb_serial_stub(ks); } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6735ac36b718..67d3c48a1522 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -9,7 +9,6 @@ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. */ -#include <linux/module.h> #include <linux/types.h> #include <linux/ctype.h> #include <linux/kernel.h> diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index f877a0a0d7cf..f87c750d3eb3 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -11,7 +11,6 @@ #include <linux/kdb.h> #include <linux/keyboard.h> #include <linux/ctype.h> -#include <linux/module.h> #include <linux/io.h> /* Keyboard Controller Registers on normal PCs. */ diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0852a537dad4..438b868cbfa9 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -26,7 +26,6 @@ #include <linux/utsname.h> #include <linux/vmalloc.h> #include <linux/atomic.h> -#include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/init.h> @@ -45,6 +44,7 @@ #include <linux/proc_fs.h> #include <linux/uaccess.h> #include <linux/slab.h> +#include <linux/security.h> #include "kdb_private.h" #undef MODULE_PARAM_PREFIX @@ -166,10 +166,62 @@ struct task_struct *kdb_curr_task(int cpu) } /* - * Check whether the flags of the current command and the permissions - * of the kdb console has allow a command to be run. + * Update the permissions flags (kdb_cmd_enabled) to match the + * current lockdown state. + * + * Within this function the calls to security_locked_down() are "lazy". We + * avoid calling them if the current value of kdb_cmd_enabled already excludes + * flags that might be subject to lockdown. Additionally we deliberately check + * the lockdown flags independently (even though read lockdown implies write + * lockdown) since that results in both simpler code and clearer messages to + * the user on first-time debugger entry. + * + * The permission masks during a read+write lockdown permits the following + * flags: INSPECT, SIGNAL, REBOOT (and ALWAYS_SAFE). + * + * The INSPECT commands are not blocked during lockdown because they are + * not arbitrary memory reads. INSPECT covers the backtrace family (sometimes + * forcing them to have no arguments) and lsmod. These commands do expose + * some kernel state but do not allow the developer seated at the console to + * choose what state is reported. SIGNAL and REBOOT should not be controversial, + * given these are allowed for root during lockdown already. + */ +static void kdb_check_for_lockdown(void) +{ + const int write_flags = KDB_ENABLE_MEM_WRITE | + KDB_ENABLE_REG_WRITE | + KDB_ENABLE_FLOW_CTRL; + const int read_flags = KDB_ENABLE_MEM_READ | + KDB_ENABLE_REG_READ; + + bool need_to_lockdown_write = false; + bool need_to_lockdown_read = false; + + if (kdb_cmd_enabled & (KDB_ENABLE_ALL | write_flags)) + need_to_lockdown_write = + security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL); + + if (kdb_cmd_enabled & (KDB_ENABLE_ALL | read_flags)) + need_to_lockdown_read = + security_locked_down(LOCKDOWN_DBG_READ_KERNEL); + + /* De-compose KDB_ENABLE_ALL if required */ + if (need_to_lockdown_write || need_to_lockdown_read) + if (kdb_cmd_enabled & KDB_ENABLE_ALL) + kdb_cmd_enabled = KDB_ENABLE_MASK & ~KDB_ENABLE_ALL; + + if (need_to_lockdown_write) + kdb_cmd_enabled &= ~write_flags; + + if (need_to_lockdown_read) + kdb_cmd_enabled &= ~read_flags; +} + +/* + * Check whether the flags of the current command, the permissions of the kdb + * console and the lockdown state allow a command to be run. */ -static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, +static bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, bool no_args) { /* permissions comes from userspace so needs massaging slightly */ @@ -1180,6 +1232,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_curr_task(raw_smp_processor_id()); KDB_DEBUG_STATE("kdb_local 1", reason); + + kdb_check_for_lockdown(); + kdb_go_count = 0; if (reason == KDB_REASON_DEBUG) { /* special case below */ @@ -2004,54 +2059,6 @@ static int kdb_ef(int argc, const char **argv) return 0; } -#if defined(CONFIG_MODULES) -/* - * kdb_lsmod - This function implements the 'lsmod' command. Lists - * currently loaded kernel modules. - * Mostly taken from userland lsmod. - */ -static int kdb_lsmod(int argc, const char **argv) -{ - struct module *mod; - - if (argc != 0) - return KDB_ARGCOUNT; - - kdb_printf("Module Size modstruct Used by\n"); - list_for_each_entry(mod, kdb_modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - - kdb_printf("%-20s%8u 0x%px ", mod->name, - mod->core_layout.size, (void *)mod); -#ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4d ", module_refcount(mod)); -#endif - if (mod->state == MODULE_STATE_GOING) - kdb_printf(" (Unloading)"); - else if (mod->state == MODULE_STATE_COMING) - kdb_printf(" (Loading)"); - else - kdb_printf(" (Live)"); - kdb_printf(" 0x%px", mod->core_layout.base); - -#ifdef CONFIG_MODULE_UNLOAD - { - struct module_use *use; - kdb_printf(" [ "); - list_for_each_entry(use, &mod->source_list, - source_list) - kdb_printf("%s ", use->target->name); - kdb_printf("]\n"); - } -#endif - } - - return 0; -} - -#endif /* CONFIG_MODULES */ - /* * kdb_env - This function implements the 'env' command. Display the * current environment variables. diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 0d2f9feea0a4..1f8c519a5f81 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -226,10 +226,6 @@ extern void kdb_kbd_cleanup_state(void); #define kdb_kbd_cleanup_state() #endif /* ! CONFIG_KDB_KEYBOARD */ -#ifdef CONFIG_MODULES -extern struct list_head *kdb_modules; -#endif /* CONFIG_MODULES */ - extern char kdb_prompt_str[]; #define KDB_WORD_SIZE ((int)sizeof(unsigned long)) diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 85cb51c4a17e..0a39497140bf 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -17,7 +17,6 @@ #include <linux/stddef.h> #include <linux/vmalloc.h> #include <linux/ptrace.h> -#include <linux/module.h> #include <linux/highmem.h> #include <linux/hardirq.h> #include <linux/delay.h> diff --git a/kernel/delayacct.c b/kernel/delayacct.c index c5e8cea9e05f..164ed9ef77a3 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -44,7 +44,7 @@ void delayacct_init(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, +static int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int state = delayacct_on; @@ -63,6 +63,26 @@ int sysctl_delayacct(struct ctl_table *table, int write, void *buffer, set_delayacct(state); return err; } + +static struct ctl_table kern_delayacct_table[] = { + { + .procname = "task_delayacct", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_delayacct, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int kernel_delayacct_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_delayacct_table); + return 0; +} +late_initcall(kernel_delayacct_sysctls_init); #endif void __delayacct_tsk_init(struct task_struct *tsk) @@ -157,11 +177,14 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; tmp = d->compact_delay_total + tsk->delays->compact_delay; d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; + tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; + d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; d->compact_count += tsk->delays->compact_count; + d->wpcopy_count += tsk->delays->wpcopy_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -229,3 +252,16 @@ void __delayacct_compact_end(void) ¤t->delays->compact_delay, ¤t->delays->compact_count); } + +void __delayacct_wpcopy_start(void) +{ + current->delays->wpcopy_start = local_clock(); +} + +void __delayacct_wpcopy_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->wpcopy_start, + ¤t->delays->wpcopy_delay, + ¤t->delays->wpcopy_count); +} diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 375fb3c9538d..c21abc77c53e 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -74,7 +74,7 @@ out_unmap_membase: return ERR_PTR(-ENOMEM); } -static void dma_release_coherent_memory(struct dma_coherent_mem *mem) +static void _dma_release_coherent_memory(struct dma_coherent_mem *mem) { if (!mem) return; @@ -126,10 +126,16 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, ret = dma_assign_coherent_memory(dev, mem); if (ret) - dma_release_coherent_memory(mem); + _dma_release_coherent_memory(mem); return ret; } +void dma_release_coherent_memory(struct device *dev) +{ + if (dev) + _dma_release_coherent_memory(dev->dma_mem); +} + static void *__dma_alloc_from_coherent(struct device *dev, struct dma_coherent_mem *mem, ssize_t size, dma_addr_t *dma_handle) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index f8ff598596b8..18c93c2276ca 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -350,11 +350,10 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, unsigned long *flags) { - unsigned int max_range = dma_get_max_seg_size(ref->dev); struct dma_debug_entry *entry, index = *ref; - unsigned int range = 0; + int limit = min(HASH_SIZE, (index.dev_addr >> HASH_FN_SHIFT) + 1); - while (range <= max_range) { + for (int i = 0; i < limit; i++) { entry = __hash_bucket_find(*bucket, ref, containing_match); if (entry) @@ -364,7 +363,6 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, * Nothing found, go back a hash bucket */ put_hash_bucket(*bucket, *flags); - range += (1 << HASH_FN_SHIFT); index.dev_addr -= (1 << HASH_FN_SHIFT); *bucket = get_hash_bucket(&index, flags); } @@ -448,7 +446,7 @@ void debug_dma_dump_mappings(struct device *dev) * other hand, consumes a single dma_debug_entry, but inserts 'nents' * entries into the tree. */ -static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); +static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC); static DEFINE_SPINLOCK(radix_lock); #define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) #define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) @@ -564,7 +562,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) rc = active_cacheline_insert(entry); if (rc == -ENOMEM) { - pr_err("cacheline tracking ENOMEM, dma-debug disabled\n"); + pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { err_printk(entry->dev, entry, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 9743c6ccce1a..63859a101ed8 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -79,7 +79,7 @@ static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size) { if (!force_dma_unencrypted(dev)) return 0; - return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size)); + return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size)); } static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) @@ -88,7 +88,7 @@ static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) if (!force_dma_unencrypted(dev)) return 0; - ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size)); + ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size)); if (ret) pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n"); return ret; @@ -115,7 +115,7 @@ static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size) } static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - gfp_t gfp) + gfp_t gfp, bool allow_highmem) { int node = dev_to_node(dev); struct page *page = NULL; @@ -129,9 +129,12 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_free_contiguous(dev, page, size); - page = NULL; + if (page) { + if (!dma_coherent_ok(dev, page_to_phys(page), size) || + (!allow_highmem && PageHighMem(page))) { + dma_free_contiguous(dev, page, size); + page = NULL; + } } again: if (!page) @@ -189,7 +192,7 @@ static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size, { struct page *page; - page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true); if (!page) return NULL; @@ -262,7 +265,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ - page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true); if (!page) return NULL; @@ -354,7 +357,7 @@ void dma_direct_free(struct device *dev, size_t size, } else { if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) arch_dma_clear_uncached(cpu_addr, size); - if (dma_set_encrypted(dev, cpu_addr, 1 << page_order)) + if (dma_set_encrypted(dev, cpu_addr, size)) return; } @@ -370,19 +373,9 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); - page = __dma_direct_alloc_pages(dev, size, gfp); + page = __dma_direct_alloc_pages(dev, size, gfp, false); if (!page) return NULL; - if (PageHighMem(page)) { - /* - * Depending on the cma= arguments and per-arch setup - * dma_alloc_contiguous could return highmem pages. - * Without remapping there is no way to return them here, - * so log an error and fail. - */ - dev_info(dev, "Rejecting highmem page from CMA.\n"); - goto out_free_pages; - } ret = page_address(page); if (dma_set_decrypted(dev, ret, size)) @@ -399,7 +392,6 @@ void dma_direct_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_addr, enum dma_data_direction dir) { - unsigned int page_order = get_order(size); void *vaddr = page_address(page); /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ @@ -407,7 +399,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, dma_free_from_pool(dev, vaddr, size)) return; - if (dma_set_encrypted(dev, vaddr, 1 << page_order)) + if (dma_set_encrypted(dev, vaddr, size)) return; __dma_direct_free_pages(dev, page, size); } @@ -461,29 +453,60 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu_all(); } +/* + * Unmaps segments, except for ones marked as pci_p2pdma which do not + * require any further action as they contain a bus address. + */ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; - for_each_sg(sgl, sg, nents, i) - dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, - attrs); + for_each_sg(sgl, sg, nents, i) { + if (sg_is_dma_bus_address(sg)) + sg_dma_unmark_bus_address(sg); + else + dma_direct_unmap_page(dev, sg->dma_address, + sg_dma_len(sg), dir, attrs); + } } #endif int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { - int i; + struct pci_p2pdma_map_state p2pdma_state = {}; + enum pci_p2pdma_map_type map; struct scatterlist *sg; + int i, ret; for_each_sg(sgl, sg, nents, i) { + if (is_pci_p2pdma_page(sg_page(sg))) { + map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg); + switch (map) { + case PCI_P2PDMA_MAP_BUS_ADDR: + continue; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * Any P2P mapping that traverses the PCI + * host bridge must be mapped with CPU physical + * address and not PCI bus addresses. This is + * done with dma_direct_map_page() below. + */ + break; + default: + ret = -EREMOTEIO; + goto out_unmap; + } + } + sg->dma_address = dma_direct_map_page(dev, sg_page(sg), sg->offset, sg->length, dir, attrs); - if (sg->dma_address == DMA_MAPPING_ERROR) + if (sg->dma_address == DMA_MAPPING_ERROR) { + ret = -EIO; goto out_unmap; + } sg_dma_len(sg) = sg->length; } @@ -491,7 +514,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return -EIO; + return ret; } dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 4632b0f4f72e..e38ffc5e6bdd 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -8,6 +8,7 @@ #define _KERNEL_DMA_DIRECT_H #include <linux/dma-direct.h> +#include <linux/memremap.h> int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, @@ -87,11 +88,16 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (is_swiotlb_force_bounce(dev)) + if (is_swiotlb_force_bounce(dev)) { + if (is_pci_p2pdma_page(page)) + return DMA_MAPPING_ERROR; return swiotlb_map(dev, phys, size, dir, attrs); + } if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - if (swiotlb_force != SWIOTLB_NO_FORCE) + if (is_pci_p2pdma_page(page)) + return DMA_MAPPING_ERROR; + if (is_swiotlb_active(dev)) return swiotlb_map(dev, phys, size, dir, attrs); dev_WARN_ONCE(dev, 1, @@ -114,6 +120,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(dev, phys))) - swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); } #endif /* _KERNEL_DMA_DIRECT_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index db7244291b74..27f272381cf2 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -197,7 +197,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, if (ents > 0) debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && - ents != -EIO)) + ents != -EIO && ents != -EREMOTEIO)) return -EIO; return ents; @@ -249,12 +249,15 @@ EXPORT_SYMBOL(dma_map_sg_attrs); * Returns 0 on success or a negative error code on error. The following * error codes are supported with the given meaning: * - * -EINVAL An invalid argument, unaligned access or other error - * in usage. Will not succeed if retried. - * -ENOMEM Insufficient resources (like memory or IOVA space) to - * complete the mapping. Should succeed if retried later. - * -EIO Legacy error code with an unknown meaning. eg. this is - * returned if a lower level call returned DMA_MAPPING_ERROR. + * -EINVAL An invalid argument, unaligned access or other error + * in usage. Will not succeed if retried. + * -ENOMEM Insufficient resources (like memory or IOVA space) to + * complete the mapping. Should succeed if retried later. + * -EIO Legacy error code with an unknown meaning. eg. this is + * returned if a lower level call returned + * DMA_MAPPING_ERROR. + * -EREMOTEIO The DMA device cannot access P2PDMA memory specified + * in the sg_table. This will not succeed if retried. */ int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) @@ -704,7 +707,7 @@ int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); -int dma_supported(struct device *dev, u64 mask) +static int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -718,7 +721,24 @@ int dma_supported(struct device *dev, u64 mask) return 1; return ops->dma_supported(dev, mask); } -EXPORT_SYMBOL(dma_supported); + +bool dma_pci_p2pdma_supported(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + /* if ops is not set, dma direct will be used which supports P2PDMA */ + if (!ops) + return true; + + /* + * Note: dma_ops_bypass is not checked here because P2PDMA should + * not be used with dma mapping ops that do not have support even + * if the specific device is bypassing them. + */ + + return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED; +} +EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported); #ifdef CONFIG_ARCH_HAS_DMA_SET_MASK void arch_dma_set_mask(struct device *dev, u64 mask); @@ -773,6 +793,18 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +size_t dma_opt_mapping_size(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + size_t size = SIZE_MAX; + + if (ops && ops->opt_mapping_size) + size = ops->opt_mapping_size(); + + return min(dma_max_mapping_size(dev), size); +} +EXPORT_SYMBOL_GPL(dma_opt_mapping_size); + bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 73a41cec9e38..0ef6b12f961d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -62,19 +62,76 @@ #define INVALID_PHYS_ADDR (~(phys_addr_t)0) -enum swiotlb_force swiotlb_force; +struct io_tlb_slot { + phys_addr_t orig_addr; + size_t alloc_size; + unsigned int list; +}; + +static bool swiotlb_force_bounce; +static bool swiotlb_force_disable; struct io_tlb_mem io_tlb_default_mem; phys_addr_t swiotlb_unencrypted_base; +static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; +static unsigned long default_nareas; + +/** + * struct io_tlb_area - IO TLB memory area descriptor + * + * This is a single area with a single lock. + * + * @used: The number of used IO TLB block. + * @index: The slot index to start searching in this area for next round. + * @lock: The lock to protect the above data structures in the map and + * unmap calls. + */ +struct io_tlb_area { + unsigned long used; + unsigned int index; + spinlock_t lock; +}; + /* - * Max segment that we can provide which (if pages are contingous) will - * not be bounced (unless SWIOTLB_FORCE is set). + * Round up number of slabs to the next power of 2. The last area is going + * be smaller than the rest if default_nslabs is not power of two. + * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE, + * otherwise a segment may span two or more areas. It conflicts with free + * contiguous slots tracking: free slots are treated contiguous no matter + * whether they cross an area boundary. + * + * Return true if default_nslabs is rounded up. */ -static unsigned int max_segment; +static bool round_up_default_nslabs(void) +{ + if (!default_nareas) + return false; -static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; + if (default_nslabs < IO_TLB_SEGSIZE * default_nareas) + default_nslabs = IO_TLB_SEGSIZE * default_nareas; + else if (is_power_of_2(default_nslabs)) + return false; + default_nslabs = roundup_pow_of_two(default_nslabs); + return true; +} + +static void swiotlb_adjust_nareas(unsigned int nareas) +{ + /* use a single area when non is specified */ + if (!nareas) + nareas = 1; + else if (!is_power_of_2(nareas)) + nareas = roundup_pow_of_two(nareas); + + default_nareas = nareas; + + pr_info("area num %d.\n", nareas); + if (round_up_default_nslabs()) + pr_info("SWIOTLB bounce buffer size roundup to %luMB", + (default_nslabs << IO_TLB_SHIFT) >> 20); +} static int __init setup_io_tlb_npages(char *str) @@ -86,10 +143,14 @@ setup_io_tlb_npages(char *str) } if (*str == ',') ++str; + if (isdigit(*str)) + swiotlb_adjust_nareas(simple_strtoul(str, &str, 0)); + if (*str == ',') + ++str; if (!strcmp(str, "force")) - swiotlb_force = SWIOTLB_FORCE; + swiotlb_force_bounce = true; else if (!strcmp(str, "noforce")) - swiotlb_force = SWIOTLB_NO_FORCE; + swiotlb_force_disable = true; return 0; } @@ -97,18 +158,12 @@ early_param("swiotlb", setup_io_tlb_npages); unsigned int swiotlb_max_segment(void) { - return io_tlb_default_mem.nslabs ? max_segment : 0; + if (!io_tlb_default_mem.nslabs) + return 0; + return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE); } EXPORT_SYMBOL_GPL(swiotlb_max_segment); -void swiotlb_set_max_segment(unsigned int val) -{ - if (swiotlb_force == SWIOTLB_FORCE) - max_segment = 1; - else - max_segment = rounddown(val, PAGE_SIZE); -} - unsigned long swiotlb_size_or_default(void) { return default_nslabs << IO_TLB_SHIFT; @@ -123,8 +178,11 @@ void __init swiotlb_adjust_size(unsigned long size) */ if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT) return; + size = ALIGN(size, IO_TLB_SIZE); default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); + if (round_up_default_nslabs()) + size = default_nslabs << IO_TLB_SHIFT; pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } @@ -203,7 +261,8 @@ void __init swiotlb_update_mem_attributes(void) } static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, - unsigned long nslabs, bool late_alloc) + unsigned long nslabs, unsigned int flags, + bool late_alloc, unsigned int nareas) { void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; @@ -211,13 +270,18 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->nslabs = nslabs; mem->start = start; mem->end = mem->start + bytes; - mem->index = 0; mem->late_alloc = late_alloc; + mem->nareas = nareas; + mem->area_nslabs = nslabs / mem->nareas; + + mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE); - if (swiotlb_force == SWIOTLB_FORCE) - mem->force_bounce = true; + for (i = 0; i < mem->nareas; i++) { + spin_lock_init(&mem->areas[i].lock); + mem->areas[i].index = 0; + mem->areas[i].used = 0; + } - spin_lock_init(&mem->lock); for (i = 0; i < mem->nslabs; i++) { mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; @@ -236,17 +300,57 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, return; } -int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the DMA API. + */ +void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, + int (*remap)(void *tlb, unsigned long nslabs)) { struct io_tlb_mem *mem = &io_tlb_default_mem; + unsigned long nslabs; size_t alloc_size; + size_t bytes; + void *tlb; - if (swiotlb_force == SWIOTLB_NO_FORCE) - return 0; + if (!addressing_limit && !swiotlb_force_bounce) + return; + if (swiotlb_force_disable) + return; - /* protect against double initialization */ - if (WARN_ON_ONCE(mem->nslabs)) - return -ENOMEM; + /* + * default_nslabs maybe changed when adjust area number. + * So allocate bounce buffer after adjusting area number. + */ + if (!default_nareas) + swiotlb_adjust_nareas(num_possible_cpus()); + + nslabs = default_nslabs; + /* + * By default allocate the bounce buffer memory from low memory, but + * allow to pick a location everywhere for hypervisors with guest + * memory encryption. + */ +retry: + bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); + if (flags & SWIOTLB_ANY) + tlb = memblock_alloc(bytes, PAGE_SIZE); + else + tlb = memblock_alloc_low(bytes, PAGE_SIZE); + if (!tlb) { + pr_warn("%s: failed to allocate tlb structure\n", __func__); + return; + } + + if (remap && remap(tlb, nslabs) < 0) { + memblock_free(tlb, PAGE_ALIGN(bytes)); + + nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); + if (nslabs < IO_TLB_MIN_SLABS) + panic("%s: Failed to remap %zu bytes\n", + __func__, bytes); + goto retry; + } alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); @@ -254,39 +358,21 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); + mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area), + default_nareas), SMP_CACHE_BYTES); + if (!mem->areas) + panic("%s: Failed to allocate mem->areas.\n", __func__); + + swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false, + default_nareas); - if (verbose) + if (flags & SWIOTLB_VERBOSE) swiotlb_print_info(); - swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); - return 0; } -/* - * Statically reserve bounce buffer space and initialize bounce buffer data - * structures for the software IO TLB used to implement the DMA API. - */ -void __init -swiotlb_init(int verbose) +void __init swiotlb_init(bool addressing_limit, unsigned int flags) { - size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT); - void *tlb; - - if (swiotlb_force == SWIOTLB_NO_FORCE) - return; - - /* Get IO TLB memory from the low pages */ - tlb = memblock_alloc_low(bytes, PAGE_SIZE); - if (!tlb) - goto fail; - if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose)) - goto fail_free_mem; - return; - -fail_free_mem: - memblock_free(tlb, bytes); -fail: - pr_warn("Cannot allocate buffer"); + swiotlb_init_remap(addressing_limit, flags, NULL); } /* @@ -294,73 +380,81 @@ fail: * initialize the swiotlb later using the slab allocator if needed. * This should be just like above, but with some error catching. */ -int -swiotlb_late_init_with_default_size(size_t default_size) +int swiotlb_init_late(size_t size, gfp_t gfp_mask, + int (*remap)(void *tlb, unsigned long nslabs)) { - unsigned long nslabs = - ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); - unsigned long bytes; + struct io_tlb_mem *mem = &io_tlb_default_mem; + unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned char *vstart = NULL; - unsigned int order; + unsigned int order, area_order; + bool retried = false; int rc = 0; - if (swiotlb_force == SWIOTLB_NO_FORCE) + if (swiotlb_force_disable) return 0; - /* - * Get IO TLB memory from the low pages - */ +retry: order = get_order(nslabs << IO_TLB_SHIFT); nslabs = SLABS_PER_PAGE << order; - bytes = nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, order); if (vstart) break; order--; + nslabs = SLABS_PER_PAGE << order; + retried = true; } if (!vstart) return -ENOMEM; - if (order != get_order(bytes)) { - pr_warn("only able to allocate %ld MB\n", - (PAGE_SIZE << order) >> 20); - nslabs = SLABS_PER_PAGE << order; - } - rc = swiotlb_late_init_with_tbl(vstart, nslabs); - if (rc) + if (remap) + rc = remap(vstart, nslabs); + if (rc) { free_pages((unsigned long)vstart, order); - return rc; -} + nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); + if (nslabs < IO_TLB_MIN_SLABS) + return rc; + retried = true; + goto retry; + } -int -swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) -{ - struct io_tlb_mem *mem = &io_tlb_default_mem; - unsigned long bytes = nslabs << IO_TLB_SHIFT; + if (retried) { + pr_warn("only able to allocate %ld MB\n", + (PAGE_SIZE << order) >> 20); + } - if (swiotlb_force == SWIOTLB_NO_FORCE) - return 0; + if (!default_nareas) + swiotlb_adjust_nareas(num_possible_cpus()); - /* protect against double initialization */ - if (WARN_ON_ONCE(mem->nslabs)) - return -ENOMEM; + area_order = get_order(array_size(sizeof(*mem->areas), + default_nareas)); + mem->areas = (struct io_tlb_area *) + __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order); + if (!mem->areas) + goto error_area; mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(array_size(sizeof(*mem->slots), nslabs))); if (!mem->slots) - return -ENOMEM; + goto error_slots; - set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); - swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true); + set_memory_decrypted((unsigned long)vstart, + (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); + swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true, + default_nareas); swiotlb_print_info(); - swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; + +error_slots: + free_pages((unsigned long)mem->areas, area_order); +error_area: + free_pages((unsigned long)vstart, order); + return -ENOMEM; } void __init swiotlb_exit(void) @@ -368,6 +462,10 @@ void __init swiotlb_exit(void) struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long tbl_vaddr; size_t tbl_size, slots_size; + unsigned int area_order; + + if (swiotlb_force_bounce) + return; if (!mem->nslabs) return; @@ -379,9 +477,14 @@ void __init swiotlb_exit(void) set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT); if (mem->late_alloc) { + area_order = get_order(array_size(sizeof(*mem->areas), + mem->nareas)); + free_pages((unsigned long)mem->areas, area_order); free_pages(tbl_vaddr, get_order(tbl_size)); free_pages((unsigned long)mem->slots, get_order(slots_size)); } else { + memblock_free_late(__pa(mem->areas), + array_size(sizeof(*mem->areas), mem->nareas)); memblock_free_late(mem->start, tbl_size); memblock_free_late(__pa(mem->slots), slots_size); } @@ -472,7 +575,10 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size } } -#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) +static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx) +{ + return start + (idx << IO_TLB_SHIFT); +} /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. @@ -484,9 +590,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask) return nr_slots(boundary_mask + 1); } -static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) +static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index) { - if (index >= mem->nslabs) + if (index >= mem->area_nslabs) return 0; return index; } @@ -495,10 +601,12 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. */ -static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, - size_t alloc_size, unsigned int alloc_align_mask) +static int swiotlb_do_find_slots(struct device *dev, int area_index, + phys_addr_t orig_addr, size_t alloc_size, + unsigned int alloc_align_mask) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + struct io_tlb_area *area = mem->areas + area_index; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; @@ -509,8 +617,11 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, unsigned int index, wrap, count = 0, i; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned long flags; + unsigned int slot_base; + unsigned int slot_index; BUG_ON(!nslots); + BUG_ON(area_index >= mem->nareas); /* * For mappings with an alignment requirement don't bother looping to @@ -522,16 +633,20 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1); - spin_lock_irqsave(&mem->lock, flags); - if (unlikely(nslots > mem->nslabs - mem->used)) + spin_lock_irqsave(&area->lock, flags); + if (unlikely(nslots > mem->area_nslabs - area->used)) goto not_found; - index = wrap = wrap_index(mem, ALIGN(mem->index, stride)); + slot_base = area_index * mem->area_nslabs; + index = wrap = wrap_area_index(mem, ALIGN(area->index, stride)); + do { + slot_index = slot_base + index; + if (orig_addr && - (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != - (orig_addr & iotlb_align_mask)) { - index = wrap_index(mem, index + 1); + (slot_addr(tbl_dma_addr, slot_index) & + iotlb_align_mask) != (orig_addr & iotlb_align_mask)) { + index = wrap_area_index(mem, index + 1); continue; } @@ -540,26 +655,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, * contiguous buffers, we allocate the buffers from that slot * and mark the entries as '0' indicating unavailable. */ - if (!iommu_is_span_boundary(index, nslots, + if (!iommu_is_span_boundary(slot_index, nslots, nr_slots(tbl_dma_addr), max_slots)) { - if (mem->slots[index].list >= nslots) + if (mem->slots[slot_index].list >= nslots) goto found; } - index = wrap_index(mem, index + stride); + index = wrap_area_index(mem, index + stride); } while (index != wrap); not_found: - spin_unlock_irqrestore(&mem->lock, flags); + spin_unlock_irqrestore(&area->lock, flags); return -1; found: - for (i = index; i < index + nslots; i++) { + for (i = slot_index; i < slot_index + nslots; i++) { mem->slots[i].list = 0; - mem->slots[i].alloc_size = - alloc_size - (offset + ((i - index) << IO_TLB_SHIFT)); + mem->slots[i].alloc_size = alloc_size - (offset + + ((i - slot_index) << IO_TLB_SHIFT)); } - for (i = index - 1; + for (i = slot_index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) mem->slots[i].list = ++count; @@ -567,14 +682,42 @@ found: /* * Update the indices to avoid searching in the next round. */ - if (index + nslots < mem->nslabs) - mem->index = index + nslots; + if (index + nslots < mem->area_nslabs) + area->index = index + nslots; else - mem->index = 0; - mem->used += nslots; + area->index = 0; + area->used += nslots; + spin_unlock_irqrestore(&area->lock, flags); + return slot_index; +} - spin_unlock_irqrestore(&mem->lock, flags); - return index; +static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size, unsigned int alloc_align_mask) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + int start = raw_smp_processor_id() & (mem->nareas - 1); + int i = start, index; + + do { + index = swiotlb_do_find_slots(dev, i, orig_addr, alloc_size, + alloc_align_mask); + if (index >= 0) + return index; + if (++i >= mem->nareas) + i = 0; + } while (i != start); + + return -1; +} + +static unsigned long mem_used(struct io_tlb_mem *mem) +{ + int i; + unsigned long used = 0; + + for (i = 0; i < mem->nareas; i++) + used += mem->areas[i].used; + return used; } phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, @@ -588,7 +731,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, int index; phys_addr_t tlb_addr; - if (!mem) + if (!mem || !mem->nslabs) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) @@ -606,7 +749,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, mem->nslabs, mem->used); + alloc_size, mem->nslabs, mem_used(mem)); return (phys_addr_t)DMA_MAPPING_ERROR; } @@ -621,7 +764,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, /* * When dir == DMA_FROM_DEVICE we could omit the copy from the orig * to the tlb buffer, if we knew for sure the device will - * overwirte the entire current content. But we don't. Thus + * overwrite the entire current content. But we don't. Thus * unconditional bounce may prevent leaking swiotlb content (i.e. * kernel memory) to user-space. */ @@ -636,6 +779,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) unsigned int offset = swiotlb_align_offset(dev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; int nslots = nr_slots(mem->slots[index].alloc_size + offset); + int aindex = index / mem->area_nslabs; + struct io_tlb_area *area = &mem->areas[aindex]; int count, i; /* @@ -644,7 +789,9 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) * While returning the entries to the free list, we merge the entries * with slots below and above the pool being returned. */ - spin_lock_irqsave(&mem->lock, flags); + BUG_ON(aindex >= mem->nareas); + + spin_lock_irqsave(&area->lock, flags); if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) count = mem->slots[index + nslots].list; else @@ -668,8 +815,8 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) mem->slots[i].list = ++count; - mem->used -= nslots; - spin_unlock_irqrestore(&mem->lock, flags); + area->used -= nslots; + spin_unlock_irqrestore(&area->lock, flags); } /* @@ -717,8 +864,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, phys_addr_t swiotlb_addr; dma_addr_t dma_addr; - trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, - swiotlb_force); + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size); swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir, attrs); @@ -743,7 +889,18 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, size_t swiotlb_max_mapping_size(struct device *dev) { - return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; + int min_align_mask = dma_get_min_align_mask(dev); + int min_align = 0; + + /* + * swiotlb_find_slots() skips slots according to + * min align mask. This affects max mapping size. + * Take it into acount here. + */ + if (min_align_mask) + min_align = roundup(min_align_mask, IO_TLB_SIZE); + + return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align; } bool is_swiotlb_active(struct device *dev) @@ -754,6 +911,13 @@ bool is_swiotlb_active(struct device *dev) } EXPORT_SYMBOL_GPL(is_swiotlb_active); +static int io_tlb_used_get(void *data, u64 *val) +{ + *val = mem_used(&io_tlb_default_mem); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n"); + static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, const char *dirname) { @@ -762,7 +926,8 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, return; debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); - debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); + debugfs_create_file("io_tlb_used", 0400, mem->debugfs, NULL, + &fops_io_tlb_used); } static int __init __maybe_unused swiotlb_create_default_debugfs(void) @@ -813,6 +978,9 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, struct io_tlb_mem *mem = rmem->priv; unsigned long nslabs = rmem->size >> IO_TLB_SHIFT; + /* Set Per-device io tlb area to one */ + unsigned int nareas = 1; + /* * Since multiple devices can share the same pool, the private data, * io_tlb_mem struct, will be initialized by the first device attached @@ -829,10 +997,18 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, return -ENOMEM; } + mem->areas = kcalloc(nareas, sizeof(*mem->areas), + GFP_KERNEL); + if (!mem->areas) { + kfree(mem->slots); + kfree(mem); + return -ENOMEM; + } + set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); - swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false); - mem->force_bounce = true; + swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE, + false, nareas); mem->for_alloc = true; rmem->priv = mem; diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e57a224d6b79..063068a9ea9b 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -17,7 +17,7 @@ /* See comment for enter_from_user_mode() in entry-common.h */ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { - arch_check_user_regs(regs); + arch_enter_from_user_mode(regs); lockdep_hardirqs_off(CALLER_ADDR0); CT_WARN_ON(ct_state() != CONTEXT_USER); @@ -126,7 +126,7 @@ static __always_inline void __exit_to_user_mode(void) { instrumentation_begin(); trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); instrumentation_end(); user_enter_irqoff(); @@ -321,7 +321,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) } /* - * If this entry hit the idle task invoke rcu_irq_enter() whether + * If this entry hit the idle task invoke ct_irq_enter() whether * RCU is watching or not. * * Interrupts can nest when the first interrupt invokes softirq @@ -332,12 +332,12 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * not nested into another interrupt. * * Checking for rcu_is_watching() here would prevent the nesting - * interrupt to invoke rcu_irq_enter(). If that nested interrupt is + * interrupt to invoke ct_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interrupt and eventually claim * quiescent state and end grace periods prematurely. * - * Unconditionally invoke rcu_irq_enter() so RCU state stays + * Unconditionally invoke ct_irq_enter() so RCU state stays * consistent. * * TINY_RCU does not support EQS, so let the compiler eliminate @@ -350,7 +350,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * as in irqentry_enter_from_user_mode(). */ lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter(); + ct_irq_enter(); instrumentation_begin(); trace_hardirqs_off_finish(); instrumentation_end(); @@ -392,7 +392,7 @@ DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); void dynamic_irqentry_exit_cond_resched(void) { - if (!static_key_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) + if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) return; raw_irqentry_exit_cond_resched(); } @@ -416,9 +416,9 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) instrumentation_begin(); /* Tell the tracer that IRET will enable interrupts */ trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); instrumentation_end(); - rcu_irq_exit(); + ct_irq_exit(); lockdep_hardirqs_on(CALLER_ADDR0); return; } @@ -436,7 +436,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) * was not watching on entry. */ if (state.exit_rcu) - rcu_irq_exit(); + ct_irq_exit(); } } @@ -449,7 +449,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) __nmi_enter(); lockdep_hardirqs_off(CALLER_ADDR0); lockdep_hardirq_enter(); - rcu_nmi_enter(); + ct_nmi_enter(); instrumentation_begin(); trace_hardirqs_off_finish(); @@ -465,11 +465,11 @@ void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) ftrace_nmi_exit(); if (irq_state.lockdep) { trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); } instrumentation_end(); - rcu_nmi_exit(); + ct_nmi_exit(); lockdep_hardirq_exit(); if (irq_state.lockdep) lockdep_hardirqs_on(CALLER_ADDR0); diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 9d09f489b60e..2e0f75bcb7fd 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -9,12 +9,6 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) int ret; if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - } - - if (ti_work & _TIF_SIGPENDING) { kvm_handle_signal_exit(vcpu); return -EINTR; } diff --git a/kernel/events/core.c b/kernel/events/core.c index cfde994ce61c..ff4bffc502c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -574,8 +574,7 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); + enum event_type_t event_type); static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); @@ -781,7 +780,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, static inline void update_cgrp_time_from_event(struct perf_event *event) { struct perf_cgroup_info *info; - struct perf_cgroup *cgrp; /* * ensure we access cgroup data only when needed and @@ -790,21 +788,19 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current, event->ctx); + info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) { - info = this_cpu_ptr(event->cgrp->info); + if (info->active) __update_cgrp_time(info, perf_clock(), true); - } } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { - struct perf_cgroup *cgrp; + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; @@ -813,10 +809,10 @@ perf_cgroup_set_timestamp(struct task_struct *task, * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ - if (!task || !ctx->nr_cgroups) + if (!cgrp) return; - cgrp = perf_cgroup_from_task(task, ctx); + WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); @@ -828,17 +824,12 @@ perf_cgroup_set_timestamp(struct task_struct *task, static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); -#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ -#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ - /* * reschedule events based on the cgroup constraint of task. - * - * mode SWOUT : schedule out everything - * mode SWIN : schedule in based on cgroup for next */ -static void perf_cgroup_switch(struct task_struct *task, int mode) +static void perf_cgroup_switch(struct task_struct *task) { + struct perf_cgroup *cgrp; struct perf_cpu_context *cpuctx, *tmp; struct list_head *list; unsigned long flags; @@ -849,35 +840,31 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) */ local_irq_save(flags); + cgrp = perf_cgroup_from_task(task, NULL); + list = this_cpu_ptr(&cgrp_cpuctx_list); list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + if (READ_ONCE(cpuctx->cgrp) == cgrp) + continue; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to update_cgrp_time_from_cpuctx() in + * ctx_sched_out() + */ + cpuctx->cgrp = cgrp; + /* + * set cgrp before ctxsw in to allow + * perf_cgroup_set_timestamp() in ctx_sched_in() + * to not have to pass task around + */ + cpu_ctx_sched_in(cpuctx, EVENT_ALL); - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* - * set cgrp before ctxsw in to allow - * event_filter_match() to not have to pass - * task around - * we pass the cpuctx->ctx to perf_cgroup_from_task() - * because cgorup events are only per-cpu - */ - cpuctx->cgrp = perf_cgroup_from_task(task, - &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } @@ -885,58 +872,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_restore(flags); } -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; - - rcu_read_lock(); - /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock - */ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(next, NULL); - - /* - * only schedule out current cgroup events if we know - * that we are switching to a different cgroup. Otherwise, - * do no touch the cgroup events. - */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); - - rcu_read_unlock(); -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; - - rcu_read_lock(); - /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock - */ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(prev, NULL); - - /* - * only need to schedule in cgroup events if we are changing - * cgroup during ctxsw. Cgroup events were not scheduled - * out of ctxsw out if that was not the case. - */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWIN); - - rcu_read_unlock(); -} - static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { @@ -1032,22 +967,10 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); - /* - * Since setting cpuctx->cgrp is conditional on the current @cgrp - * matching the event's cgroup, we must do this for every new event, - * because if the first would mismatch, the second would not try again - * and we would leave cpuctx->cgrp unset. - */ - if (ctx->is_active && !cpuctx->cgrp) { - struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); - - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - cpuctx->cgrp = cgrp; - } - if (ctx->nr_cgroups++) return; + cpuctx->cgrp = perf_cgroup_from_task(current, ctx); list_add(&cpuctx->cgrp_cpuctx_entry, per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); } @@ -1069,9 +992,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c if (--ctx->nr_cgroups) return; - if (ctx->is_active && cpuctx->cgrp) - cpuctx->cgrp = NULL; - + cpuctx->cgrp = NULL; list_del(&cpuctx->cgrp_cpuctx_entry); } @@ -1100,16 +1021,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, { } -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) -{ -} - static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) @@ -1118,13 +1029,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ -} - -static inline void -perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } @@ -1147,6 +1052,10 @@ static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } + +static void perf_cgroup_switch(struct task_struct *task) +{ +} #endif /* @@ -1910,6 +1819,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings) if (event->attr.read_format & PERF_FORMAT_ID) entry += sizeof(u64); + if (event->attr.read_format & PERF_FORMAT_LOST) + entry += sizeof(u64); + if (event->attr.read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); @@ -2713,8 +2625,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); + enum event_type_t event_type); static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, @@ -2730,15 +2641,14 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - struct task_struct *task) + struct perf_event_context *ctx) { - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); } /* @@ -2788,7 +2698,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, else if (ctx_event_type & EVENT_PINNED) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, current); + perf_event_sched_in(cpuctx, task_ctx); perf_pmu_enable(cpuctx->ctx.pmu); } @@ -3011,7 +2921,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (!event_filter_match(event)) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); return; } @@ -3020,7 +2930,7 @@ static void __perf_event_enable(struct perf_event *event, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); return; } @@ -3668,7 +3578,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * cgroup event are system-wide mode only */ if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_out(task, next); + perf_cgroup_switch(next); } /* @@ -3865,8 +3775,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) + enum event_type_t event_type) { int is_active = ctx->is_active; @@ -3878,7 +3787,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (is_active ^ EVENT_TIME) { /* start ctx time */ __update_context_time(ctx, false); - perf_cgroup_set_timestamp(task, ctx); + perf_cgroup_set_timestamp(cpuctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() @@ -3909,12 +3818,11 @@ ctx_sched_in(struct perf_event_context *ctx, } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) + enum event_type_t event_type) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type, task); + ctx_sched_in(ctx, cpuctx, event_type); } static void perf_event_context_sched_in(struct perf_event_context *ctx, @@ -3956,7 +3864,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, task); + perf_event_sched_in(cpuctx, ctx); if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(cpuctx->task_ctx, true); @@ -3984,16 +3892,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, struct perf_event_context *ctx; int ctxn; - /* - * If cgroup events exist on this CPU, then we need to check if we have - * to switch in PMU state; cgroup event are system-wide mode only. - * - * Since cgroup events are CPU events, we must schedule these in before - * we schedule in the task events. - */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); - for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) @@ -4267,7 +4165,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) if (cpu_event) rotate_ctx(&cpuctx->ctx, cpu_event); - perf_event_sched_in(cpuctx, task_ctx, current); + perf_event_sched_in(cpuctx, task_ctx); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -4339,7 +4237,7 @@ static void perf_event_enable_on_exec(int ctxn) clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); } perf_ctx_unlock(cpuctx, ctx); @@ -4362,7 +4260,6 @@ static void perf_event_remove_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *event, *next; - LIST_HEAD(free_list); unsigned long flags; bool modified = false; @@ -4560,7 +4457,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value, *value = local64_read(&event->count); if (enabled || running) { - u64 __enabled, __running, __now;; + u64 __enabled, __running, __now; calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) @@ -5366,11 +5263,15 @@ static int __perf_read_group_add(struct perf_event *leader, values[n++] += perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); } raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -5427,7 +5328,7 @@ static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; - u64 values[4]; + u64 values[5]; int n = 0; values[n++] = __perf_event_read_value(event, &enabled, &running); @@ -5437,6 +5338,8 @@ static int perf_read_one(struct perf_event *event, values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; @@ -6352,17 +6255,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) { + if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } if (!atomic_inc_not_zero(&event->rb->mmap_count)) { /* - * Raced against perf_mmap_close() through - * perf_event_set_output(). Try again, hope for better - * luck. + * Raced against perf_mmap_close(); remove the + * event and try again. */ + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); goto again; } @@ -6533,8 +6436,8 @@ static void perf_sigtrap(struct perf_event *event) if (current->flags & PF_EXITING) return; - force_sig_perf((void __user *)event->pending_addr, - event->attr.type, event->attr.sig_data); + send_sig_perf((void __user *)event->pending_addr, + event->attr.type, event->attr.sig_data); } static void perf_pending_event_disable(struct perf_event *event) @@ -6964,7 +6867,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 enabled, u64 running) { u64 read_format = event->attr.read_format; - u64 values[4]; + u64 values[5]; int n = 0; values[n++] = perf_event_count(event); @@ -6978,6 +6881,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } @@ -6988,9 +6893,16 @@ static void perf_output_read_group(struct perf_output_handle *handle, { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; - u64 values[5]; + unsigned long flags; + u64 values[6]; int n = 0; + /* + * Disabling interrupts avoids all counter scheduling + * (context switches, timer based rotation and IPIs). + */ + local_irq_save(flags); + values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) @@ -7006,6 +6918,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); @@ -7019,9 +6933,13 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } + + local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ @@ -10174,26 +10092,30 @@ static inline bool perf_event_is_tracing(struct perf_event *event) int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { - bool is_kprobe, is_tracepoint, is_syscall_tp; + bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; + is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; + is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); - if (!is_kprobe && !is_tracepoint && !is_syscall_tp) + if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; - if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || + if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; + if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe) + /* only uprobe programs are allowed to be sleepable */ + return -EINVAL; + /* Kprobe override only works for kprobes, not uprobes. */ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + if (prog->kprobe_override && !is_kprobe) return -EINVAL; if (is_tracepoint || is_syscall_tp) { @@ -11635,6 +11557,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->state = PERF_EVENT_STATE_INACTIVE; + if (parent_event) + event->event_caps = parent_event->event_caps; + if (event->attr.sigtrap) atomic_set(&event->event_limit, 1); @@ -11928,14 +11853,25 @@ err_size: goto out; } +static void mutex_lock_double(struct mutex *a, struct mutex *b) +{ + if (b < a) + swap(a, b); + + mutex_lock(a); + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); +} + static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { struct perf_buffer *rb = NULL; int ret = -EINVAL; - if (!output_event) + if (!output_event) { + mutex_lock(&event->mmap_mutex); goto set; + } /* don't allow circular references */ if (event == output_event) @@ -11973,8 +11909,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) event->pmu != output_event->pmu) goto out; + /* + * Hold both mmap_mutex to serialize against perf_mmap_close(). Since + * output_event is already on rb->event_list, and the list iteration + * restarts after every removal, it is guaranteed this new event is + * observed *OR* if output_event is already removed, it's guaranteed we + * observe !rb->mmap_count. + */ + mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: - mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ if (atomic_read(&event->mmap_count)) goto unlock; @@ -11984,6 +11927,12 @@ set: rb = ring_buffer_get(output_event); if (!rb) goto unlock; + + /* did we race against perf_mmap_close() */ + if (!atomic_read(&rb->mmap_count)) { + ring_buffer_put(rb); + goto unlock; + } } ring_buffer_attach(event, rb); @@ -11991,20 +11940,13 @@ set: ret = 0; unlock: mutex_unlock(&event->mmap_mutex); + if (output_event) + mutex_unlock(&output_event->mmap_mutex); out: return ret; } -static void mutex_lock_double(struct mutex *a, struct mutex *b) -{ - if (b < a) - swap(a, b); - - mutex_lock(a); - mutex_lock_nested(b, SINGLE_DEPTH_NESTING); -} - static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; @@ -12319,6 +12261,9 @@ SYSCALL_DEFINE5(perf_event_open, * Do not allow to attach to a group in a different task * or CPU context. If we're moving SW events, we'll fix * this up later, so allow that. + * + * Racy, not holding group_leader->ctx->mutex, see comment with + * perf_event_ctx_lock(). */ if (!move_group && group_leader->ctx != ctx) goto err_context; @@ -12384,6 +12329,7 @@ SYSCALL_DEFINE5(perf_event_open, } else { perf_event_ctx_unlock(group_leader, gctx); move_group = 0; + goto not_move_group; } } @@ -12400,7 +12346,17 @@ SYSCALL_DEFINE5(perf_event_open, } } else { mutex_lock(&ctx->mutex); + + /* + * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx, + * see the group_leader && !move_group test earlier. + */ + if (group_leader && group_leader->ctx != ctx) { + err = -EINVAL; + goto err_locked; + } } +not_move_group: if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH; @@ -13562,7 +13518,7 @@ static int __perf_cgroup_move(void *info) { struct task_struct *task = info; rcu_read_lock(); - perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + perf_cgroup_switch(task); rcu_read_unlock(); return 0; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 082832738c8f..5150d5f84c03 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -116,6 +116,11 @@ static inline int page_order(struct perf_buffer *rb) } #endif +static inline int data_page_nr(struct perf_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} + static inline unsigned long perf_data_size(struct perf_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 52868716ec35..726132039c38 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -172,8 +172,10 @@ __perf_output_begin(struct perf_output_handle *handle, goto out; if (unlikely(rb->paused)) { - if (rb->nr_pages) + if (rb->nr_pages) { local_inc(&rb->lost); + atomic64_inc(&event->lost_samples); + } goto out; } @@ -254,6 +256,7 @@ __perf_output_begin(struct perf_output_handle *handle, fail: local_inc(&rb->lost); + atomic64_inc(&event->lost_samples); perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -859,11 +862,6 @@ void rb_free(struct perf_buffer *rb) } #else -static int data_page_nr(struct perf_buffer *rb) -{ - return rb->nr_pages << page_order(rb); -} - static struct page * __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6418083901d4..2eaa327f8158 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -180,7 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); + page_add_new_anon_rmap(new_page, vma, addr); lru_cache_add_inactive_or_unevictable(new_page, vma); } else /* no new page, just dec_mm_counter for old_page */ @@ -787,10 +787,10 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, struct page *page; /* * Ensure that the page that has the original instruction is populated - * and in page-cache. If ->readpage == NULL it must be shmem_mapping(), + * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ - if (mapping->a_ops->readpage) + if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); @@ -1143,7 +1143,8 @@ static int __uprobe_register(struct inode *inode, loff_t offset, return -EINVAL; /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ - if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping)) + if (!inode->i_mapping->a_ops->read_folio && + !shmem_mapping(inode->i_mapping)) return -EIO; /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) diff --git a/kernel/exit.c b/kernel/exit.c index f072959fcab7..4f7424523bac 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -733,11 +733,29 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif +static void synchronize_group_exit(struct task_struct *tsk, long code) +{ + struct sighand_struct *sighand = tsk->sighand; + struct signal_struct *signal = tsk->signal; + + spin_lock_irq(&sighand->siglock); + signal->quick_threads--; + if ((signal->quick_threads == 0) && + !(signal->flags & SIGNAL_GROUP_EXIT)) { + signal->flags = SIGNAL_GROUP_EXIT; + signal->group_exit_code = code; + signal->group_stop_count = 0; + } + spin_unlock_irq(&sighand->siglock); +} + void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; + synchronize_group_exit(tsk, code); + WARN_ON(tsk->plug); kcov_task_exit(tsk); @@ -766,7 +784,7 @@ void __noreturn do_exit(long code) #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); - exit_itimers(tsk->signal); + exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); @@ -905,7 +923,7 @@ do_group_exit(int exit_code) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; - else if (!thread_group_empty(current)) { + else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); @@ -1051,7 +1069,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * p->signal fields because the whole thread group is dead * and nobody can change them. * - * psig->stats_lock also protects us from our sub-theads + * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. Until * we change k_getrusage()-like users to rely on this lock * we have to take ->siglock as well. diff --git a/kernel/extable.c b/kernel/extable.c index bda5e9761541..71f482581cab 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -114,7 +114,7 @@ int kernel_text_address(unsigned long addr) /* Treat this like an NMI as it can happen anywhere */ if (no_rcu) - rcu_nmi_enter(); + ct_nmi_enter(); if (is_module_text_address(addr)) goto out; @@ -127,7 +127,7 @@ int kernel_text_address(unsigned long addr) ret = 0; out: if (no_rcu) - rcu_nmi_exit(); + ct_nmi_exit(); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index 5f46401f096b..cecab2735935 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -612,9 +612,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = ksm_fork(mm, oldmm); if (retval) goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; + khugepaged_fork(mm, oldmm); prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { @@ -792,6 +790,7 @@ void __mmdrop(struct mm_struct *mm) mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); + mm_pasid_drop(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1045,6 +1044,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif + +#ifdef CONFIG_CPU_SUP_INTEL + tsk->reported_split_lock = 0; +#endif + return tsk; free_stack: @@ -1190,7 +1194,6 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); - mm_pasid_drop(mm); mmdrop(mm); } @@ -1222,6 +1225,7 @@ void mmput_async(struct mm_struct *mm) schedule_work(&mm->async_put_work); } } +EXPORT_SYMBOL_GPL(mmput_async); #endif /** @@ -1689,6 +1693,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; sig->nr_threads = 1; + sig->quick_threads = 1; atomic_set(&sig->live, 1); refcount_set(&sig->sigcnt, 1); @@ -1811,6 +1816,7 @@ static inline void rcu_copy_process(struct task_struct *p) p->trc_reader_nesting = 0; p->trc_reader_special.s = 0; INIT_LIST_HEAD(&p->trc_holdout_list); + INIT_LIST_HEAD(&p->trc_blkd_node); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } @@ -1961,6 +1967,18 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) mutex_unlock(&oom_adj_mutex); } +#ifdef CONFIG_RV +static void rv_task_fork(struct task_struct *p) +{ + int i; + + for (i = 0; i < RV_PER_TASK_MONITORS; i++) + p->rv[i].da_mon.monitoring = false; +} +#else +#define rv_task_fork(p) do {} while (0) +#endif + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1979,7 +1997,7 @@ static __latent_entropy struct task_struct *copy_process( struct task_struct *p; struct multiprocess_signals delayed; struct file *pidfile = NULL; - u64 clone_flags = args->flags; + const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; /* @@ -2068,6 +2086,9 @@ static __latent_entropy struct task_struct *copy_process( p = dup_task_struct(current, node); if (!p) goto fork_out; + p->flags &= ~PF_KTHREAD; + if (args->kthread) + p->flags |= PF_KTHREAD; if (args->io_thread) { /* * Mark us an IO worker, and block any signal that isn't @@ -2157,7 +2178,7 @@ static __latent_entropy struct task_struct *copy_process( p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); - if (p->flags & PF_KTHREAD) { + if (args->kthread) { if (!set_kthread_struct(p)) goto bad_fork_cleanup_delayacct; } @@ -2240,7 +2261,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); + retval = copy_thread(p, args); if (retval) goto bad_fork_cleanup_io; @@ -2393,6 +2414,8 @@ static __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + rv_task_fork(p); + rseq_fork(p, clone_flags); /* Don't start children in a dying pid namespace */ @@ -2438,6 +2461,7 @@ static __latent_entropy struct task_struct *copy_process( __this_cpu_inc(process_counts); } else { current->signal->nr_threads++; + current->signal->quick_threads++; atomic_inc(¤t->signal->live); refcount_inc(¤t->signal->sigcnt); task_join_group_stop(p); @@ -2544,11 +2568,21 @@ static inline void init_idle_pids(struct task_struct *idle) } } +static int idle_dummy(void *dummy) +{ + /* This function is never called */ + return 0; +} + struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { - .flags = CLONE_VM, + .flags = CLONE_VM, + .fn = &idle_dummy, + .fn_arg = NULL, + .kthread = 1, + .idle = 1, }; task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); @@ -2579,8 +2613,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, .io_thread = 1, }; @@ -2684,8 +2718,25 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, + .kthread = 1, + }; + + return kernel_clone(&args); +} + +/* + * Create a user mode thread. + */ +pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct kernel_clone_args args = { + .flags = ((lower_32_bits(flags) | CLONE_VM | + CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .fn = fn, + .fn_arg = arg, }; return kernel_clone(&args); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index c264cbeab71c..b5379c0e6d6d 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -3,6 +3,7 @@ #define _FUTEX_H #include <linux/futex.h> +#include <linux/rtmutex.h> #include <linux/sched/wake_q.h> #ifdef CONFIG_PREEMPT_RT diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index 183b28c32c83..ce2889f12375 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -1005,7 +1005,7 @@ retry_private: rt_mutex_init_waiter(&rt_waiter); /* - * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not + * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not * hold it while doing rt_mutex_start_proxy(), because then it will * include hb->lock in the blocking chain, even through we'll not in * fact hold it while blocking. This will lead it to report -EDEADLK diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 1966a749e0d9..0c78e64f747d 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -74,7 +74,7 @@ fi # of tree builds having stale headers in srctree. Just silence CPIO for now. for f in $dir_list; do find "$f" -name "*.h"; -done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 +done | cpio --quiet -pdu $cpio_dir >/dev/null 2>&1 # Remove comments except SDPX lines find $cpio_dir -type f -print0 | diff --git a/kernel/groups.c b/kernel/groups.c index 787b381c7c00..9aaed2a31073 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -134,13 +134,26 @@ EXPORT_SYMBOL(set_groups); int set_current_groups(struct group_info *group_info) { struct cred *new; + const struct cred *old; + int retval; new = prepare_creds(); if (!new) return -ENOMEM; + old = current_cred(); + set_groups(new, group_info); + + retval = security_task_fix_setgroups(new, old); + if (retval < 0) + goto error; + return commit_creds(new); + +error: + abort_creds(new); + return retval; } EXPORT_SYMBOL(set_current_groups); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 52501e5f7655..bb2354f73ded 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -73,7 +73,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; * hung task is detected: */ unsigned int __read_mostly sysctl_hung_task_panic = - CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) @@ -229,7 +229,7 @@ static long hung_timeout_jiffies(unsigned long last_checked, * Process updating of timeout sysctl */ static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 10929eda9825..db3d174c53d4 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -24,6 +24,7 @@ config GENERIC_IRQ_SHOW_LEVEL # Supports effective affinity mask config GENERIC_IRQ_EFFECTIVE_AFF_MASK + depends on SMP bool # Support for delayed migration from interrupt context @@ -82,6 +83,7 @@ config IRQ_FASTEOI_HIERARCHY_HANDLERS # Generic IRQ IPI support config GENERIC_IRQ_IPI bool + depends on SMP select IRQ_DOMAIN_HIERARCHY # Generic MSI interrupt support diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f7ff8919dc9b..d9a5c1d65a79 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -258,7 +258,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, nodemask_t nodemsk = NODE_MASK_NONE; struct node_vectors *node_vectors; - if (!cpumask_weight(cpu_mask)) + if (cpumask_empty(cpu_mask)) return 0; nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); @@ -269,8 +269,9 @@ static int __irq_build_affinity_masks(unsigned int startvec, */ if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_or(&masks[curvec].mask, &masks[curvec].mask, - node_to_cpumask[n]); + /* Ensure that only CPUs which are in both masks are set */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); if (++curvec == last_affv) curvec = firstvec; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 54af0deb239b..8ac37e8e738a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -188,7 +188,8 @@ enum { #ifdef CONFIG_SMP static int -__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) +__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, + bool force) { struct irq_data *d = irq_desc_get_irq_data(desc); @@ -224,7 +225,8 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) } #else static __always_inline int -__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) +__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, + bool force) { return IRQ_STARTUP_NORMAL; } @@ -252,7 +254,7 @@ static int __irq_startup(struct irq_desc *desc) int irq_startup(struct irq_desc *desc, bool resend, bool force) { struct irq_data *d = irq_desc_get_irq_data(desc); - struct cpumask *aff = irq_data_get_affinity_mask(d); + const struct cpumask *aff = irq_data_get_affinity_mask(d); int ret = 0; desc->depth = 0; @@ -1006,8 +1008,10 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); irq_state_set_disabled(desc); - if (is_chained) + if (is_chained) { desc->action = NULL; + WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc))); + } desc->depth = 1; } desc->handle_irq = handle; @@ -1033,6 +1037,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); desc->action = &chained_action; + WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc))); irq_activate_and_startup(desc, IRQ_RESEND); } } @@ -1513,7 +1518,8 @@ int irq_chip_request_resources_parent(struct irq_data *data) if (data->chip->irq_request_resources) return data->chip->irq_request_resources(data); - return -ENOSYS; + /* no error on missing optional irq_chip::irq_request_resources */ + return 0; } EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent); @@ -1573,17 +1579,12 @@ static struct device *irq_get_parent_device(struct irq_data *data) int irq_chip_pm_get(struct irq_data *data) { struct device *dev = irq_get_parent_device(data); - int retval; + int retval = 0; - if (IS_ENABLED(CONFIG_PM) && dev) { - retval = pm_runtime_get_sync(dev); - if (retval < 0) { - pm_runtime_put_noidle(dev); - return retval; - } - } + if (IS_ENABLED(CONFIG_PM) && dev) + retval = pm_runtime_resume_and_get(dev); - return 0; + return retval; } /** diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 2b43f5f5033d..bbcaac64038e 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -30,7 +30,7 @@ static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state, static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) { struct irq_data *data = irq_desc_get_irq_data(desc); - struct cpumask *msk; + const struct cpumask *msk; msk = irq_data_get_affinity_mask(data); seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk)); @@ -58,6 +58,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI), BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND), + BIT_MASK_DESCR(IRQCHIP_IMMUTABLE), }; static void diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index f0862eb6b506..c653cd31548d 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -431,7 +431,7 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, return 0; } -static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) +void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) { struct irq_data *data = irq_domain_get_irq_data(d, virq); struct irq_domain_chip_generic *dgc = d->gc; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 99cbdf55a8bd..f09c60393e55 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -29,12 +29,14 @@ extern struct irqaction chained_action; * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed * IRQTF_AFFINITY - irq thread is requested to adjust affinity * IRQTF_FORCED_THREAD - irq action is force threaded + * IRQTF_READY - signals that irq thread is ready */ enum { IRQTF_RUNTHREAD, IRQTF_WARNED, IRQTF_AFFINITY, IRQTF_FORCED_THREAD, + IRQTF_READY, }; /* diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 08ce7da3b57c..bbd945bacef0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -115,11 +115,11 @@ free_descs: int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest) { struct irq_data *data = irq_get_irq_data(irq); - struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + const struct cpumask *ipimask; struct irq_domain *domain; unsigned int nr_irqs; - if (!irq || !data || !ipimask) + if (!irq || !data) return -EINVAL; domain = data->domain; @@ -131,7 +131,8 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest) return -EINVAL; } - if (WARN_ON(!cpumask_subset(dest, ipimask))) + ipimask = irq_data_get_affinity_mask(data); + if (!ipimask || WARN_ON(!cpumask_subset(dest, ipimask))) /* * Must be destroying a subset of CPUs to which this IPI * was set up to target @@ -162,12 +163,13 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest) irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) { struct irq_data *data = irq_get_irq_data(irq); - struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + const struct cpumask *ipimask; - if (!data || !ipimask || cpu >= nr_cpu_ids) + if (!data || cpu >= nr_cpu_ids) return INVALID_HWIRQ; - if (!cpumask_test_cpu(cpu, ipimask)) + ipimask = irq_data_get_affinity_mask(data); + if (!ipimask || !cpumask_test_cpu(cpu, ipimask)) return INVALID_HWIRQ; /* @@ -186,7 +188,7 @@ EXPORT_SYMBOL_GPL(ipi_get_hwirq); static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, const struct cpumask *dest, unsigned int cpu) { - struct cpumask *ipimask = irq_data_get_affinity_mask(data); + const struct cpumask *ipimask = irq_data_get_affinity_mask(data); if (!chip || !ipimask) return -EINVAL; diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 0cd02efa3a74..dd76323ea3fd 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -181,7 +181,7 @@ struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode, goto err_free_bitmap; work_ctx->irq_count = num_irqs; - init_irq_work(&work_ctx->work, irq_sim_handle_irq); + work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq); return work_ctx->domain; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 939d21cd55c3..5db0230aa6b5 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -251,7 +251,7 @@ static ssize_t actions_show(struct kobject *kobj, char *p = ""; raw_spin_lock_irq(&desc->lock); - for (action = desc->action; action != NULL; action = action->next) { + for_each_action_of_desc(desc, action) { ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", p, action->name); p = ","; @@ -407,6 +407,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, lockdep_set_class(&desc->lock, &irq_desc_lock_class); mutex_init(&desc->request_mutex); init_rcu_head(&desc->rcu); + init_waitqueue_head(&desc->wait_for_threads); desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); @@ -575,6 +576,7 @@ int __init early_irq_init(void) raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); mutex_init(&desc[i].request_mutex); + init_waitqueue_head(&desc[i].wait_for_threads); desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); @@ -699,7 +701,6 @@ EXPORT_SYMBOL_GPL(generic_handle_irq_safe); */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) { - WARN_ON_ONCE(!in_hardirq()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d5ce96510549..8fe1da9614ee 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -147,7 +147,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int s static atomic_t unknown_domains; if (WARN_ON((size && direct_max) || - (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max))) + (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && direct_max) || + (direct_max && (direct_max != hwirq_max)))) return NULL; domain = kzalloc_node(struct_size(domain, revmap, size), @@ -219,7 +220,6 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int s domain->hwirq_max = hwirq_max; if (direct_max) { - size = direct_max; domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP; } @@ -650,9 +650,9 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) pr_debug("create_direct virq allocation failed\n"); return 0; } - if (virq >= domain->revmap_size) { - pr_err("ERROR: no free irqs available below %i maximum\n", - domain->revmap_size); + if (virq >= domain->hwirq_max) { + pr_err("ERROR: no free irqs available below %lu maximum\n", + domain->hwirq_max); irq_free_desc(virq); return 0; } @@ -906,10 +906,12 @@ struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, return desc; if (irq_domain_is_nomap(domain)) { - if (hwirq < domain->revmap_size) { + if (hwirq < domain->hwirq_max) { data = irq_domain_get_irq_data(domain, hwirq); if (data && data->hwirq == hwirq) desc = irq_data_to_desc(data); + if (irq && desc) + *irq = hwirq; } return desc; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c03f71d5ec10..40fe7806cc8c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -205,16 +205,8 @@ static void irq_validate_effective_affinity(struct irq_data *data) pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", chip->name, data->irq); } - -static inline void irq_init_effective_affinity(struct irq_data *data, - const struct cpumask *mask) -{ - cpumask_copy(irq_data_get_effective_affinity_mask(data), mask); -} #else static inline void irq_validate_effective_affinity(struct irq_data *data) { } -static inline void irq_init_effective_affinity(struct irq_data *data, - const struct cpumask *mask) { } #endif int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, @@ -222,11 +214,16 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, { struct irq_desc *desc = irq_data_to_desc(data); struct irq_chip *chip = irq_data_get_irq_chip(data); + const struct cpumask *prog_mask; int ret; + static DEFINE_RAW_SPINLOCK(tmp_mask_lock); + static struct cpumask tmp_mask; + if (!chip || !chip->irq_set_affinity) return -EINVAL; + raw_spin_lock(&tmp_mask_lock); /* * If this is a managed interrupt and housekeeping is enabled on * it check whether the requested affinity mask intersects with @@ -248,24 +245,34 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, */ if (irqd_affinity_is_managed(data) && housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { - const struct cpumask *hk_mask, *prog_mask; - - static DEFINE_RAW_SPINLOCK(tmp_mask_lock); - static struct cpumask tmp_mask; + const struct cpumask *hk_mask; hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); - raw_spin_lock(&tmp_mask_lock); cpumask_and(&tmp_mask, mask, hk_mask); if (!cpumask_intersects(&tmp_mask, cpu_online_mask)) prog_mask = mask; else prog_mask = &tmp_mask; - ret = chip->irq_set_affinity(data, prog_mask, force); - raw_spin_unlock(&tmp_mask_lock); } else { - ret = chip->irq_set_affinity(data, mask, force); + prog_mask = mask; } + + /* + * Make sure we only provide online CPUs to the irqchip, + * unless we are being asked to force the affinity (in which + * case we do as we are told). + */ + cpumask_and(&tmp_mask, prog_mask, cpu_online_mask); + if (!force && !cpumask_empty(&tmp_mask)) + ret = chip->irq_set_affinity(data, &tmp_mask, force); + else if (force) + ret = chip->irq_set_affinity(data, mask, force); + else + ret = -EINVAL; + + raw_spin_unlock(&tmp_mask_lock); + switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: @@ -332,7 +339,7 @@ static bool irq_set_affinity_deactivated(struct irq_data *data, return false; cpumask_copy(desc->irq_common_data.affinity, mask); - irq_init_effective_affinity(data, mask); + irq_data_update_effective_affinity(data, mask); irqd_set(data, IRQD_AFFINITY_SET); return true; } @@ -1249,6 +1256,31 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) } /* + * Internal function to notify that a interrupt thread is ready. + */ +static void irq_thread_set_ready(struct irq_desc *desc, + struct irqaction *action) +{ + set_bit(IRQTF_READY, &action->thread_flags); + wake_up(&desc->wait_for_threads); +} + +/* + * Internal function to wake up a interrupt thread and wait until it is + * ready. + */ +static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc, + struct irqaction *action) +{ + if (!action || !action->thread) + return; + + wake_up_process(action->thread); + wait_event(desc->wait_for_threads, + test_bit(IRQTF_READY, &action->thread_flags)); +} + +/* * Interrupt handler thread */ static int irq_thread(void *data) @@ -1259,6 +1291,8 @@ static int irq_thread(void *data) irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + irq_thread_set_ready(desc, action); + sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, @@ -1683,8 +1717,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - init_waitqueue_head(&desc->wait_for_threads); - /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, @@ -1780,14 +1812,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irq_setup_timings(desc, new); - /* - * Strictly no need to wake it up, but hung_task complains - * when no hard interrupt wakes the thread up. - */ - if (new->thread) - wake_up_process(new->thread); - if (new->secondary) - wake_up_process(new->secondary->thread); + wake_up_and_wait_for_irq_thread_ready(desc, new); + wake_up_and_wait_for_irq_thread_ready(desc, new->secondary); register_irq_proc(irq, desc); new->dir = NULL; diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index bbfb26489aa1..1698e77645ac 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -286,7 +286,7 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk) int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, unsigned int *mapped_cpu) { - unsigned int bit, cpu, end = m->alloc_end; + unsigned int bit, cpu, end; struct cpumap *cm; if (cpumask_empty(msk)) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2bdfce5edafd..a9ee535293eb 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -818,6 +818,21 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag irqd_clr_can_reserve(irqd); if (vflags & VIRQ_NOMASK_QUIRK) irqd_set_msi_nomask_quirk(irqd); + + /* + * If the interrupt is managed but no CPU is available to + * service it, shut it down until better times. Note that + * we only do this on the !RESERVE path as x86 (the only + * architecture using this flag) deals with this in a + * different way by using a catch-all vector. + */ + if ((vflags & VIRQ_ACTIVATE) && + irqd_affinity_is_managed(irqd) && + !cpumask_intersects(irq_data_get_affinity_mask(irqd), + cpu_online_mask)) { + irqd_set_managed_shutdown(irqd); + return 0; + } } if (!(vflags & VIRQ_ACTIVATE)) diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index ca71123a6130..c556bc49d213 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -147,7 +147,6 @@ void suspend_device_irqs(void) synchronize_irq(irq); } } -EXPORT_SYMBOL_GPL(suspend_device_irqs); static void resume_irq(struct irq_desc *desc) { @@ -259,4 +258,3 @@ void resume_device_irqs(void) { resume_irqs(false); } -EXPORT_SYMBOL_GPL(resume_device_irqs); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index f7df715ec28e..7afa40fe5cc4 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -137,7 +137,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (!irq_work_claim(work)) return false; - kasan_record_aux_stack(work); + kasan_record_aux_stack_noalloc(work); preempt_disable(); if (cpu != smp_processor_id()) { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index b156e152d6b4..714ac4c3b556 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -332,17 +332,13 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, return 0; } -/* - * Update code which is definitely not currently executing. - * Architectures which need heavyweight synchronization to modify - * running code can override this to make the non-live update case - * cheaper. - */ -void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) +#ifndef arch_jump_label_transform_static +static void arch_jump_label_transform_static(struct jump_entry *entry, + enum jump_label_type type) { - arch_jump_label_transform(entry, type); + /* nothing to do on most architectures */ } +#endif static inline struct jump_entry *static_key_entries(struct static_key *key) { @@ -508,7 +504,7 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES -static enum jump_label_type jump_label_init_type(struct jump_entry *entry) +enum jump_label_type jump_label_init_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool type = static_key_type(key); @@ -596,31 +592,6 @@ static void __jump_label_mod_update(struct static_key *key) } } -/*** - * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() - * @mod: module to patch - * - * Allow for run-time selection of the optimal nops. Before the module - * loads patch these with arch_get_jump_label_nop(), which is specified by - * the arch specific jump label code. - */ -void jump_label_apply_nops(struct module *mod) -{ - struct jump_entry *iter_start = mod->jump_entries; - struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; - struct jump_entry *iter; - - /* if the module doesn't have jump label entries, just return */ - if (iter_start == iter_stop) - return; - - for (iter = iter_start; iter < iter_stop; iter++) { - /* Only write NOPs for arch_branch_static(). */ - if (jump_label_init_type(iter) == JUMP_LABEL_NOP) - arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); - } -} - static int jump_label_add_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 79f2eb617a62..60c20f301a6b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -29,29 +29,10 @@ #include <linux/compiler.h> #include <linux/module.h> #include <linux/kernel.h> +#include <linux/bsearch.h> +#include <linux/btf_ids.h> -/* - * These will be re-linked against their real values - * during the second link stage. - */ -extern const unsigned long kallsyms_addresses[] __weak; -extern const int kallsyms_offsets[] __weak; -extern const u8 kallsyms_names[] __weak; - -/* - * Tell the compiler that the count isn't in the small data section if the arch - * has one (eg: FRV). - */ -extern const unsigned int kallsyms_num_syms -__section(".rodata") __attribute__((weak)); - -extern const unsigned long kallsyms_relative_base -__section(".rodata") __attribute__((weak)); - -extern const char kallsyms_token_table[] __weak; -extern const u16 kallsyms_token_index[] __weak; - -extern const unsigned int kallsyms_markers[] __weak; +#include "kallsyms_internal.h" /* * Expand a compressed symbol data into the resulting uncompressed string, @@ -69,12 +50,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, data = &kallsyms_names[off]; len = *data; data++; + off++; + + /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */ + if ((len & 0x80) != 0) { + len = (len & 0x7F) | (*data << 7); + data++; + off++; + } /* * Update the offset to return the offset for the next symbol on * the compressed stream. */ - off += len + 1; + off += len; /* * For every byte on the compressed symbol data, copy the table @@ -127,7 +116,7 @@ static char kallsyms_get_symbol_type(unsigned int off) static unsigned int get_symbol_offset(unsigned long pos) { const u8 *name; - int i; + int i, len; /* * Use the closest marker we have. We have markers every 256 positions, @@ -141,8 +130,18 @@ static unsigned int get_symbol_offset(unsigned long pos) * so we just need to add the len to the current pointer for every * symbol we wish to skip. */ - for (i = 0; i < (pos & 0xFF); i++) - name = name + (*name) + 1; + for (i = 0; i < (pos & 0xFF); i++) { + len = *name; + + /* + * If MSB is 1, it is a "big" symbol, so we need to look into + * the next byte (and skip it, too). + */ + if ((len & 0x80) != 0) + len = ((len & 0x7F) | (name[1] << 7)) + 1; + + name = name + len + 1; + } return name - kallsyms_names; } @@ -178,7 +177,6 @@ static bool cleanup_symbol_name(char *s) * character in an identifier in C. Suffixes observed: * - foo.llvm.[0-9a-f]+ * - foo.[0-9a-f]+ - * - foo.[0-9a-f]+.cfi_jt */ res = strchr(s, '.'); if (res) { @@ -186,22 +184,6 @@ static bool cleanup_symbol_name(char *s) return true; } - if (!IS_ENABLED(CONFIG_CFI_CLANG) || - !IS_ENABLED(CONFIG_LTO_CLANG_THIN) || - CONFIG_CLANG_VERSION >= 130000) - return false; - - /* - * Prior to LLVM 13, the following suffixes were observed when thinLTO - * and CFI are both enabled: - * - foo$[0-9]+ - */ - res = strrchr(s, '$'); - if (res) { - *res = '\0'; - return true; - } - return false; } @@ -228,7 +210,6 @@ unsigned long kallsyms_lookup_name(const char *name) return module_kallsyms_lookup_name(name); } -#ifdef CONFIG_LIVEPATCH /* * Iterate over all symbols in vmlinux. For symbols from modules use * module_kallsyms_on_each_symbol instead. @@ -251,7 +232,6 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, } return 0; } -#endif /* CONFIG_LIVEPATCH */ static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, @@ -800,6 +780,96 @@ static const struct seq_operations kallsyms_op = { .show = s_show }; +#ifdef CONFIG_BPF_SYSCALL + +struct bpf_iter__ksym { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct kallsym_iter *, ksym); +}; + +static int ksym_prog_seq_show(struct seq_file *m, bool in_stop) +{ + struct bpf_iter__ksym ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = m; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.ksym = m ? m->private : NULL; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_ksym_seq_show(struct seq_file *m, void *p) +{ + return ksym_prog_seq_show(m, false); +} + +static void bpf_iter_ksym_seq_stop(struct seq_file *m, void *p) +{ + if (!p) + (void) ksym_prog_seq_show(m, true); + else + s_stop(m, p); +} + +static const struct seq_operations bpf_iter_ksym_ops = { + .start = s_start, + .next = s_next, + .stop = bpf_iter_ksym_seq_stop, + .show = bpf_iter_ksym_seq_show, +}; + +static int bpf_iter_ksym_init(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct kallsym_iter *iter = priv_data; + + reset_iter(iter, 0); + + /* cache here as in kallsyms_open() case; use current process + * credentials to tell BPF iterators if values should be shown. + */ + iter->show_value = kallsyms_show_value(current_cred()); + + return 0; +} + +DEFINE_BPF_ITER_FUNC(ksym, struct bpf_iter_meta *meta, struct kallsym_iter *ksym) + +static const struct bpf_iter_seq_info ksym_iter_seq_info = { + .seq_ops = &bpf_iter_ksym_ops, + .init_seq_private = bpf_iter_ksym_init, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct kallsym_iter), +}; + +static struct bpf_iter_reg ksym_iter_reg_info = { + .target = "ksym", + .feature = BPF_ITER_RESCHED, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__ksym, ksym), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &ksym_iter_seq_info, +}; + +BTF_ID_LIST(btf_ksym_iter_id) +BTF_ID(struct, kallsym_iter) + +static int __init bpf_ksym_iter_register(void) +{ + ksym_iter_reg_info.ctx_arg_info[0].btf_id = *btf_ksym_iter_id; + return bpf_iter_reg_target(&ksym_iter_reg_info); +} + +late_initcall(bpf_ksym_iter_register); + +#endif /* CONFIG_BPF_SYSCALL */ + static inline int kallsyms_for_perf(void) { #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h new file mode 100644 index 000000000000..2d0c6f2f0243 --- /dev/null +++ b/kernel/kallsyms_internal.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef LINUX_KALLSYMS_INTERNAL_H_ +#define LINUX_KALLSYMS_INTERNAL_H_ + +#include <linux/types.h> + +/* + * These will be re-linked against their real values + * during the second link stage. + */ +extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; +extern const u8 kallsyms_names[] __weak; + +/* + * Tell the compiler that the count isn't in the small data section if the arch + * has one (eg: FRV). + */ +extern const unsigned int kallsyms_num_syms +__section(".rodata") __attribute__((weak)); + +extern const unsigned long kallsyms_relative_base +__section(".rodata") __attribute__((weak)); + +extern const char kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; + +extern const unsigned int kallsyms_markers[] __weak; + +#endif // LINUX_KALLSYMS_INTERNAL_H_ diff --git a/kernel/kcov.c b/kernel/kcov.c index 475524bd900a..e19c84b02452 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -204,8 +204,16 @@ void notrace __sanitizer_cov_trace_pc(void) /* The first 64-bit word is the number of subsequent PCs. */ pos = READ_ONCE(area[0]) + 1; if (likely(pos < t->kcov_size)) { - area[pos] = ip; + /* Previously we write pc before updating pos. However, some + * early interrupt code could bypass check_kcov_mode() check + * and invoke __sanitizer_cov_trace_pc(). If such interrupt is + * raised between writing pc and updating pos, the pc could be + * overitten by the recursive __sanitizer_cov_trace_pc(). + * Update pos before writing pc to avoid such interleaving. + */ WRITE_ONCE(area[0], pos); + barrier(); + area[pos] = ip; } } EXPORT_SYMBOL(__sanitizer_cov_trace_pc); @@ -236,11 +244,13 @@ static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) start_index = 1 + count * KCOV_WORDS_PER_CMP; end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64); if (likely(end_pos <= max_pos)) { + /* See comment in __sanitizer_cov_trace_pc(). */ + WRITE_ONCE(area[0], count + 1); + barrier(); area[start_index] = type; area[start_index + 1] = arg1; area[start_index + 2] = arg2; area[start_index + 3] = ip; - WRITE_ONCE(area[0], count + 1); } } @@ -475,8 +485,11 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) vma->vm_flags |= VM_DONTEXPAND; for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); - if (vm_insert_page(vma, vma->vm_start + off, page)) - WARN_ONCE(1, "vm_insert_page() failed"); + res = vm_insert_page(vma, vma->vm_start + off, page); + if (res) { + pr_warn_once("kcov: vm_insert_page() failed\n"); + return res; + } } return 0; exit: diff --git a/kernel/kcsan/.kunitconfig b/kernel/kcsan/.kunitconfig new file mode 100644 index 000000000000..e82f0f52ab0a --- /dev/null +++ b/kernel/kcsan/.kunitconfig @@ -0,0 +1,24 @@ +# Note that the KCSAN tests need to run on an SMP setup. +# Under kunit_tool, this can be done by using the --qemu_args +# option to configure a machine with several cores. For example: +# ./tools/testing/kunit/kunit.py run --kunitconfig=kernel/kcsan \ +# --arch=x86_64 --qemu_args="-smp 8" + +CONFIG_KUNIT=y + +CONFIG_DEBUG_KERNEL=y + +# Need some level of concurrency to test a concurrency sanitizer. +CONFIG_SMP=y + +CONFIG_KCSAN=y +CONFIG_KCSAN_KUNIT_TEST=y + +# Set these if you want to run test_barrier_nothreads +#CONFIG_KCSAN_STRICT=y +#CONFIG_KCSAN_WEAK_MEMORY=y + +# This prevents the test from timing out on many setups. Feel free to remove +# (or alter) this, in conjunction with setting a different test timeout with, +# for example, the --timeout kunit_tool option. +CONFIG_KCSAN_REPORT_ONCE_IN_MS=100 diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index a36fca063a73..dcec1b743c69 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1380,13 +1380,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc) else nthreads *= 2; - if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { + if (!preempt_model_preemptible() || + !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { /* * Without any preemption, keep 2 CPUs free for other tasks, one * of which is the main test case function checking for * completion or failure. */ - const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0; + const long min_unused_cpus = preempt_model_none() ? 2 : 0; const long min_required_cpus = 2 + min_unused_cpus; if (num_online_cpus() < min_required_cpus) { @@ -1565,14 +1566,6 @@ static void test_exit(struct kunit *test) torture_cleanup_end(); } -static struct kunit_suite kcsan_test_suite = { - .name = "kcsan", - .test_cases = kcsan_test_cases, - .init = test_init, - .exit = test_exit, -}; -static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL }; - __no_kcsan static void register_tracepoints(struct tracepoint *tp, void *ignore) { @@ -1588,11 +1581,7 @@ static void unregister_tracepoints(struct tracepoint *tp, void *ignore) tracepoint_probe_unregister(tp, probe_console, NULL); } -/* - * We only want to do tracepoints setup and teardown once, therefore we have to - * customize the init and exit functions and cannot rely on kunit_test_suite(). - */ -static int __init kcsan_test_init(void) +static int kcsan_suite_init(struct kunit_suite *suite) { /* * Because we want to be able to build the test as a module, we need to @@ -1600,18 +1589,25 @@ static int __init kcsan_test_init(void) * won't work here. */ for_each_kernel_tracepoint(register_tracepoints, NULL); - return __kunit_test_suites_init(kcsan_test_suites); + return 0; } -static void kcsan_test_exit(void) +static void kcsan_suite_exit(struct kunit_suite *suite) { - __kunit_test_suites_exit(kcsan_test_suites); for_each_kernel_tracepoint(unregister_tracepoints, NULL); tracepoint_synchronize_unregister(); } -late_initcall_sync(kcsan_test_init); -module_exit(kcsan_test_exit); +static struct kunit_suite kcsan_test_suite = { + .name = "kcsan", + .test_cases = kcsan_test_cases, + .init = test_init, + .exit = test_exit, + .suite_init = kcsan_suite_init, + .suite_exit = kcsan_suite_exit, +}; + +kunit_test_suites(&kcsan_test_suite); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 68480f731192..acd029b307e4 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -591,11 +591,6 @@ static void kimage_free_extra_pages(struct kimage *image) } -int __weak machine_kexec_post_load(struct kimage *image) -{ - return 0; -} - void kimage_terminate(struct kimage *image) { if (*image->entry != 0) @@ -768,7 +763,6 @@ static struct page *kimage_alloc_page(struct kimage *image, kimage_free_pages(old_page); continue; } - addr = old_addr; page = old_page; break; } @@ -788,7 +782,6 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; - result = 0; if (image->file_mode) kbuf = segment->kbuf; else @@ -936,6 +929,28 @@ int kimage_load_segment(struct kimage *image, struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; +#ifdef CONFIG_SYSCTL +static struct ctl_table kexec_core_sysctls[] = { + { + .procname = "kexec_load_disabled", + .data = &kexec_load_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static int __init kexec_core_sysctl_init(void) +{ + register_sysctl_init("kernel", kexec_core_sysctls); + return 0; +} +late_initcall(kexec_core_sysctl_init); +#endif /* * No panic_cpu check version of crash_kexec(). This function is called @@ -1000,15 +1015,6 @@ size_t crash_get_memory_size(void) return size; } -void __weak crash_free_reserved_phys_range(unsigned long begin, - unsigned long end) -{ - unsigned long addr; - - for (addr = begin; addr < end; addr += PAGE_SIZE) - free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); -} - int crash_shrink_memory(unsigned long new_size) { int ret = 0; @@ -1078,7 +1084,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.common.pr_pid = current->pid; - elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + elf_core_copy_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); @@ -1205,16 +1211,3 @@ int kernel_kexec(void) mutex_unlock(&kexec_mutex); return error; } - -/* - * Protection mechanism for crashkernel reserved memory after - * the kdump kernel is loaded. - * - * Provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_kexec_protect_crashkres(void) -{} - -void __weak arch_kexec_unprotect_crashkres(void) -{} diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 8347fc158d2b..1d546dc97c50 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -29,8 +29,20 @@ #include <linux/vmalloc.h> #include "kexec_internal.h" +#ifdef CONFIG_KEXEC_SIG +static bool sig_enforce = IS_ENABLED(CONFIG_KEXEC_SIG_FORCE); + +void set_kexec_sig_enforced(void) +{ + sig_enforce = true; +} +#endif + static int kexec_calculate_store_digests(struct kimage *image); +/* Maximum size in bytes for kernel/initrd files. */ +#define KEXEC_FILE_SIZE_MAX min_t(s64, 4LL << 30, SSIZE_MAX) + /* * Currently this is the only default function that is exported as some * architectures need it to do additional handlings. @@ -53,14 +65,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf, return ret; } -/* Architectures can provide this probe function */ -int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - return kexec_image_probe_default(image, buf, buf_len); -} - -static void *kexec_image_load_default(struct kimage *image) +void *kexec_image_load_default(struct kimage *image) { if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); @@ -71,11 +76,6 @@ static void *kexec_image_load_default(struct kimage *image) image->cmdline_buf_len); } -void * __weak arch_kexec_kernel_image_load(struct kimage *image) -{ - return kexec_image_load_default(image); -} - int kexec_image_post_load_cleanup_default(struct kimage *image) { if (!image->fops || !image->fops->cleanup) @@ -84,64 +84,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image) return image->fops->cleanup(image->image_loader_data); } -int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) -{ - return kexec_image_post_load_cleanup_default(image); -} - -#ifdef CONFIG_KEXEC_SIG -static int kexec_image_verify_sig_default(struct kimage *image, void *buf, - unsigned long buf_len) -{ - if (!image->fops || !image->fops->verify_sig) { - pr_debug("kernel loader does not support signature verification.\n"); - return -EKEYREJECTED; - } - - return image->fops->verify_sig(buf, buf_len); -} - -int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, - unsigned long buf_len) -{ - return kexec_image_verify_sig_default(image, buf, buf_len); -} -#endif - -/* - * arch_kexec_apply_relocations_add - apply relocations of type RELA - * @pi: Purgatory to be relocated. - * @section: Section relocations applying to. - * @relsec: Section containing RELAs. - * @symtab: Corresponding symtab. - * - * Return: 0 on success, negative errno on error. - */ -int __weak -arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, - const Elf_Shdr *relsec, const Elf_Shdr *symtab) -{ - pr_err("RELA relocation unsupported.\n"); - return -ENOEXEC; -} - -/* - * arch_kexec_apply_relocations - apply relocations of type REL - * @pi: Purgatory to be relocated. - * @section: Section relocations applying to. - * @relsec: Section containing RELs. - * @symtab: Corresponding symtab. - * - * Return: 0 on success, negative errno on error. - */ -int __weak -arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, - const Elf_Shdr *relsec, const Elf_Shdr *symtab) -{ - pr_err("REL relocation unsupported.\n"); - return -ENOEXEC; -} - /* * Free up memory used by kernel, initrd, and command line. This is temporary * memory allocation which is not needed any more after these buffers have @@ -184,16 +126,44 @@ void kimage_file_post_load_cleanup(struct kimage *image) } #ifdef CONFIG_KEXEC_SIG +#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION +int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len) +{ + int ret; + + ret = verify_pefile_signature(kernel, kernel_len, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_KEXEC_PE_SIGNATURE); + if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) { + ret = verify_pefile_signature(kernel, kernel_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_KEXEC_PE_SIGNATURE); + } + return ret; +} +#endif + +static int kexec_image_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification.\n"); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(buf, buf_len); +} + static int kimage_validate_signature(struct kimage *image) { int ret; - ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, - image->kernel_buf_len); + ret = kexec_image_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); if (ret) { - if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { + if (sig_enforce) { pr_notice("Enforced kernel signature verification failed (%d).\n", ret); return ret; } @@ -223,11 +193,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, const char __user *cmdline_ptr, unsigned long cmdline_len, unsigned flags) { - int ret; + ssize_t ret; void *ldata; ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf, - INT_MAX, NULL, READING_KEXEC_IMAGE); + KEXEC_FILE_SIZE_MAX, NULL, + READING_KEXEC_IMAGE); if (ret < 0) return ret; image->kernel_buf_len = ret; @@ -247,7 +218,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf, - INT_MAX, NULL, + KEXEC_FILE_SIZE_MAX, NULL, READING_KEXEC_INITRAMFS); if (ret < 0) goto out; @@ -647,19 +618,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) } /** - * arch_kexec_locate_mem_hole - Find free memory to place the segments. - * @kbuf: Parameters for the memory search. - * - * On success, kbuf->mem will have the start address of the memory region found. - * - * Return: 0 on success, negative errno on error. - */ -int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) -{ - return kexec_locate_mem_hole(kbuf); -} - -/** * kexec_add_buffer - place a buffer in a kexec segment * @kbuf: Buffer contents and memory parameters. * @@ -1260,7 +1218,7 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } -int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, +int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { Elf64_Ehdr *ehdr; @@ -1324,7 +1282,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, phdr++; /* Prepare PT_LOAD type program header for kernel text region */ - if (kernel_map) { + if (need_kernel_map) { phdr->p_type = PT_LOAD; phdr->p_flags = PF_R|PF_W|PF_X; phdr->p_vaddr = (unsigned long) _text; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index dbe57df2e199..3220b0a2fb4a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1257,79 +1257,6 @@ void kprobe_busy_end(void) preempt_enable(); } -#if !defined(CONFIG_KRETPROBE_ON_RETHOOK) -static void free_rp_inst_rcu(struct rcu_head *head) -{ - struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); - - if (refcount_dec_and_test(&ri->rph->ref)) - kfree(ri->rph); - kfree(ri); -} -NOKPROBE_SYMBOL(free_rp_inst_rcu); - -static void recycle_rp_inst(struct kretprobe_instance *ri) -{ - struct kretprobe *rp = get_kretprobe(ri); - - if (likely(rp)) - freelist_add(&ri->freelist, &rp->freelist); - else - call_rcu(&ri->rcu, free_rp_inst_rcu); -} -NOKPROBE_SYMBOL(recycle_rp_inst); - -/* - * This function is called from delayed_put_task_struct() when a task is - * dead and cleaned up to recycle any kretprobe instances associated with - * this task. These left over instances represent probed functions that - * have been called but will never return. - */ -void kprobe_flush_task(struct task_struct *tk) -{ - struct kretprobe_instance *ri; - struct llist_node *node; - - /* Early boot, not yet initialized. */ - if (unlikely(!kprobes_initialized)) - return; - - kprobe_busy_begin(); - - node = __llist_del_all(&tk->kretprobe_instances); - while (node) { - ri = container_of(node, struct kretprobe_instance, llist); - node = node->next; - - recycle_rp_inst(ri); - } - - kprobe_busy_end(); -} -NOKPROBE_SYMBOL(kprobe_flush_task); - -static inline void free_rp_inst(struct kretprobe *rp) -{ - struct kretprobe_instance *ri; - struct freelist_node *node; - int count = 0; - - node = rp->freelist.head; - while (node) { - ri = container_of(node, struct kretprobe_instance, freelist); - node = node->next; - - kfree(ri); - count++; - } - - if (refcount_sub_and_test(count, &rp->rph->ref)) { - kfree(rp->rph); - rp->rph = NULL; - } -} -#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */ - /* Add the new probe to 'ap->list'. */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { @@ -1633,7 +1560,9 @@ static int check_kprobe_address_safe(struct kprobe *p, preempt_disable(); /* Ensure it is not in reserved area nor out of text */ - if (!kernel_text_address((unsigned long) p->addr) || + if (!(core_kernel_text((unsigned long) p->addr) || + is_module_text_address((unsigned long) p->addr)) || + in_gate_area_no_mm((unsigned long) p->addr) || within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || @@ -1678,9 +1607,10 @@ int register_kprobe(struct kprobe *p) struct kprobe *old_p; struct module *probed_mod; kprobe_opcode_t *addr; + bool on_func_entry; /* Adjust probe address from symbol */ - addr = kprobe_addr(p); + addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry); if (IS_ERR(addr)) return PTR_ERR(addr); p->addr = addr; @@ -1700,6 +1630,9 @@ int register_kprobe(struct kprobe *p) mutex_lock(&kprobe_mutex); + if (on_func_entry) + p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY; + old_p = get_kprobe(p->addr); if (old_p) { /* Since this may unoptimize 'old_p', locking 'text_mutex'. */ @@ -1779,11 +1712,12 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { /* - * If 'kprobes_all_disarmed' is set, 'orig_p' - * should have already been disarmed, so - * skip unneed disarming process. + * Don't be lazy here. Even if 'kprobes_all_disarmed' + * is false, 'orig_p' might not have been armed yet. + * Note arm_all_kprobes() __tries__ to arm all kprobes + * on the best effort basis. */ - if (!kprobes_all_disarmed) { + if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) { ret = disarm_kprobe(orig_p, true); if (ret) { p->flags &= ~KPROBE_FLAG_DISABLED; @@ -1928,6 +1862,77 @@ static struct notifier_block kprobe_exceptions_nb = { #ifdef CONFIG_KRETPROBES #if !defined(CONFIG_KRETPROBE_ON_RETHOOK) +static void free_rp_inst_rcu(struct rcu_head *head) +{ + struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); + + if (refcount_dec_and_test(&ri->rph->ref)) + kfree(ri->rph); + kfree(ri); +} +NOKPROBE_SYMBOL(free_rp_inst_rcu); + +static void recycle_rp_inst(struct kretprobe_instance *ri) +{ + struct kretprobe *rp = get_kretprobe(ri); + + if (likely(rp)) + freelist_add(&ri->freelist, &rp->freelist); + else + call_rcu(&ri->rcu, free_rp_inst_rcu); +} +NOKPROBE_SYMBOL(recycle_rp_inst); + +/* + * This function is called from delayed_put_task_struct() when a task is + * dead and cleaned up to recycle any kretprobe instances associated with + * this task. These left over instances represent probed functions that + * have been called but will never return. + */ +void kprobe_flush_task(struct task_struct *tk) +{ + struct kretprobe_instance *ri; + struct llist_node *node; + + /* Early boot, not yet initialized. */ + if (unlikely(!kprobes_initialized)) + return; + + kprobe_busy_begin(); + + node = __llist_del_all(&tk->kretprobe_instances); + while (node) { + ri = container_of(node, struct kretprobe_instance, llist); + node = node->next; + + recycle_rp_inst(ri); + } + + kprobe_busy_end(); +} +NOKPROBE_SYMBOL(kprobe_flush_task); + +static inline void free_rp_inst(struct kretprobe *rp) +{ + struct kretprobe_instance *ri; + struct freelist_node *node; + int count = 0; + + node = rp->freelist.head; + while (node) { + ri = container_of(node, struct kretprobe_instance, freelist); + node = node->next; + + kfree(ri); + count++; + } + + if (refcount_sub_and_test(count, &rp->rph->ref)) { + kfree(rp->rph); + rp->rph = NULL; + } +} + /* This assumes the 'tsk' is the current task or the is not running. */ static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk, struct llist_node **cur) @@ -2126,7 +2131,7 @@ static void kretprobe_rethook_handler(struct rethook_node *rh, void *data, struct kprobe_ctlblk *kcb; /* The data must NOT be null. This means rethook data structure is broken. */ - if (WARN_ON_ONCE(!data)) + if (WARN_ON_ONCE(!data) || !rp->handler) return; __this_cpu_write(current_kprobe, &rp->kp); diff --git a/kernel/kthread.c b/kernel/kthread.c index 50265f69a135..f97fd01a2932 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -340,7 +340,7 @@ static int kthread(void *_create) self = to_kthread(current); - /* If user was SIGKILLed, I release the structure. */ + /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create); @@ -398,7 +398,7 @@ static void create_kthread(struct kthread_create_info *create) /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { - /* If user was SIGKILLed, I release the structure. */ + /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); if (!done) { @@ -440,9 +440,9 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), */ if (unlikely(wait_for_completion_killable(&done))) { /* - * If I was SIGKILLed before kthreadd (or new kernel thread) - * calls complete(), leave the cleanup of this structure to - * that thread. + * If I was killed by a fatal signal before kthreadd (or new + * kernel thread) calls complete(), leave the cleanup of this + * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); @@ -704,6 +704,7 @@ int kthread_stop(struct task_struct *k) kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); + set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; @@ -876,7 +877,7 @@ fail_task: * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) - * when the worker was SIGKILLed. + * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...) @@ -925,7 +926,7 @@ EXPORT_SYMBOL(kthread_create_worker); * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) - * when the worker was SIGKILLed. + * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, @@ -1050,8 +1051,7 @@ static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; - WARN_ON_FUNCTION_MISMATCH(timer->function, - kthread_delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for @@ -1522,5 +1522,4 @@ struct cgroup_subsys_state *kthread_blkcg(void) } return NULL; } -EXPORT_SYMBOL(kthread_blkcg); #endif diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 166d7bf49666..76166df011a4 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -55,6 +55,7 @@ #include <linux/sched/stat.h> #include <linux/list.h> #include <linux/stacktrace.h> +#include <linux/sysctl.h> static DEFINE_RAW_SPINLOCK(latency_lock); @@ -63,6 +64,31 @@ static struct latency_record latency_record[MAXLR]; int latencytop_enabled; +#ifdef CONFIG_SYSCTL +static int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} + +static struct ctl_table latencytop_sysctl[] = { + { + .procname = "latencytop", + .data = &latencytop_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_latencytop, + }, + {} +}; +#endif + void clear_tsk_latency_tracing(struct task_struct *p) { unsigned long flags; @@ -266,18 +292,9 @@ static const struct proc_ops lstats_proc_ops = { static int __init init_lstats_procfs(void) { proc_create("latency_stats", 0644, NULL, &lstats_proc_ops); +#ifdef CONFIG_SYSCTL + register_sysctl_init("kernel", latencytop_sysctl); +#endif return 0; } - -int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) -{ - int err; - - err = proc_dointvec(table, write, buffer, lenp, ppos); - if (latencytop_enabled) - force_schedstat_enabled(); - - return err; -} device_initcall(init_lstats_procfs); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index bc475e62279d..ec06ce59d728 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -213,7 +213,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, * we use the smallest/strictest upper bound possible (56, based on * the current definition of MODULE_NAME_LEN) to prevent overflows. */ - BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); + BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 512); relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ @@ -227,7 +227,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, - ".klp.sym.%55[^.].%127[^,],%lu", + ".klp.sym.%55[^.].%511[^,],%lu", sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index c172bf92b576..4c4f5a776d80 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - klp_arch_set_pc(fregs, (unsigned long)func->new_func); + ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func); unlock: ftrace_test_recursion_unlock(bit); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c06cab6546ed..64a13eb56078 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -60,23 +60,53 @@ #include "lockdep_internals.h" -#define CREATE_TRACE_POINTS #include <trace/events/lock.h> #ifdef CONFIG_PROVE_LOCKING -int prove_locking = 1; +static int prove_locking = 1; module_param(prove_locking, int, 0644); #else #define prove_locking 0 #endif #ifdef CONFIG_LOCK_STAT -int lock_stat = 1; +static int lock_stat = 1; module_param(lock_stat, int, 0644); #else #define lock_stat 0 #endif +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_lockdep_table[] = { +#ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_LOCK_STAT + { + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_LOCK_STAT */ + { } +}; + +static __init int kernel_lockdep_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_lockdep_table); + return 0; +} +late_initcall(kernel_lockdep_sysctls_init); +#endif /* CONFIG_SYSCTL */ + DEFINE_PER_CPU(unsigned int, lockdep_recursion); EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); @@ -1380,7 +1410,7 @@ static struct lock_list *alloc_list_entry(void) */ static int add_lock_to_list(struct lock_class *this, struct lock_class *links_to, struct list_head *head, - unsigned long ip, u16 distance, u8 dep, + u16 distance, u8 dep, const struct lock_trace *trace) { struct lock_list *entry; @@ -3133,19 +3163,15 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * to the previous lock's dependency list: */ ret = add_lock_to_list(hlock_class(next), hlock_class(prev), - &hlock_class(prev)->locks_after, - next->acquire_ip, distance, - calc_dep(prev, next), - *trace); + &hlock_class(prev)->locks_after, distance, + calc_dep(prev, next), *trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), hlock_class(next), - &hlock_class(next)->locks_before, - next->acquire_ip, distance, - calc_depb(prev, next), - *trace); + &hlock_class(next)->locks_before, distance, + calc_depb(prev, next), *trace); if (!ret) return 0; @@ -4236,14 +4262,13 @@ static void __trace_hardirqs_on_caller(void) /** * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts - * @ip: Caller address * * Invoked before a possible transition to RCU idle from exit to user or * guest mode. This ensures that all RCU operations are done before RCU * stops watching. After the RCU transition lockdep_hardirqs_on() has to be * invoked to set the final state. */ -void lockdep_hardirqs_on_prepare(unsigned long ip) +void lockdep_hardirqs_on_prepare(void) { if (unlikely(!debug_locks)) return; @@ -4840,8 +4865,7 @@ EXPORT_SYMBOL_GPL(__lockdep_no_validate__); static void print_lock_nested_lock_not_held(struct task_struct *curr, - struct held_lock *hlock, - unsigned long ip) + struct held_lock *hlock) { if (!debug_locks_off()) return; @@ -5017,7 +5041,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = iterate_chain_key(chain_key, hlock_id(hlock)); if (nest_lock && !__lock_is_held(nest_lock, -1)) { - print_lock_nested_lock_not_held(curr, hlock, ip); + print_lock_nested_lock_not_held(curr, hlock); return 0; } @@ -5214,9 +5238,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 0; } - lockdep_init_map_waits(lock, name, key, 0, - lock->wait_type_inner, - lock->wait_type_outer); + lockdep_init_map_type(lock, name, key, 0, + lock->wait_type_inner, + lock->wait_type_outer, + lock->lock_type); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes; @@ -5408,7 +5433,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) * be guessable and still allows some pin nesting in * our u32 pin_count. */ - cookie.val = 1 + (prandom_u32() >> 16); + cookie.val = 1 + (sched_clock() & 0xffff); hlock->pin_count += cookie.val; return cookie; } @@ -6546,7 +6571,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) /* * If a CPU is in the RCU-free window in idle (ie: in the section - * between rcu_idle_enter() and rcu_idle_exit(), then RCU + * between ct_idle_enter() and ct_idle_exit(), then RCU * considers that CPU to be in an "extended quiescent state", * which means that RCU will be completely ignoring that CPU. * Therefore, rcu_read_lock() and friends have absolutely no diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 5e3585950ec8..d973fe6041bf 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -30,6 +30,9 @@ #include <linux/debug_locks.h> #include <linux/osq_lock.h> +#define CREATE_TRACE_POINTS +#include <trace/events/lock.h> + #ifndef CONFIG_PREEMPT_RT #include "mutex.h" @@ -599,12 +602,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, NULL)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); if (ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); + trace_contention_end(lock, 0); preempt_enable(); return 0; } @@ -641,6 +646,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas } set_current_state(state); + trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { bool first; @@ -680,10 +686,16 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if (__mutex_trylock_or_handoff(lock, first) || - (first && mutex_optimistic_spin(lock, ww_ctx, &waiter))) + if (__mutex_trylock_or_handoff(lock, first)) break; + if (first) { + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) + break; + trace_contention_begin(lock, LCB_F_MUTEX); + } + raw_spin_lock(&lock->wait_lock); } raw_spin_lock(&lock->wait_lock); @@ -707,6 +719,7 @@ acquired: skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); + trace_contention_end(lock, 0); if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); @@ -719,6 +732,7 @@ err: __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: + trace_contention_end(lock, ret); raw_spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, ip); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index c9fdae94e098..5fe4c5495ba3 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -9,6 +9,7 @@ #include <linux/sched/task.h> #include <linux/sched/debug.h> #include <linux/errno.h> +#include <trace/events/lock.h> int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *key) @@ -171,9 +172,11 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) if (try) return false; + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ); preempt_enable(); percpu_rwsem_wait(sem, /* .reader = */ true); preempt_disable(); + trace_contention_end(sem, 0); return true; } @@ -216,6 +219,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE); /* Notify readers to take the slow path. */ rcu_sync_enter(&sem->rss); @@ -237,6 +241,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem) /* Wait for all active readers to complete. */ rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE); + trace_contention_end(sem, 0); } EXPORT_SYMBOL_GPL(percpu_down_write); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index ec36b73f4733..2e1600906c9f 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -12,10 +12,11 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/spinlock.h> +#include <trace/events/lock.h> /** - * queued_read_lock_slowpath - acquire read lock of a queue rwlock - * @lock: Pointer to queue rwlock structure + * queued_read_lock_slowpath - acquire read lock of a queued rwlock + * @lock: Pointer to queued rwlock structure */ void queued_read_lock_slowpath(struct qrwlock *lock) { @@ -34,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock) } atomic_sub(_QR_BIAS, &lock->cnts); + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ); + /* * Put the reader into the wait queue */ @@ -51,17 +54,21 @@ void queued_read_lock_slowpath(struct qrwlock *lock) * Signal the next one in queue to become queue head */ arch_spin_unlock(&lock->wait_lock); + + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_read_lock_slowpath); /** - * queued_write_lock_slowpath - acquire write lock of a queue rwlock - * @lock : Pointer to queue rwlock structure + * queued_write_lock_slowpath - acquire write lock of a queued rwlock + * @lock : Pointer to queued rwlock structure */ void queued_write_lock_slowpath(struct qrwlock *lock) { int cnts; + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE); + /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); @@ -79,5 +86,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)); unlock: arch_spin_unlock(&lock->wait_lock); + + trace_contention_end(lock, 0); } EXPORT_SYMBOL(queued_write_lock_slowpath); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index cbff6ba53d56..65a9a10caa6f 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -22,6 +22,7 @@ #include <linux/prefetch.h> #include <asm/byteorder.h> #include <asm/qspinlock.h> +#include <trace/events/lock.h> /* * Include queued spinlock statistics code @@ -401,6 +402,8 @@ pv_queue: idx = node->count++; tail = encode_tail(smp_processor_id(), idx); + trace_contention_begin(lock, LCB_F_SPIN); + /* * 4 nodes are allocated based on the assumption that there will * not be nested NMIs taking spinlocks. That may not be true in @@ -554,6 +557,8 @@ locked: pv_kick_node(lock, next); release: + trace_contention_end(lock, 0); + /* * release the node */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8555c4efe97c..7779ee8abc2a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -24,6 +24,8 @@ #include <linux/sched/wake_q.h> #include <linux/ww_mutex.h> +#include <trace/events/lock.h> + #include "rtmutex_common.h" #ifndef WW_RT @@ -1579,6 +1581,8 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, set_current_state(state); + trace_contention_begin(lock, LCB_F_RT); + ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk); if (likely(!ret)) ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); @@ -1601,6 +1605,9 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, * unconditionally. We might have to fix that up. */ fixup_rt_mutex_waiters(lock); + + trace_contention_end(lock, ret); + return ret; } @@ -1683,6 +1690,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) /* Save current state and set state to TASK_RTLOCK_WAIT */ current_save_and_set_rtlock_wait_state(); + trace_contention_begin(lock, LCB_F_RT); + task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK); for (;;) { @@ -1712,6 +1721,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) */ fixup_rt_mutex_waiters(lock); debug_rt_mutex_free_waiter(&waiter); + + trace_contention_end(lock, 0); } static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 6fd3162e4098..c201aadb9301 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -112,6 +112,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, * Reader2 to call up_read(), which might be unbound. */ + trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ); + /* * For rwlocks this returns 0 unconditionally, so the below * !ret conditionals are optimized out. @@ -130,6 +132,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, raw_spin_unlock_irq(&rtm->wait_lock); if (!ret) rwbase_rtmutex_unlock(rtm); + + trace_contention_end(rwb, ret); return ret; } @@ -247,11 +251,13 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, goto out_unlock; rwbase_set_and_save_current_state(state); + trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE); for (;;) { /* Optimized out for rwlocks */ if (rwbase_signal_pending_state(state, current)) { rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); + trace_contention_end(rwb, -EINTR); return -EINTR; } @@ -265,6 +271,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, set_current_state(state); } rwbase_restore_current_state(); + trace_contention_end(rwb, 0); out_unlock: raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index acde5d6f1254..65f0262f635e 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> +#include <trace/events/lock.h> #ifndef CONFIG_PREEMPT_RT #include "lock_events.h" @@ -334,8 +335,6 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; - - /* Writer only, not initialized in reader */ bool handoff_set; }; #define rwsem_first_waiter(sem) \ @@ -375,16 +374,19 @@ rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) * * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of * this function. Modify with care. + * + * Return: true if wait_list isn't empty and false otherwise */ -static inline void +static inline bool rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { lockdep_assert_held(&sem->wait_lock); list_del(&waiter->list); if (likely(!list_empty(&sem->wait_list))) - return; + return true; atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); + return false; } /* @@ -455,10 +457,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * to give up the lock), request a HANDOFF to * force the issue. */ - if (!(oldcount & RWSEM_FLAG_HANDOFF) && - time_after(jiffies, waiter->timeout)) { - adjustment -= RWSEM_FLAG_HANDOFF; - lockevent_inc(rwsem_rlock_handoff); + if (time_after(jiffies, waiter->timeout)) { + if (!(oldcount & RWSEM_FLAG_HANDOFF)) { + adjustment -= RWSEM_FLAG_HANDOFF; + lockevent_inc(rwsem_rlock_handoff); + } + waiter->handoff_set = true; } atomic_long_add(-adjustment, &sem->count); @@ -559,6 +563,33 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, } /* + * Remove a waiter and try to wake up other waiters in the wait queue + * This function is called from the out_nolock path of both the reader and + * writer slowpaths with wait_lock held. It releases the wait_lock and + * optionally wake up waiters before it returns. + */ +static inline void +rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, + struct wake_q_head *wake_q) + __releases(&sem->wait_lock) +{ + bool first = rwsem_first_waiter(sem) == waiter; + + wake_q_init(wake_q); + + /* + * If the wait_list isn't empty and the waiter to be deleted is + * the first waiter, we wake up the remaining waiters as they may + * be eligible to acquire or spin on the lock. + */ + if (rwsem_del_waiter(sem, waiter) && first) + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + if (!wake_q_empty(wake_q)) + wake_up_q(wake_q); +} + +/* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. @@ -568,7 +599,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { - bool first = rwsem_first_waiter(sem) == waiter; + struct rwsem_waiter *first = rwsem_first_waiter(sem); long count, new; lockdep_assert_held(&sem->wait_lock); @@ -578,11 +609,20 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); if (has_handoff) { - if (!first) + /* + * Honor handoff bit and yield only when the first + * waiter is the one that set it. Otherwisee, we + * still try to acquire the rwsem. + */ + if (first->handoff_set && (waiter != first)) return false; - /* First waiter inherits a previously set handoff bit */ - waiter->handoff_set = true; + /* + * First waiter can inherit a previously set handoff + * bit and spin on rwsem if lock acquisition fails. + */ + if (waiter == first) + waiter->handoff_set = true; } new = count; @@ -901,7 +941,7 @@ done: */ static inline void clear_nonspinnable(struct rw_semaphore *sem) { - if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)) + if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))) atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner); } @@ -926,6 +966,31 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) #endif /* + * Prepare to wake up waiter(s) in the wait queue by putting them into the + * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely + * reader-owned, wake up read lock waiters in queue front or wake up any + * front waiter otherwise. + + * This is being called from both reader and writer slow paths. + */ +static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count, + struct wake_q_head *wake_q) +{ + enum rwsem_wake_type wake_type; + + if (count & RWSEM_WRITER_MASK) + return; + + if (count & RWSEM_READER_MASK) { + wake_type = RWSEM_WAKE_READERS; + } else { + wake_type = RWSEM_WAKE_ANY; + clear_nonspinnable(sem); + } + rwsem_mark_wake(sem, wake_type, wake_q); +} + +/* * Wait for the read lock to be granted */ static struct rw_semaphore __sched * @@ -935,7 +1000,6 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat long rcnt = (count >> RWSEM_READER_SHIFT); struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); - bool wake = false; /* * To prevent a constant stream of readers from starving a sleeping @@ -972,17 +1036,17 @@ queue: waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) { /* * In case the wait queue is empty and the lock isn't owned - * by a writer or has the handoff bit set, this reader can - * exit the slowpath and return immediately as its - * RWSEM_READER_BIAS has already been set in the count. + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_READER_BIAS has already been set + * in the count. */ - if (!(atomic_long_read(&sem->count) & - (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { /* Provide lock ACQUIRE */ smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(&sem->wait_lock); @@ -997,22 +1061,13 @@ queue: /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (!(count & RWSEM_LOCK_MASK)) { - clear_nonspinnable(sem); - wake = true; - } - if (wake || (!(count & RWSEM_WRITER_MASK) && - (adjustment & RWSEM_FLAG_WAITERS))) - rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - + rwsem_cond_wake_waiter(sem, count, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); + + if (!wake_q_empty(&wake_q)) + wake_up_q(&wake_q); + + trace_contention_begin(sem, LCB_F_READ); /* wait to be given the lock */ for (;;) { @@ -1035,13 +1090,14 @@ queue: __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); + trace_contention_end(sem, 0); return sem; out_nolock: - rwsem_del_waiter(sem, &waiter); - raw_spin_unlock_irq(&sem->wait_lock); + rwsem_del_wake_waiter(sem, &waiter, &wake_q); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); + trace_contention_end(sem, -EINTR); return ERR_PTR(-EINTR); } @@ -1051,7 +1107,6 @@ out_nolock: static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { - long count; struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); @@ -1075,23 +1130,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) /* we're now waiting on the lock */ if (rwsem_first_waiter(sem) != &waiter) { - count = atomic_long_read(&sem->count); - - /* - * If there were already threads queued before us and: - * 1) there are no active locks, wake the front - * queued process(es) as the handoff bit might be set. - * 2) there are no active writers and some readers, the lock - * must be read owned; so we try to wake any read lock - * waiters that were queued ahead of us. - */ - if (count & RWSEM_WRITER_MASK) - goto wait; - - rwsem_mark_wake(sem, (count & RWSEM_READER_MASK) - ? RWSEM_WAKE_READERS - : RWSEM_WAKE_ANY, &wake_q); - + rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count), + &wake_q); if (!wake_q_empty(&wake_q)) { /* * We want to minimize wait_lock hold time especially @@ -1099,16 +1139,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) */ raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); - wake_q_init(&wake_q); /* Used again, reinit */ raw_spin_lock_irq(&sem->wait_lock); } } else { atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } -wait: /* wait until we successfully acquire the lock */ set_current_state(state); + trace_contention_begin(sem, LCB_F_WRITE); + for (;;) { if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ @@ -1148,17 +1188,15 @@ trylock_again: __set_current_state(TASK_RUNNING); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); + trace_contention_end(sem, 0); return sem; out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); - rwsem_del_waiter(sem, &waiter); - if (!list_empty(&sem->wait_list)) - rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); + rwsem_del_wake_waiter(sem, &waiter, &wake_q); lockevent_inc(rwsem_wlock_fail); + trace_contention_end(sem, -EINTR); return ERR_PTR(-EINTR); } diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 9ee381e4d2a4..f2654d2fe43a 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -32,6 +32,7 @@ #include <linux/semaphore.h> #include <linux/spinlock.h> #include <linux/ftrace.h> +#include <trace/events/lock.h> static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -205,7 +206,7 @@ struct semaphore_waiter { * constant, and thus optimised away by the compiler. Likewise the * 'timeout' parameter for the cases without timeouts. */ -static inline int __sched __down_common(struct semaphore *sem, long state, +static inline int __sched ___down_common(struct semaphore *sem, long state, long timeout) { struct semaphore_waiter waiter; @@ -236,6 +237,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state, return -EINTR; } +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) +{ + int ret; + + trace_contention_begin(sem, 0); + ret = ___down_common(sem, state, timeout); + trace_contention_end(sem, ret); + + return ret; +} + static noinline void __sched __down(struct semaphore *sem) { __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); diff --git a/kernel/module-internal.h b/kernel/module-internal.h deleted file mode 100644 index 8c381c99062f..000000000000 --- a/kernel/module-internal.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* Module internals - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/elf.h> -#include <asm/module.h> - -struct load_info { - const char *name; - /* pointer to module in temporary copy, freed at end of load_module() */ - struct module *mod; - Elf_Ehdr *hdr; - unsigned long len; - Elf_Shdr *sechdrs; - char *secstrings, *strtab; - unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; - struct _ddebug *debug; - unsigned int num_debug; - bool sig_ok; -#ifdef CONFIG_KALLSYMS - unsigned long mod_kallsyms_init_off; -#endif -#ifdef CONFIG_MODULE_DECOMPRESS - struct page **pages; - unsigned int max_pages; - unsigned int used_pages; -#endif - struct { - unsigned int sym, str, mod, vers, info, pcpu; - } index; -}; - -extern int mod_verify_sig(const void *mod, struct load_info *info); - -#ifdef CONFIG_MODULE_DECOMPRESS -int module_decompress(struct load_info *info, const void *buf, size_t size); -void module_decompress_cleanup(struct load_info *info); -#else -static inline int module_decompress(struct load_info *info, - const void *buf, size_t size) -{ - return -EOPNOTSUPP; -} -static inline void module_decompress_cleanup(struct load_info *info) -{ -} -#endif diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig new file mode 100644 index 000000000000..26ea5d04f56c --- /dev/null +++ b/kernel/module/Kconfig @@ -0,0 +1,293 @@ +# SPDX-License-Identifier: GPL-2.0-only +menuconfig MODULES + bool "Enable loadable module support" + modules + help + Kernel modules are small pieces of compiled code which can + be inserted in the running kernel, rather than being + permanently built into the kernel. You use the "modprobe" + tool to add (and sometimes remove) them. If you say Y here, + many parts of the kernel can be built as modules (by + answering M instead of Y where indicated): this is most + useful for infrequently used options which are not required + for booting. For more information, see the man pages for + modprobe, lsmod, modinfo, insmod and rmmod. + + If you say Y here, you will need to run "make + modules_install" to put the modules under /lib/modules/ + where modprobe can find them (you may need to be root to do + this). + + If unsure, say Y. + +if MODULES + +config MODULE_FORCE_LOAD + bool "Forced module loading" + default n + help + Allow loading of modules without version information (ie. modprobe + --force). Forced module loading sets the 'F' (forced) taint flag and + is usually a really bad idea. + +config MODULE_UNLOAD + bool "Module unloading" + help + Without this option you will not be able to unload any + modules (note that some modules may not be unloadable + anyway), which makes your kernel smaller, faster + and simpler. If unsure, say Y. + +config MODULE_FORCE_UNLOAD + bool "Forced module unloading" + depends on MODULE_UNLOAD + help + This option allows you to force a module to unload, even if the + kernel believes it is unsafe: the kernel will remove the module + without waiting for anyone to stop using it (using the -f option to + rmmod). This is mainly for kernel developers and desperate users. + If unsure, say N. + +config MODULE_UNLOAD_TAINT_TRACKING + bool "Tainted module unload tracking" + depends on MODULE_UNLOAD + default n + help + This option allows you to maintain a record of each unloaded + module that tainted the kernel. In addition to displaying a + list of linked (or loaded) modules e.g. on detection of a bad + page (see bad_page()), the aforementioned details are also + shown. If unsure, say N. + +config MODVERSIONS + bool "Module versioning support" + help + Usually, you have to use modules compiled with your kernel. + Saying Y here makes it sometimes possible to use modules + compiled for different kernels, by adding enough information + to the modules to (hopefully) spot any changes which would + make them incompatible with the kernel you are running. If + unsure, say N. + +config ASM_MODVERSIONS + bool + default HAVE_ASM_MODVERSIONS && MODVERSIONS + help + This enables module versioning for exported symbols also from + assembly. This can be enabled only when the target architecture + supports it. + +config MODULE_SRCVERSION_ALL + bool "Source checksum for all modules" + help + Modules which contain a MODULE_VERSION get an extra "srcversion" + field inserted into their modinfo section, which contains a + sum of the source files which made it. This helps maintainers + see exactly which source was used to build a module (since + others sometimes change the module source without updating + the version). With this option, such a "srcversion" field + will be created for all modules. If unsure, say N. + +config MODULE_SIG + bool "Module signature verification" + select MODULE_SIG_FORMAT + help + Check modules for valid signatures upon load: the signature + is simply appended to the module. For more information see + <file:Documentation/admin-guide/module-signing.rst>. + + Note that this option adds the OpenSSL development packages as a + kernel build dependency so that the signing tool can use its crypto + library. + + You should enable this option if you wish to use either + CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via + another LSM - otherwise unsigned modules will be loadable regardless + of the lockdown policy. + + !!!WARNING!!! If you enable this option, you MUST make sure that the + module DOES NOT get stripped after being signed. This includes the + debuginfo strip done by some packagers (such as rpmbuild) and + inclusion into an initramfs that wants the module size reduced. + +config MODULE_SIG_FORCE + bool "Require modules to be validly signed" + depends on MODULE_SIG + help + Reject unsigned modules or signed modules for which we don't have a + key. Without this, such modules will simply taint the kernel. + +config MODULE_SIG_ALL + bool "Automatically sign all modules" + default y + depends on MODULE_SIG || IMA_APPRAISE_MODSIG + help + Sign all modules during make modules_install. Without this option, + modules must be signed manually, using the scripts/sign-file tool. + +comment "Do not forget to sign required modules with scripts/sign-file" + depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL + +choice + prompt "Which hash algorithm should modules be signed with?" + depends on MODULE_SIG || IMA_APPRAISE_MODSIG + help + This determines which sort of hashing algorithm will be used during + signature generation. This algorithm _must_ be built into the kernel + directly so that signature verification can take place. It is not + possible to load a signed module containing the algorithm to check + the signature on that module. + +config MODULE_SIG_SHA1 + bool "Sign modules with SHA-1" + select CRYPTO_SHA1 + +config MODULE_SIG_SHA224 + bool "Sign modules with SHA-224" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA256 + bool "Sign modules with SHA-256" + select CRYPTO_SHA256 + +config MODULE_SIG_SHA384 + bool "Sign modules with SHA-384" + select CRYPTO_SHA512 + +config MODULE_SIG_SHA512 + bool "Sign modules with SHA-512" + select CRYPTO_SHA512 + +endchoice + +config MODULE_SIG_HASH + string + depends on MODULE_SIG || IMA_APPRAISE_MODSIG + default "sha1" if MODULE_SIG_SHA1 + default "sha224" if MODULE_SIG_SHA224 + default "sha256" if MODULE_SIG_SHA256 + default "sha384" if MODULE_SIG_SHA384 + default "sha512" if MODULE_SIG_SHA512 + +choice + prompt "Module compression mode" + help + This option allows you to choose the algorithm which will be used to + compress modules when 'make modules_install' is run. (or, you can + choose to not compress modules at all.) + + External modules will also be compressed in the same way during the + installation. + + For modules inside an initrd or initramfs, it's more efficient to + compress the whole initrd or initramfs instead. + + This is fully compatible with signed modules. + + Please note that the tool used to load modules needs to support the + corresponding algorithm. module-init-tools MAY support gzip, and kmod + MAY support gzip, xz and zstd. + + Your build system needs to provide the appropriate compression tool + to compress the modules. + + If in doubt, select 'None'. + +config MODULE_COMPRESS_NONE + bool "None" + help + Do not compress modules. The installed modules are suffixed + with .ko. + +config MODULE_COMPRESS_GZIP + bool "GZIP" + help + Compress modules with GZIP. The installed modules are suffixed + with .ko.gz. + +config MODULE_COMPRESS_XZ + bool "XZ" + help + Compress modules with XZ. The installed modules are suffixed + with .ko.xz. + +config MODULE_COMPRESS_ZSTD + bool "ZSTD" + help + Compress modules with ZSTD. The installed modules are suffixed + with .ko.zst. + +endchoice + +config MODULE_DECOMPRESS + bool "Support in-kernel module decompression" + depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ + select ZLIB_INFLATE if MODULE_COMPRESS_GZIP + select XZ_DEC if MODULE_COMPRESS_XZ + help + + Support for decompressing kernel modules by the kernel itself + instead of relying on userspace to perform this task. Useful when + load pinning security policy is enabled. + + If unsure, say N. + +config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS + bool "Allow loading of modules with missing namespace imports" + help + Symbols exported with EXPORT_SYMBOL_NS*() are considered exported in + a namespace. A module that makes use of a symbol exported with such a + namespace is required to import the namespace via MODULE_IMPORT_NS(). + There is no technical reason to enforce correct namespace imports, + but it creates consistency between symbols defining namespaces and + users importing namespaces they make use of. This option relaxes this + requirement and lifts the enforcement when loading a module. + + If unsure, say N. + +config MODPROBE_PATH + string "Path to modprobe binary" + default "/sbin/modprobe" + help + When kernel code requests a module, it does so by calling + the "modprobe" userspace utility. This option allows you to + set the path where that binary is found. This can be changed + at runtime via the sysctl file + /proc/sys/kernel/modprobe. Setting this to the empty string + removes the kernel's ability to request modules (but + userspace can still load modules explicitly). + +config TRIM_UNUSED_KSYMS + bool "Trim unused exported kernel symbols" if EXPERT + depends on !COMPILE_TEST + help + The kernel and some modules make many symbols available for + other modules to use via EXPORT_SYMBOL() and variants. Depending + on the set of modules being selected in your kernel configuration, + many of those exported symbols might never be used. + + This option allows for unused exported symbols to be dropped from + the build. In turn, this provides the compiler more opportunities + (especially when using LTO) for optimizing the code and reducing + binary size. This might have some security advantages as well. + + If unsure, or if you need to build out-of-tree modules, say N. + +config UNUSED_KSYMS_WHITELIST + string "Whitelist of symbols to keep in ksymtab" + depends on TRIM_UNUSED_KSYMS + help + By default, all unused exported symbols will be un-exported from the + build when TRIM_UNUSED_KSYMS is selected. + + UNUSED_KSYMS_WHITELIST allows to whitelist symbols that must be kept + exported at all times, even in absence of in-tree users. The value to + set here is the path to a text file containing the list of symbols, + one per line. The path can be absolute, or relative to the kernel + source tree. + +config MODULES_TREE_LOOKUP + def_bool y + depends on PERF_EVENTS || TRACING || CFI_CLANG + +endif # MODULES diff --git a/kernel/module/Makefile b/kernel/module/Makefile new file mode 100644 index 000000000000..948efea81e85 --- /dev/null +++ b/kernel/module/Makefile @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for linux kernel module support +# + +# These are called from save_stack_trace() on slub debug path, +# and produce insane amounts of uninteresting coverage. +KCOV_INSTRUMENT_module.o := n + +obj-y += main.o strict_rwx.o +obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o +obj-$(CONFIG_MODULE_SIG) += signing.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o +obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o +obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o +obj-$(CONFIG_KALLSYMS) += kallsyms.o +obj-$(CONFIG_PROC_FS) += procfs.o +obj-$(CONFIG_SYSFS) += sysfs.o +obj-$(CONFIG_KGDB_KDB) += kdb.o +obj-$(CONFIG_MODVERSIONS) += version.o +obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c new file mode 100644 index 000000000000..12a569d361e8 --- /dev/null +++ b/kernel/module/debug_kmemleak.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module kmemleak support + * + * Copyright (C) 2009 Catalin Marinas + */ + +#include <linux/module.h> +#include <linux/kmemleak.h> +#include "internal.h" + +void kmemleak_load_module(const struct module *mod, + const struct load_info *info) +{ + unsigned int i; + + /* only scan the sections containing data */ + kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); + + for (i = 1; i < info->hdr->e_shnum; i++) { + /* Scan all writable sections that's not executable */ + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || + !(info->sechdrs[i].sh_flags & SHF_WRITE) || + (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, + info->sechdrs[i].sh_size, GFP_KERNEL); + } +} diff --git a/kernel/module_decompress.c b/kernel/module/decompress.c index ffef98a20320..4d0bcb3d9e44 100644 --- a/kernel/module_decompress.c +++ b/kernel/module/decompress.c @@ -12,7 +12,7 @@ #include <linux/sysfs.h> #include <linux/vmalloc.h> -#include "module-internal.h" +#include "internal.h" static int module_extend_max_pages(struct load_info *info, unsigned int extent) { @@ -113,15 +113,16 @@ static ssize_t module_gzip_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); + if (!page) { retval = -ENOMEM; goto out_inflate_end; } - s.next_out = kmap(page); + s.next_out = kmap_local_page(page); s.avail_out = PAGE_SIZE; rc = zlib_inflate(&s, 0); - kunmap(page); + kunmap_local(s.next_out); new_size += PAGE_SIZE - s.avail_out; } while (rc == Z_OK); @@ -171,16 +172,17 @@ static ssize_t module_xz_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); + if (!page) { retval = -ENOMEM; goto out; } - xz_buf.out = kmap(page); + xz_buf.out = kmap_local_page(page); xz_buf.out_pos = 0; xz_buf.out_size = PAGE_SIZE; xz_ret = xz_dec_run(xz_dec, &xz_buf); - kunmap(page); + kunmap_local(xz_buf.out); new_size += xz_buf.out_pos; } while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK); @@ -256,6 +258,7 @@ static ssize_t compression_show(struct kobject *kobj, { return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION)); } + static struct kobj_attribute module_compression_attr = __ATTR_RO(compression); static int __init module_decompress_sysfs_init(void) diff --git a/kernel/module/internal.h b/kernel/module/internal.h new file mode 100644 index 000000000000..2e2bf236f558 --- /dev/null +++ b/kernel/module/internal.h @@ -0,0 +1,305 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Module internals + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/elf.h> +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/mm.h> + +#ifndef ARCH_SHF_SMALL +#define ARCH_SHF_SMALL 0 +#endif + +/* If this is set, the section belongs in the init part of the module */ +#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG - 1)) +/* Maximum number of characters written by module_flags() */ +#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) + +#ifndef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC +#define data_layout core_layout +#endif + +/* + * Modules' sections will be aligned on page boundaries + * to ensure complete separation of code and data, but + * only when CONFIG_STRICT_MODULE_RWX=y + */ +static inline unsigned int strict_align(unsigned int size) +{ + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return PAGE_ALIGN(size); + else + return size; +} + +extern struct mutex module_mutex; +extern struct list_head modules; + +extern struct module_attribute *modinfo_attrs[]; +extern size_t modinfo_attrs_count; + +/* Provided by the linker */ +extern const struct kernel_symbol __start___ksymtab[]; +extern const struct kernel_symbol __stop___ksymtab[]; +extern const struct kernel_symbol __start___ksymtab_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_gpl[]; +extern const s32 __start___kcrctab[]; +extern const s32 __start___kcrctab_gpl[]; + +#include <linux/dynamic_debug.h> +struct load_info { + const char *name; + /* pointer to module in temporary copy, freed at end of load_module() */ + struct module *mod; + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; + struct _ddebug_info dyndbg; + bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif +#ifdef CONFIG_MODULE_DECOMPRESS + struct page **pages; + unsigned int max_pages; + unsigned int used_pages; +#endif + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + +enum mod_license { + NOT_GPL_ONLY, + GPL_ONLY, +}; + +struct find_symbol_arg { + /* Input */ + const char *name; + bool gplok; + bool warn; + + /* Output */ + struct module *owner; + const s32 *crc; + const struct kernel_symbol *sym; + enum mod_license license; +}; + +int mod_verify_sig(const void *mod, struct load_info *info); +int try_to_force_load(struct module *mod, const char *reason); +bool find_symbol(struct find_symbol_arg *fsa); +struct module *find_module_all(const char *name, size_t len, bool even_unformed); +int cmp_name(const void *name, const void *sym); +long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, + unsigned int section); +char *module_flags(struct module *mod, char *buf, bool show_state); +size_t module_flags_taint(unsigned long taints, char *buf); + +static inline void module_assert_mutex_or_preempt(void) +{ +#ifdef CONFIG_LOCKDEP + if (unlikely(!debug_locks)) + return; + + WARN_ON_ONCE(!rcu_read_lock_sched_held() && + !lockdep_is_held(&module_mutex)); +#endif +} + +static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return (unsigned long)offset_to_ptr(&sym->value_offset); +#else + return sym->value; +#endif +} + +#ifdef CONFIG_LIVEPATCH +int copy_module_elf(struct module *mod, struct load_info *info); +void free_module_elf(struct module *mod); +#else /* !CONFIG_LIVEPATCH */ +static inline int copy_module_elf(struct module *mod, struct load_info *info) +{ + return 0; +} + +static inline void free_module_elf(struct module *mod) { } +#endif /* CONFIG_LIVEPATCH */ + +static inline bool set_livepatch_module(struct module *mod) +{ +#ifdef CONFIG_LIVEPATCH + mod->klp = true; + return true; +#else + return false; +#endif +} + +#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING +struct mod_unload_taint { + struct list_head list; + char name[MODULE_NAME_LEN]; + unsigned long taints; + u64 count; +}; + +int try_add_tainted_module(struct module *mod); +void print_unloaded_tainted_modules(void); +#else /* !CONFIG_MODULE_UNLOAD_TAINT_TRACKING */ +static inline int try_add_tainted_module(struct module *mod) +{ + return 0; +} + +static inline void print_unloaded_tainted_modules(void) +{ +} +#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */ + +#ifdef CONFIG_MODULE_DECOMPRESS +int module_decompress(struct load_info *info, const void *buf, size_t size); +void module_decompress_cleanup(struct load_info *info); +#else +static inline int module_decompress(struct load_info *info, + const void *buf, size_t size) +{ + return -EOPNOTSUPP; +} + +static inline void module_decompress_cleanup(struct load_info *info) +{ +} +#endif + +struct mod_tree_root { +#ifdef CONFIG_MODULES_TREE_LOOKUP + struct latch_tree_root root; +#endif + unsigned long addr_min; + unsigned long addr_max; +}; + +extern struct mod_tree_root mod_tree; +extern struct mod_tree_root mod_data_tree; + +#ifdef CONFIG_MODULES_TREE_LOOKUP +void mod_tree_insert(struct module *mod); +void mod_tree_remove_init(struct module *mod); +void mod_tree_remove(struct module *mod); +struct module *mod_find(unsigned long addr, struct mod_tree_root *tree); +#else /* !CONFIG_MODULES_TREE_LOOKUP */ + +static inline void mod_tree_insert(struct module *mod) { } +static inline void mod_tree_remove_init(struct module *mod) { } +static inline void mod_tree_remove(struct module *mod) { } +static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree) +{ + struct module *mod; + + list_for_each_entry_rcu(mod, &modules, list, + lockdep_is_held(&module_mutex)) { + if (within_module(addr, mod)) + return mod; + } + + return NULL; +} +#endif /* CONFIG_MODULES_TREE_LOOKUP */ + +void module_enable_ro(const struct module *mod, bool after_init); +void module_enable_nx(const struct module *mod); +void module_enable_x(const struct module *mod); +int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod); +bool module_check_misalignment(const struct module *mod); + +#ifdef CONFIG_MODULE_SIG +int module_sig_check(struct load_info *info, int flags); +#else /* !CONFIG_MODULE_SIG */ +static inline int module_sig_check(struct load_info *info, int flags) +{ + return 0; +} +#endif /* !CONFIG_MODULE_SIG */ + +#ifdef CONFIG_DEBUG_KMEMLEAK +void kmemleak_load_module(const struct module *mod, const struct load_info *info); +#else /* !CONFIG_DEBUG_KMEMLEAK */ +static inline void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { } +#endif /* CONFIG_DEBUG_KMEMLEAK */ + +#ifdef CONFIG_KALLSYMS +void init_build_id(struct module *mod, const struct load_info *info); +void layout_symtab(struct module *mod, struct load_info *info); +void add_kallsyms(struct module *mod, const struct load_info *info); +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); + +static inline bool sect_empty(const Elf_Shdr *sect) +{ + return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; +} +#else /* !CONFIG_KALLSYMS */ +static inline void init_build_id(struct module *mod, const struct load_info *info) { } +static inline void layout_symtab(struct module *mod, struct load_info *info) { } +static inline void add_kallsyms(struct module *mod, const struct load_info *info) { } +#endif /* CONFIG_KALLSYMS */ + +#ifdef CONFIG_SYSFS +int mod_sysfs_setup(struct module *mod, const struct load_info *info, + struct kernel_param *kparam, unsigned int num_params); +void mod_sysfs_teardown(struct module *mod); +void init_param_lock(struct module *mod); +#else /* !CONFIG_SYSFS */ +static inline int mod_sysfs_setup(struct module *mod, + const struct load_info *info, + struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} + +static inline void mod_sysfs_teardown(struct module *mod) { } +static inline void init_param_lock(struct module *mod) { } +#endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MODVERSIONS +int check_version(const struct load_info *info, + const char *symname, struct module *mod, const s32 *crc); +void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, + struct kernel_symbol *ks, struct tracepoint * const *tp); +int check_modstruct_version(const struct load_info *info, struct module *mod); +int same_magic(const char *amagic, const char *bmagic, bool has_crcs); +#else /* !CONFIG_MODVERSIONS */ +static inline int check_version(const struct load_info *info, + const char *symname, + struct module *mod, + const s32 *crc) +{ + return 1; +} + +static inline int check_modstruct_version(const struct load_info *info, + struct module *mod) +{ + return 1; +} + +static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs) +{ + return strcmp(amagic, bmagic) == 0; +} +#endif /* CONFIG_MODVERSIONS */ diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c new file mode 100644 index 000000000000..f5c5c9175333 --- /dev/null +++ b/kernel/module/kallsyms.c @@ -0,0 +1,534 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module kallsyms support + * + * Copyright (C) 2010 Rusty Russell + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/buildid.h> +#include <linux/bsearch.h> +#include "internal.h" + +/* Lookup exported symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_exported_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + return bsearch(name, start, stop - start, + sizeof(struct kernel_symbol), cmp_name); +} + +static int is_exported(const char *name, unsigned long value, + const struct module *mod) +{ + const struct kernel_symbol *ks; + + if (!mod) + ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab); + else + ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms); + + return ks && kernel_symbol_value(ks) == value; +} + +/* As per nm */ +static char elf_type(const Elf_Sym *sym, const struct load_info *info) +{ + const Elf_Shdr *sechdrs = info->sechdrs; + + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { + if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) + return 'v'; + else + return 'w'; + } + if (sym->st_shndx == SHN_UNDEF) + return 'U'; + if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) + return 'a'; + if (sym->st_shndx >= SHN_LORESERVE) + return '?'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) + return 't'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC && + sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { + if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) + return 'r'; + else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 'g'; + else + return 'd'; + } + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 's'; + else + return 'b'; + } + if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, + ".debug")) { + return 'n'; + } + return '?'; +} + +static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int pcpundx) +{ + const Elf_Shdr *sec; + + if (src->st_shndx == SHN_UNDEF || + src->st_shndx >= shnum || + !src->st_name) + return false; + +#ifdef CONFIG_KALLSYMS_ALL + if (src->st_shndx == pcpundx) + return true; +#endif + + sec = sechdrs + src->st_shndx; + if (!(sec->sh_flags & SHF_ALLOC) +#ifndef CONFIG_KALLSYMS_ALL + || !(sec->sh_flags & SHF_EXECINSTR) +#endif + || (sec->sh_entsize & INIT_OFFSET_MASK)) + return false; + + return true; +} + +/* + * We only allocate and copy the strings needed by the parts of symtab + * we keep. This is simple, but has the effect of making multiple + * copies of duplicates. We could be more sophisticated, see + * linux-kernel thread starting with + * <73defb5e4bca04a6431392cc341112b1@localhost>. + */ +void layout_symtab(struct module *mod, struct load_info *info) +{ + Elf_Shdr *symsect = info->sechdrs + info->index.sym; + Elf_Shdr *strsect = info->sechdrs + info->index.str; + const Elf_Sym *src; + unsigned int i, nsrc, ndst, strtab_size = 0; + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; + symsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, symsect, + info->index.sym) | INIT_OFFSET_MASK; + pr_debug("\t%s\n", info->secstrings + symsect->sh_name); + + src = (void *)info->hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); + + /* Compute total space required for the core symbols' strtab. */ + for (ndst = i = 0; i < nsrc; i++) { + if (i == 0 || is_livepatch_module(mod) || + is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { + strtab_size += strlen(&info->strtab[src[i].st_name]) + 1; + ndst++; + } + } + + /* Append room for core symbols at end of core part. */ + info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1); + info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->data_layout.size += strtab_size; + /* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */ + info->core_typeoffs = mod->data_layout.size; + mod->data_layout.size += ndst * sizeof(char); + mod->data_layout.size = strict_align(mod->data_layout.size); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; + strsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, strsect, + info->index.str) | INIT_OFFSET_MASK; + pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + info->init_typeoffs = mod->init_layout.size; + mod->init_layout.size += nsrc * sizeof(char); + mod->init_layout.size = strict_align(mod->init_layout.size); +} + +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ +void add_kallsyms(struct module *mod, const struct load_info *info) +{ + unsigned int i, ndst; + const Elf_Sym *src; + Elf_Sym *dst; + char *s; + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + unsigned long strtab_size; + + /* Set up to point into init section. */ + mod->kallsyms = (void __rcu *)mod->init_layout.base + + info->mod_kallsyms_init_off; + + rcu_read_lock(); + /* The following is safe since this pointer cannot change */ + rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr; + rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Make sure we get permanent strtab: don't use info->strtab. */ + rcu_dereference(mod->kallsyms)->strtab = + (void *)info->sechdrs[info->index.str].sh_addr; + rcu_dereference(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs; + + /* + * Now populate the cut down core kallsyms for after init + * and set types up while we still have access to sections. + */ + mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs; + mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs; + strtab_size = info->core_typeoffs - info->stroffs; + src = rcu_dereference(mod->kallsyms)->symtab; + for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) { + rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info); + if (i == 0 || is_livepatch_module(mod) || + is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { + ssize_t ret; + + mod->core_kallsyms.typetab[ndst] = + rcu_dereference(mod->kallsyms)->typetab[i]; + dst[ndst] = src[i]; + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + ret = strscpy(s, + &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name], + strtab_size); + if (ret < 0) + break; + s += ret + 1; + strtab_size -= ret + 1; + } + } + rcu_read_unlock(); + mod->core_kallsyms.num_symtab = ndst; +} + +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) +void init_build_id(struct module *mod, const struct load_info *info) +{ + const Elf_Shdr *sechdr; + unsigned int i; + + for (i = 0; i < info->hdr->e_shnum; i++) { + sechdr = &info->sechdrs[i]; + if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE && + !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id, + sechdr->sh_size)) + break; + } +} +#else +void init_build_id(struct module *mod, const struct load_info *info) +{ +} +#endif + +/* + * This ignores the intensely annoying "mapping symbols" found + * in ARM ELF files: $a, $t and $d. + */ +static inline int is_arm_mapping_symbol(const char *str) +{ + if (str[0] == '.' && str[1] == 'L') + return true; + return str[0] == '$' && strchr("axtd", str[1]) && + (str[2] == '\0' || str[2] == '.'); +} + +static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + +/* + * Given a module and address, find the corresponding symbol and return its name + * while providing its size and offset if needed. + */ +static const char *find_kallsyms_symbol(struct module *mod, + unsigned long addr, + unsigned long *size, + unsigned long *offset) +{ + unsigned int i, best = 0; + unsigned long nextval, bestval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + + /* At worse, next value is at end of module */ + if (within_module_init(addr, mod)) + nextval = (unsigned long)mod->init_layout.base + mod->init_layout.text_size; + else + nextval = (unsigned long)mod->core_layout.base + mod->core_layout.text_size; + + bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); + + /* + * Scan for closest preceding symbol, and next symbol. (ELF + * starts real symbols at 1). + */ + for (i = 1; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + unsigned long thisval = kallsyms_symbol_value(sym); + + if (sym->st_shndx == SHN_UNDEF) + continue; + + /* + * We ignore unnamed symbols: they're uninformative + * and inserted at a whim. + */ + if (*kallsyms_symbol_name(kallsyms, i) == '\0' || + is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) + continue; + + if (thisval <= addr && thisval > bestval) { + best = i; + bestval = thisval; + } + if (thisval > addr && thisval < nextval) + nextval = thisval; + } + + if (!best) + return NULL; + + if (size) + *size = nextval - bestval; + if (offset) + *offset = addr - bestval; + + return kallsyms_symbol_name(kallsyms, best); +} + +void * __weak dereference_module_function_descriptor(struct module *mod, + void *ptr) +{ + return ptr; +} + +/* + * For kallsyms to ask for address resolution. NULL means not found. Careful + * not to lock to avoid deadlock on oopses, simply disable preemption. + */ +const char *module_address_lookup(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char **modname, + const unsigned char **modbuildid, + char *namebuf) +{ + const char *ret = NULL; + struct module *mod; + + preempt_disable(); + mod = __module_address(addr); + if (mod) { + if (modname) + *modname = mod->name; + if (modbuildid) { +#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) + *modbuildid = mod->build_id; +#else + *modbuildid = NULL; +#endif + } + + ret = find_kallsyms_symbol(mod, addr, size, offset); + } + /* Make a copy in here where it's safe */ + if (ret) { + strncpy(namebuf, ret, KSYM_NAME_LEN - 1); + ret = namebuf; + } + preempt_enable(); + + return ret; +} + +int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (within_module(addr, mod)) { + const char *sym; + + sym = find_kallsyms_symbol(mod, addr, NULL, NULL); + if (!sym) + goto out; + + strscpy(symname, sym, KSYM_NAME_LEN); + preempt_enable(); + return 0; + } + } +out: + preempt_enable(); + return -ERANGE; +} + +int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (within_module(addr, mod)) { + const char *sym; + + sym = find_kallsyms_symbol(mod, addr, size, offset); + if (!sym) + goto out; + if (modname) + strscpy(modname, mod->name, MODULE_NAME_LEN); + if (name) + strscpy(name, sym, KSYM_NAME_LEN); + preempt_enable(); + return 0; + } + } +out: + preempt_enable(); + return -ERANGE; +} + +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + const Elf_Sym *sym = &kallsyms->symtab[symnum]; + + *value = kallsyms_symbol_value(sym); + *type = kallsyms->typetab[symnum]; + strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); + strscpy(module_name, mod->name, MODULE_NAME_LEN); + *exported = is_exported(name, *value, mod); + preempt_enable(); + return 0; + } + symnum -= kallsyms->num_symtab; + } + preempt_enable(); + return -ERANGE; +} + +/* Given a module and name of symbol, find and return the symbol's value */ +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) +{ + unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + + for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + + if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 && + sym->st_shndx != SHN_UNDEF) + return kallsyms_symbol_value(sym); + } + return 0; +} + +static unsigned long __module_kallsyms_lookup_name(const char *name) +{ + struct module *mod; + char *colon; + + colon = strnchr(name, MODULE_NAME_LEN, ':'); + if (colon) { + mod = find_module_all(name, colon - name, false); + if (mod) + return find_kallsyms_symbol_value(mod, colon + 1); + return 0; + } + + list_for_each_entry_rcu(mod, &modules, list) { + unsigned long ret; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + ret = find_kallsyms_symbol_value(mod, name); + if (ret) + return ret; + } + return 0; +} + +/* Look for this name: can be of form module:name. */ +unsigned long module_kallsyms_lookup_name(const char *name) +{ + unsigned long ret; + + /* Don't lock: we're in enough trouble already. */ + preempt_disable(); + ret = __module_kallsyms_lookup_name(name); + preempt_enable(); + return ret; +} + +#ifdef CONFIG_LIVEPATCH +int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + struct module *mod; + unsigned int i; + int ret = 0; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + + if (mod->state == MODULE_STATE_UNFORMED) + continue; + + /* Use rcu_dereference_sched() to remain compliant with the sparse tool */ + preempt_disable(); + kallsyms = rcu_dereference_sched(mod->kallsyms); + preempt_enable(); + + for (i = 0; i < kallsyms->num_symtab; i++) { + const Elf_Sym *sym = &kallsyms->symtab[i]; + + if (sym->st_shndx == SHN_UNDEF) + continue; + + ret = fn(data, kallsyms_symbol_name(kallsyms, i), + mod, kallsyms_symbol_value(sym)); + if (ret != 0) + goto out; + } + } +out: + mutex_unlock(&module_mutex); + return ret; +} +#endif /* CONFIG_LIVEPATCH */ diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c new file mode 100644 index 000000000000..f4317f92e189 --- /dev/null +++ b/kernel/module/kdb.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module kdb support + * + * Copyright (C) 2010 Jason Wessel + */ + +#include <linux/module.h> +#include <linux/kdb.h> +#include "internal.h" + +/* + * kdb_lsmod - This function implements the 'lsmod' command. Lists + * currently loaded kernel modules. + * Mostly taken from userland lsmod. + */ +int kdb_lsmod(int argc, const char **argv) +{ + struct module *mod; + + if (argc != 0) + return KDB_ARGCOUNT; + + kdb_printf("Module Size modstruct Used by\n"); + list_for_each_entry(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + + kdb_printf("%-20s%8u", mod->name, mod->core_layout.size); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + kdb_printf("/%8u", mod->data_layout.size); +#endif + kdb_printf(" 0x%px ", (void *)mod); +#ifdef CONFIG_MODULE_UNLOAD + kdb_printf("%4d ", module_refcount(mod)); +#endif + if (mod->state == MODULE_STATE_GOING) + kdb_printf(" (Unloading)"); + else if (mod->state == MODULE_STATE_COMING) + kdb_printf(" (Loading)"); + else + kdb_printf(" (Live)"); + kdb_printf(" 0x%px", mod->core_layout.base); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + kdb_printf("/0x%px", mod->data_layout.base); +#endif + +#ifdef CONFIG_MODULE_UNLOAD + { + struct module_use *use; + + kdb_printf(" [ "); + list_for_each_entry(use, &mod->source_list, + source_list) + kdb_printf("%s ", use->target->name); + kdb_printf("]\n"); + } +#endif + } + + return 0; +} diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c new file mode 100644 index 000000000000..486d4ff92719 --- /dev/null +++ b/kernel/module/livepatch.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module livepatch support + * + * Copyright (C) 2016 Jessica Yu <jeyu@redhat.com> + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/slab.h> +#include "internal.h" + +/* + * Persist Elf information about a module. Copy the Elf header, + * section header table, section string table, and symtab section + * index from info to mod->klp_info. + */ +int copy_module_elf(struct module *mod, struct load_info *info) +{ + unsigned int size, symndx; + int ret; + + size = sizeof(*mod->klp_info); + mod->klp_info = kmalloc(size, GFP_KERNEL); + if (!mod->klp_info) + return -ENOMEM; + + /* Elf header */ + size = sizeof(mod->klp_info->hdr); + memcpy(&mod->klp_info->hdr, info->hdr, size); + + /* Elf section header table */ + size = sizeof(*info->sechdrs) * info->hdr->e_shnum; + mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); + if (!mod->klp_info->sechdrs) { + ret = -ENOMEM; + goto free_info; + } + + /* Elf section name string table */ + size = info->sechdrs[info->hdr->e_shstrndx].sh_size; + mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); + if (!mod->klp_info->secstrings) { + ret = -ENOMEM; + goto free_sechdrs; + } + + /* Elf symbol section index */ + symndx = info->index.sym; + mod->klp_info->symndx = symndx; + + /* + * For livepatch modules, core_kallsyms.symtab is a complete + * copy of the original symbol table. Adjust sh_addr to point + * to core_kallsyms.symtab since the copy of the symtab in module + * init memory is freed at the end of do_init_module(). + */ + mod->klp_info->sechdrs[symndx].sh_addr = (unsigned long)mod->core_kallsyms.symtab; + + return 0; + +free_sechdrs: + kfree(mod->klp_info->sechdrs); +free_info: + kfree(mod->klp_info); + return ret; +} + +void free_module_elf(struct module *mod) +{ + kfree(mod->klp_info->sechdrs); + kfree(mod->klp_info->secstrings); + kfree(mod->klp_info); +} diff --git a/kernel/module.c b/kernel/module/main.c index 6cea788fd965..d02d39c7174e 100644 --- a/kernel/module.c +++ b/kernel/module/main.c @@ -14,16 +14,12 @@ #include <linux/init.h> #include <linux/kallsyms.h> #include <linux/buildid.h> -#include <linux/file.h> #include <linux/fs.h> -#include <linux/sysfs.h> #include <linux/kernel.h> #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/elf.h> -#include <linux/proc_fs.h> -#include <linux/security.h> #include <linux/seq_file.h> #include <linux/syscalls.h> #include <linux/fcntl.h> @@ -57,240 +53,74 @@ #include <linux/bsearch.h> #include <linux/dynamic_debug.h> #include <linux/audit.h> +#include <linux/cfi.h> #include <uapi/linux/module.h> -#include "module-internal.h" +#include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/module.h> -#ifndef ARCH_SHF_SMALL -#define ARCH_SHF_SMALL 0 -#endif - -/* - * Modules' sections will be aligned on page boundaries - * to ensure complete separation of code and data, but - * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y - */ -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX -# define debug_align(X) ALIGN(X, PAGE_SIZE) -#else -# define debug_align(X) (X) -#endif - -/* If this is set, the section belongs in the init part of the module */ -#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) - /* * Mutex protects: * 1) List of modules (also safely readable with preempt_disable), * 2) module_use links, - * 3) module_addr_min/module_addr_max. + * 3) mod_tree.addr_min/mod_tree.addr_max. * (delete and add uses RCU list operations). */ -static DEFINE_MUTEX(module_mutex); -static LIST_HEAD(modules); +DEFINE_MUTEX(module_mutex); +LIST_HEAD(modules); /* Work queue for freeing init sections in success case */ static void do_free_init(struct work_struct *w); static DECLARE_WORK(init_free_wq, do_free_init); static LLIST_HEAD(init_free_list); -#ifdef CONFIG_MODULES_TREE_LOOKUP - -/* - * Use a latched RB-tree for __module_address(); this allows us to use - * RCU-sched lookups of the address from any context. - * - * This is conditional on PERF_EVENTS || TRACING because those can really hit - * __module_address() hard by doing a lot of stack unwinding; potentially from - * NMI context. - */ - -static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) -{ - struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - - return (unsigned long)layout->base; -} - -static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) -{ - struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - - return (unsigned long)layout->size; -} - -static __always_inline bool -mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) -{ - return __mod_tree_val(a) < __mod_tree_val(b); -} - -static __always_inline int -mod_tree_comp(void *key, struct latch_tree_node *n) -{ - unsigned long val = (unsigned long)key; - unsigned long start, end; - - start = __mod_tree_val(n); - if (val < start) - return -1; - - end = start + __mod_tree_size(n); - if (val >= end) - return 1; - - return 0; -} - -static const struct latch_tree_ops mod_tree_ops = { - .less = mod_tree_less, - .comp = mod_tree_comp, +struct mod_tree_root mod_tree __cacheline_aligned = { + .addr_min = -1UL, }; -static struct mod_tree_root { - struct latch_tree_root root; - unsigned long addr_min; - unsigned long addr_max; -} mod_tree __cacheline_aligned = { +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC +struct mod_tree_root mod_data_tree __cacheline_aligned = { .addr_min = -1UL, }; +#endif #define module_addr_min mod_tree.addr_min #define module_addr_max mod_tree.addr_max -static noinline void __mod_tree_insert(struct mod_tree_node *node) -{ - latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops); -} - -static void __mod_tree_remove(struct mod_tree_node *node) -{ - latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops); -} - -/* - * These modifications: insert, remove_init and remove; are serialized by the - * module_mutex. - */ -static void mod_tree_insert(struct module *mod) -{ - mod->core_layout.mtn.mod = mod; - mod->init_layout.mtn.mod = mod; - - __mod_tree_insert(&mod->core_layout.mtn); - if (mod->init_layout.size) - __mod_tree_insert(&mod->init_layout.mtn); -} - -static void mod_tree_remove_init(struct module *mod) -{ - if (mod->init_layout.size) - __mod_tree_remove(&mod->init_layout.mtn); -} - -static void mod_tree_remove(struct module *mod) -{ - __mod_tree_remove(&mod->core_layout.mtn); - mod_tree_remove_init(mod); -} - -static struct module *mod_find(unsigned long addr) -{ - struct latch_tree_node *ltn; - - ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops); - if (!ltn) - return NULL; - - return container_of(ltn, struct mod_tree_node, node)->mod; -} - -#else /* MODULES_TREE_LOOKUP */ - -static unsigned long module_addr_min = -1UL, module_addr_max = 0; - -static void mod_tree_insert(struct module *mod) { } -static void mod_tree_remove_init(struct module *mod) { } -static void mod_tree_remove(struct module *mod) { } - -static struct module *mod_find(unsigned long addr) -{ - struct module *mod; - - list_for_each_entry_rcu(mod, &modules, list, - lockdep_is_held(&module_mutex)) { - if (within_module(addr, mod)) - return mod; - } - - return NULL; -} - -#endif /* MODULES_TREE_LOOKUP */ +struct symsearch { + const struct kernel_symbol *start, *stop; + const s32 *crcs; + enum mod_license license; +}; /* * Bounds of module text, for speeding up __module_address. * Protected by module_mutex. */ -static void __mod_update_bounds(void *base, unsigned int size) +static void __mod_update_bounds(void *base, unsigned int size, struct mod_tree_root *tree) { unsigned long min = (unsigned long)base; unsigned long max = min + size; - if (min < module_addr_min) - module_addr_min = min; - if (max > module_addr_max) - module_addr_max = max; + if (min < tree->addr_min) + tree->addr_min = min; + if (max > tree->addr_max) + tree->addr_max = max; } static void mod_update_bounds(struct module *mod) { - __mod_update_bounds(mod->core_layout.base, mod->core_layout.size); + __mod_update_bounds(mod->core_layout.base, mod->core_layout.size, &mod_tree); if (mod->init_layout.size) - __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); -} - -#ifdef CONFIG_KGDB_KDB -struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ -#endif /* CONFIG_KGDB_KDB */ - -static void module_assert_mutex_or_preempt(void) -{ -#ifdef CONFIG_LOCKDEP - if (unlikely(!debug_locks)) - return; - - WARN_ON_ONCE(!rcu_read_lock_sched_held() && - !lockdep_is_held(&module_mutex)); + __mod_update_bounds(mod->init_layout.base, mod->init_layout.size, &mod_tree); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + __mod_update_bounds(mod->data_layout.base, mod->data_layout.size, &mod_data_tree); #endif } -#ifdef CONFIG_MODULE_SIG -static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); -module_param(sig_enforce, bool_enable_only, 0644); - -void set_module_sig_enforced(void) -{ - sig_enforce = true; -} -#else -#define sig_enforce false -#endif - -/* - * Export sig_enforce kernel cmdline parameter to allow other subsystems rely - * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. - */ -bool is_module_sig_enforced(void) -{ - return sig_enforce; -} -EXPORT_SYMBOL(is_module_sig_enforced); - /* Block module loading/unloading? */ -int modules_disabled = 0; +int modules_disabled; core_param(nomodule, modules_disabled, bint, 0); /* Waiting for a module to finish initializing? */ @@ -408,66 +238,12 @@ static __maybe_unused void *any_section_objs(const struct load_info *info, return (void *)info->sechdrs[sec].sh_addr; } -/* Provided by the linker */ -extern const struct kernel_symbol __start___ksymtab[]; -extern const struct kernel_symbol __stop___ksymtab[]; -extern const struct kernel_symbol __start___ksymtab_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_gpl[]; -extern const s32 __start___kcrctab[]; -extern const s32 __start___kcrctab_gpl[]; - #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL #else #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) #endif -struct symsearch { - const struct kernel_symbol *start, *stop; - const s32 *crcs; - enum mod_license { - NOT_GPL_ONLY, - GPL_ONLY, - } license; -}; - -struct find_symbol_arg { - /* Input */ - const char *name; - bool gplok; - bool warn; - - /* Output */ - struct module *owner; - const s32 *crc; - const struct kernel_symbol *sym; - enum mod_license license; -}; - -static bool check_exported_symbol(const struct symsearch *syms, - struct module *owner, - unsigned int symnum, void *data) -{ - struct find_symbol_arg *fsa = data; - - if (!fsa->gplok && syms->license == GPL_ONLY) - return false; - fsa->owner = owner; - fsa->crc = symversion(syms->crcs, symnum); - fsa->sym = &syms->start[symnum]; - fsa->license = syms->license; - return true; -} - -static unsigned long kernel_symbol_value(const struct kernel_symbol *sym) -{ -#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS - return (unsigned long)offset_to_ptr(&sym->value_offset); -#else - return sym->value; -#endif -} - static const char *kernel_symbol_name(const struct kernel_symbol *sym) { #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS @@ -488,33 +264,38 @@ static const char *kernel_symbol_namespace(const struct kernel_symbol *sym) #endif } -static int cmp_name(const void *name, const void *sym) +int cmp_name(const void *name, const void *sym) { return strcmp(name, kernel_symbol_name(sym)); } static bool find_exported_symbol_in_section(const struct symsearch *syms, struct module *owner, - void *data) + struct find_symbol_arg *fsa) { - struct find_symbol_arg *fsa = data; struct kernel_symbol *sym; + if (!fsa->gplok && syms->license == GPL_ONLY) + return false; + sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, sizeof(struct kernel_symbol), cmp_name); + if (!sym) + return false; - if (sym != NULL && check_exported_symbol(syms, owner, - sym - syms->start, data)) - return true; + fsa->owner = owner; + fsa->crc = symversion(syms->crcs, sym - syms->start); + fsa->sym = sym; + fsa->license = syms->license; - return false; + return true; } /* * Find an exported symbol and return it, along with, (optional) crc and * (optional) module which owns it. Needs preempt disabled or module_mutex. */ -static bool find_symbol(struct find_symbol_arg *fsa) +bool find_symbol(struct find_symbol_arg *fsa) { static const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, @@ -558,8 +339,8 @@ static bool find_symbol(struct find_symbol_arg *fsa) * Search for module by name: must hold module_mutex (or preempt disabled * for read-only access). */ -static struct module *find_module_all(const char *name, size_t len, - bool even_unformed) +struct module *find_module_all(const char *name, size_t len, + bool even_unformed) { struct module *mod; @@ -744,7 +525,10 @@ static struct module_attribute modinfo_##field = { \ MODINFO_ATTR(version); MODINFO_ATTR(srcversion); -static char last_unloaded_module[MODULE_NAME_LEN+1]; +static struct { + char name[MODULE_NAME_LEN + 1]; + char taints[MODULE_FLAGS_BUF_SIZE]; +} last_unloaded_module; #ifdef CONFIG_MODULE_UNLOAD @@ -914,6 +698,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, { struct module *mod; char name[MODULE_NAME_LEN]; + char buf[MODULE_FLAGS_BUF_SIZE]; int ret, forced = 0; if (!capable(CAP_SYS_MODULE) || modules_disabled) @@ -973,8 +758,9 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, async_synchronize_full(); - /* Store the name of the last unloaded module for diagnostic purposes */ - strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + /* Store the name and taints of the last unloaded module for diagnostic purposes */ + strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name)); + strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints)); free_module(mod); /* someone could wait for the module in add_unformed_module() */ @@ -985,31 +771,6 @@ out: return ret; } -static inline void print_unload_info(struct seq_file *m, struct module *mod) -{ - struct module_use *use; - int printed_something = 0; - - seq_printf(m, " %i ", module_refcount(mod)); - - /* - * Always include a trailing , so userspace can differentiate - * between this and the old multi-field proc format. - */ - list_for_each_entry(use, &mod->source_list, source_list) { - printed_something = 1; - seq_printf(m, "%s,", use->source->name); - } - - if (mod->init != NULL && mod->exit == NULL) { - printed_something = 1; - seq_puts(m, "[permanent],"); - } - - if (!printed_something) - seq_puts(m, "-"); -} - void __symbol_put(const char *symbol) { struct find_symbol_arg fsa = { @@ -1099,12 +860,6 @@ void module_put(struct module *module) EXPORT_SYMBOL(module_put); #else /* !CONFIG_MODULE_UNLOAD */ -static inline void print_unload_info(struct seq_file *m, struct module *mod) -{ - /* We don't know the usage count, or what modules are using. */ - seq_puts(m, " - -"); -} - static inline void module_unload_free(struct module *mod) { } @@ -1120,13 +875,13 @@ static inline int module_unload_init(struct module *mod) } #endif /* CONFIG_MODULE_UNLOAD */ -static size_t module_flags_taint(struct module *mod, char *buf) +size_t module_flags_taint(unsigned long taints, char *buf) { size_t l = 0; int i; for (i = 0; i < TAINT_FLAGS_COUNT; i++) { - if (taint_flags[i].module && test_bit(i, &mod->taints)) + if (taint_flags[i].module && test_bit(i, &taints)) buf[l++] = taint_flags[i].c_true; } @@ -1179,6 +934,17 @@ static ssize_t show_coresize(struct module_attribute *mattr, static struct module_attribute modinfo_coresize = __ATTR(coresize, 0444, show_coresize, NULL); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC +static ssize_t show_datasize(struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sprintf(buffer, "%u\n", mk->mod->data_layout.size); +} + +static struct module_attribute modinfo_datasize = + __ATTR(datasize, 0444, show_datasize, NULL); +#endif + static ssize_t show_initsize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { @@ -1193,7 +959,7 @@ static ssize_t show_taint(struct module_attribute *mattr, { size_t l; - l = module_flags_taint(mk->mod, buffer); + l = module_flags_taint(mk->mod->taints, buffer); buffer[l++] = '\n'; return l; } @@ -1201,12 +967,15 @@ static ssize_t show_taint(struct module_attribute *mattr, static struct module_attribute modinfo_taint = __ATTR(taint, 0444, show_taint, NULL); -static struct module_attribute *modinfo_attrs[] = { +struct module_attribute *modinfo_attrs[] = { &module_uevent, &modinfo_version, &modinfo_srcversion, &modinfo_initstate, &modinfo_coresize, +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + &modinfo_datasize, +#endif &modinfo_initsize, &modinfo_taint, #ifdef CONFIG_MODULE_UNLOAD @@ -1215,9 +984,11 @@ static struct module_attribute *modinfo_attrs[] = { NULL, }; +size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs); + static const char vermagic[] = VERMAGIC_STRING; -static int try_to_force_load(struct module *mod, const char *reason) +int try_to_force_load(struct module *mod, const char *reason) { #ifdef CONFIG_MODULE_FORCE_LOAD if (!test_taint(TAINT_FORCED_MODULE)) @@ -1229,115 +1000,6 @@ static int try_to_force_load(struct module *mod, const char *reason) #endif } -#ifdef CONFIG_MODVERSIONS - -static u32 resolve_rel_crc(const s32 *crc) -{ - return *(u32 *)((void *)crc + *crc); -} - -static int check_version(const struct load_info *info, - const char *symname, - struct module *mod, - const s32 *crc) -{ - Elf_Shdr *sechdrs = info->sechdrs; - unsigned int versindex = info->index.vers; - unsigned int i, num_versions; - struct modversion_info *versions; - - /* Exporting module didn't supply crcs? OK, we're already tainted. */ - if (!crc) - return 1; - - /* No versions at all? modprobe --force does this. */ - if (versindex == 0) - return try_to_force_load(mod, symname) == 0; - - versions = (void *) sechdrs[versindex].sh_addr; - num_versions = sechdrs[versindex].sh_size - / sizeof(struct modversion_info); - - for (i = 0; i < num_versions; i++) { - u32 crcval; - - if (strcmp(versions[i].name, symname) != 0) - continue; - - if (IS_ENABLED(CONFIG_MODULE_REL_CRCS)) - crcval = resolve_rel_crc(crc); - else - crcval = *crc; - if (versions[i].crc == crcval) - return 1; - pr_debug("Found checksum %X vs module %lX\n", - crcval, versions[i].crc); - goto bad_version; - } - - /* Broken toolchain. Warn once, then let it go.. */ - pr_warn_once("%s: no symbol version for %s\n", info->name, symname); - return 1; - -bad_version: - pr_warn("%s: disagrees about version of symbol %s\n", - info->name, symname); - return 0; -} - -static inline int check_modstruct_version(const struct load_info *info, - struct module *mod) -{ - struct find_symbol_arg fsa = { - .name = "module_layout", - .gplok = true, - }; - - /* - * Since this should be found in kernel (which can't be removed), no - * locking is necessary -- use preempt_disable() to placate lockdep. - */ - preempt_disable(); - if (!find_symbol(&fsa)) { - preempt_enable(); - BUG(); - } - preempt_enable(); - return check_version(info, "module_layout", mod, fsa.crc); -} - -/* First part is kernel version, which we ignore if module has crcs. */ -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - if (has_crcs) { - amagic += strcspn(amagic, " "); - bmagic += strcspn(bmagic, " "); - } - return strcmp(amagic, bmagic) == 0; -} -#else -static inline int check_version(const struct load_info *info, - const char *symname, - struct module *mod, - const s32 *crc) -{ - return 1; -} - -static inline int check_modstruct_version(const struct load_info *info, - struct module *mod) -{ - return 1; -} - -static inline int same_magic(const char *amagic, const char *bmagic, - bool has_crcs) -{ - return strcmp(amagic, bmagic) == 0; -} -#endif /* CONFIG_MODVERSIONS */ - static char *get_modinfo(const struct load_info *info, const char *tag); static char *get_next_modinfo(const struct load_info *info, const char *tag, char *prev); @@ -1372,20 +1034,20 @@ static int verify_namespace_is_imported(const struct load_info *info, return 0; } -static bool inherit_taint(struct module *mod, struct module *owner) +static bool inherit_taint(struct module *mod, struct module *owner, const char *name) { if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints)) return true; if (mod->using_gplonly_symbols) { - pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n", - mod->name, owner->name); + pr_err("%s: module using GPL-only symbols uses symbols %s from proprietary module %s.\n", + mod->name, name, owner->name); return false; } if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) { - pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n", - mod->name, owner->name); + pr_warn("%s: module uses symbols %s from proprietary module %s, inheriting taint.\n", + mod->name, name, owner->name); set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints); } return true; @@ -1417,7 +1079,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, if (fsa.license == GPL_ONLY) mod->using_gplonly_symbols = true; - if (!inherit_taint(mod, fsa.owner)) { + if (!inherit_taint(mod, fsa.owner, name)) { fsa.sym = NULL; goto getname; } @@ -1465,674 +1127,6 @@ resolve_symbol_wait(struct module *mod, return ksym; } -#ifdef CONFIG_KALLSYMS -static inline bool sect_empty(const Elf_Shdr *sect) -{ - return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; -} -#endif - -/* - * /sys/module/foo/sections stuff - * J. Corbet <corbet@lwn.net> - */ -#ifdef CONFIG_SYSFS - -#ifdef CONFIG_KALLSYMS -struct module_sect_attr { - struct bin_attribute battr; - unsigned long address; -}; - -struct module_sect_attrs { - struct attribute_group grp; - unsigned int nsections; - struct module_sect_attr attrs[]; -}; - -#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) -static ssize_t module_sect_read(struct file *file, struct kobject *kobj, - struct bin_attribute *battr, - char *buf, loff_t pos, size_t count) -{ - struct module_sect_attr *sattr = - container_of(battr, struct module_sect_attr, battr); - char bounce[MODULE_SECT_READ_SIZE + 1]; - size_t wrote; - - if (pos != 0) - return -EINVAL; - - /* - * Since we're a binary read handler, we must account for the - * trailing NUL byte that sprintf will write: if "buf" is - * too small to hold the NUL, or the NUL is exactly the last - * byte, the read will look like it got truncated by one byte. - * Since there is no way to ask sprintf nicely to not write - * the NUL, we have to use a bounce buffer. - */ - wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", - kallsyms_show_value(file->f_cred) - ? (void *)sattr->address : NULL); - count = min(count, wrote); - memcpy(buf, bounce, count); - - return count; -} - -static void free_sect_attrs(struct module_sect_attrs *sect_attrs) -{ - unsigned int section; - - for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].battr.attr.name); - kfree(sect_attrs); -} - -static void add_sect_attrs(struct module *mod, const struct load_info *info) -{ - unsigned int nloaded = 0, i, size[2]; - struct module_sect_attrs *sect_attrs; - struct module_sect_attr *sattr; - struct bin_attribute **gattr; - - /* Count loaded sections and allocate structures */ - for (i = 0; i < info->hdr->e_shnum; i++) - if (!sect_empty(&info->sechdrs[i])) - nloaded++; - size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), - sizeof(sect_attrs->grp.bin_attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); - sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); - if (sect_attrs == NULL) - return; - - /* Setup section attributes. */ - sect_attrs->grp.name = "sections"; - sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; - - sect_attrs->nsections = 0; - sattr = §_attrs->attrs[0]; - gattr = §_attrs->grp.bin_attrs[0]; - for (i = 0; i < info->hdr->e_shnum; i++) { - Elf_Shdr *sec = &info->sechdrs[i]; - if (sect_empty(sec)) - continue; - sysfs_bin_attr_init(&sattr->battr); - sattr->address = sec->sh_addr; - sattr->battr.attr.name = - kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); - if (sattr->battr.attr.name == NULL) - goto out; - sect_attrs->nsections++; - sattr->battr.read = module_sect_read; - sattr->battr.size = MODULE_SECT_READ_SIZE; - sattr->battr.attr.mode = 0400; - *(gattr++) = &(sattr++)->battr; - } - *gattr = NULL; - - if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) - goto out; - - mod->sect_attrs = sect_attrs; - return; - out: - free_sect_attrs(sect_attrs); -} - -static void remove_sect_attrs(struct module *mod) -{ - if (mod->sect_attrs) { - sysfs_remove_group(&mod->mkobj.kobj, - &mod->sect_attrs->grp); - /* - * We are positive that no one is using any sect attrs - * at this point. Deallocate immediately. - */ - free_sect_attrs(mod->sect_attrs); - mod->sect_attrs = NULL; - } -} - -/* - * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections. - */ - -struct module_notes_attrs { - struct kobject *dir; - unsigned int notes; - struct bin_attribute attrs[]; -}; - -static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t pos, size_t count) -{ - /* - * The caller checked the pos and count against our size. - */ - memcpy(buf, bin_attr->private + pos, count); - return count; -} - -static void free_notes_attrs(struct module_notes_attrs *notes_attrs, - unsigned int i) -{ - if (notes_attrs->dir) { - while (i-- > 0) - sysfs_remove_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i]); - kobject_put(notes_attrs->dir); - } - kfree(notes_attrs); -} - -static void add_notes_attrs(struct module *mod, const struct load_info *info) -{ - unsigned int notes, loaded, i; - struct module_notes_attrs *notes_attrs; - struct bin_attribute *nattr; - - /* failed to create section attributes, so can't create notes */ - if (!mod->sect_attrs) - return; - - /* Count notes sections and allocate structures. */ - notes = 0; - for (i = 0; i < info->hdr->e_shnum; i++) - if (!sect_empty(&info->sechdrs[i]) && - (info->sechdrs[i].sh_type == SHT_NOTE)) - ++notes; - - if (notes == 0) - return; - - notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), - GFP_KERNEL); - if (notes_attrs == NULL) - return; - - notes_attrs->notes = notes; - nattr = ¬es_attrs->attrs[0]; - for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { - if (sect_empty(&info->sechdrs[i])) - continue; - if (info->sechdrs[i].sh_type == SHT_NOTE) { - sysfs_bin_attr_init(nattr); - nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; - nattr->attr.mode = S_IRUGO; - nattr->size = info->sechdrs[i].sh_size; - nattr->private = (void *) info->sechdrs[i].sh_addr; - nattr->read = module_notes_read; - ++nattr; - } - ++loaded; - } - - notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); - if (!notes_attrs->dir) - goto out; - - for (i = 0; i < notes; ++i) - if (sysfs_create_bin_file(notes_attrs->dir, - ¬es_attrs->attrs[i])) - goto out; - - mod->notes_attrs = notes_attrs; - return; - - out: - free_notes_attrs(notes_attrs, i); -} - -static void remove_notes_attrs(struct module *mod) -{ - if (mod->notes_attrs) - free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes); -} - -#else - -static inline void add_sect_attrs(struct module *mod, - const struct load_info *info) -{ -} - -static inline void remove_sect_attrs(struct module *mod) -{ -} - -static inline void add_notes_attrs(struct module *mod, - const struct load_info *info) -{ -} - -static inline void remove_notes_attrs(struct module *mod) -{ -} -#endif /* CONFIG_KALLSYMS */ - -static void del_usage_links(struct module *mod) -{ -#ifdef CONFIG_MODULE_UNLOAD - struct module_use *use; - - mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) - sysfs_remove_link(use->target->holders_dir, mod->name); - mutex_unlock(&module_mutex); -#endif -} - -static int add_usage_links(struct module *mod) -{ - int ret = 0; -#ifdef CONFIG_MODULE_UNLOAD - struct module_use *use; - - mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) { - ret = sysfs_create_link(use->target->holders_dir, - &mod->mkobj.kobj, mod->name); - if (ret) - break; - } - mutex_unlock(&module_mutex); - if (ret) - del_usage_links(mod); -#endif - return ret; -} - -static void module_remove_modinfo_attrs(struct module *mod, int end); - -static int module_add_modinfo_attrs(struct module *mod) -{ - struct module_attribute *attr; - struct module_attribute *temp_attr; - int error = 0; - int i; - - mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * - (ARRAY_SIZE(modinfo_attrs) + 1)), - GFP_KERNEL); - if (!mod->modinfo_attrs) - return -ENOMEM; - - temp_attr = mod->modinfo_attrs; - for (i = 0; (attr = modinfo_attrs[i]); i++) { - if (!attr->test || attr->test(mod)) { - memcpy(temp_attr, attr, sizeof(*temp_attr)); - sysfs_attr_init(&temp_attr->attr); - error = sysfs_create_file(&mod->mkobj.kobj, - &temp_attr->attr); - if (error) - goto error_out; - ++temp_attr; - } - } - - return 0; - -error_out: - if (i > 0) - module_remove_modinfo_attrs(mod, --i); - else - kfree(mod->modinfo_attrs); - return error; -} - -static void module_remove_modinfo_attrs(struct module *mod, int end) -{ - struct module_attribute *attr; - int i; - - for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { - if (end >= 0 && i > end) - break; - /* pick a field to test for end of list */ - if (!attr->attr.name) - break; - sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); - if (attr->free) - attr->free(mod); - } - kfree(mod->modinfo_attrs); -} - -static void mod_kobject_put(struct module *mod) -{ - DECLARE_COMPLETION_ONSTACK(c); - mod->mkobj.kobj_completion = &c; - kobject_put(&mod->mkobj.kobj); - wait_for_completion(&c); -} - -static int mod_sysfs_init(struct module *mod) -{ - int err; - struct kobject *kobj; - - if (!module_sysfs_initialized) { - pr_err("%s: module sysfs not initialized\n", mod->name); - err = -EINVAL; - goto out; - } - - kobj = kset_find_obj(module_kset, mod->name); - if (kobj) { - pr_err("%s: module is already loaded\n", mod->name); - kobject_put(kobj); - err = -EINVAL; - goto out; - } - - mod->mkobj.mod = mod; - - memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); - mod->mkobj.kobj.kset = module_kset; - err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, - "%s", mod->name); - if (err) - mod_kobject_put(mod); - -out: - return err; -} - -static int mod_sysfs_setup(struct module *mod, - const struct load_info *info, - struct kernel_param *kparam, - unsigned int num_params) -{ - int err; - - err = mod_sysfs_init(mod); - if (err) - goto out; - - mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); - if (!mod->holders_dir) { - err = -ENOMEM; - goto out_unreg; - } - - err = module_param_sysfs_setup(mod, kparam, num_params); - if (err) - goto out_unreg_holders; - - err = module_add_modinfo_attrs(mod); - if (err) - goto out_unreg_param; - - err = add_usage_links(mod); - if (err) - goto out_unreg_modinfo_attrs; - - add_sect_attrs(mod, info); - add_notes_attrs(mod, info); - - return 0; - -out_unreg_modinfo_attrs: - module_remove_modinfo_attrs(mod, -1); -out_unreg_param: - module_param_sysfs_remove(mod); -out_unreg_holders: - kobject_put(mod->holders_dir); -out_unreg: - mod_kobject_put(mod); -out: - return err; -} - -static void mod_sysfs_fini(struct module *mod) -{ - remove_notes_attrs(mod); - remove_sect_attrs(mod); - mod_kobject_put(mod); -} - -static void init_param_lock(struct module *mod) -{ - mutex_init(&mod->param_lock); -} -#else /* !CONFIG_SYSFS */ - -static int mod_sysfs_setup(struct module *mod, - const struct load_info *info, - struct kernel_param *kparam, - unsigned int num_params) -{ - return 0; -} - -static void mod_sysfs_fini(struct module *mod) -{ -} - -static void module_remove_modinfo_attrs(struct module *mod, int end) -{ -} - -static void del_usage_links(struct module *mod) -{ -} - -static void init_param_lock(struct module *mod) -{ -} -#endif /* CONFIG_SYSFS */ - -static void mod_sysfs_teardown(struct module *mod) -{ - del_usage_links(mod); - module_remove_modinfo_attrs(mod, -1); - module_param_sysfs_remove(mod); - kobject_put(mod->mkobj.drivers_dir); - kobject_put(mod->holders_dir); - mod_sysfs_fini(mod); -} - -/* - * LKM RO/NX protection: protect module's text/ro-data - * from modification and any data from execution. - * - * General layout of module is: - * [text] [read-only-data] [ro-after-init] [writable data] - * text_size -----^ ^ ^ ^ - * ro_size ------------------------| | | - * ro_after_init_size -----------------------------| | - * size -----------------------------------------------------------| - * - * These values are always page-aligned (as is base) - */ - -/* - * Since some arches are moving towards PAGE_KERNEL module allocations instead - * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() outside of the - * CONFIG_STRICT_MODULE_RWX block below because they are needed regardless of - * whether we are strict. - */ -#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX -static void frob_text(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base, - layout->text_size >> PAGE_SHIFT); -} - -static void module_enable_x(const struct module *mod) -{ - frob_text(&mod->core_layout, set_memory_x); - frob_text(&mod->init_layout, set_memory_x); -} -#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ -static void module_enable_x(const struct module *mod) { } -#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ - -#ifdef CONFIG_STRICT_MODULE_RWX -static void frob_rodata(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->text_size, - (layout->ro_size - layout->text_size) >> PAGE_SHIFT); -} - -static void frob_ro_after_init(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_size, - (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); -} - -static void frob_writable_data(const struct module_layout *layout, - int (*set_memory)(unsigned long start, int num_pages)) -{ - BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_after_init_size, - (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); -} - -static void module_enable_ro(const struct module *mod, bool after_init) -{ - if (!rodata_enabled) - return; - - set_vm_flush_reset_perms(mod->core_layout.base); - set_vm_flush_reset_perms(mod->init_layout.base); - frob_text(&mod->core_layout, set_memory_ro); - - frob_rodata(&mod->core_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_ro); - frob_rodata(&mod->init_layout, set_memory_ro); - - if (after_init) - frob_ro_after_init(&mod->core_layout, set_memory_ro); -} - -static void module_enable_nx(const struct module *mod) -{ - frob_rodata(&mod->core_layout, set_memory_nx); - frob_ro_after_init(&mod->core_layout, set_memory_nx); - frob_writable_data(&mod->core_layout, set_memory_nx); - frob_rodata(&mod->init_layout, set_memory_nx); - frob_writable_data(&mod->init_layout, set_memory_nx); -} - -static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) -{ - const unsigned long shf_wx = SHF_WRITE|SHF_EXECINSTR; - int i; - - for (i = 0; i < hdr->e_shnum; i++) { - if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { - pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", - mod->name, secstrings + sechdrs[i].sh_name, i); - return -ENOEXEC; - } - } - - return 0; -} - -#else /* !CONFIG_STRICT_MODULE_RWX */ -static void module_enable_nx(const struct module *mod) { } -static void module_enable_ro(const struct module *mod, bool after_init) {} -static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) -{ - return 0; -} -#endif /* CONFIG_STRICT_MODULE_RWX */ - -#ifdef CONFIG_LIVEPATCH -/* - * Persist Elf information about a module. Copy the Elf header, - * section header table, section string table, and symtab section - * index from info to mod->klp_info. - */ -static int copy_module_elf(struct module *mod, struct load_info *info) -{ - unsigned int size, symndx; - int ret; - - size = sizeof(*mod->klp_info); - mod->klp_info = kmalloc(size, GFP_KERNEL); - if (mod->klp_info == NULL) - return -ENOMEM; - - /* Elf header */ - size = sizeof(mod->klp_info->hdr); - memcpy(&mod->klp_info->hdr, info->hdr, size); - - /* Elf section header table */ - size = sizeof(*info->sechdrs) * info->hdr->e_shnum; - mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); - if (mod->klp_info->sechdrs == NULL) { - ret = -ENOMEM; - goto free_info; - } - - /* Elf section name string table */ - size = info->sechdrs[info->hdr->e_shstrndx].sh_size; - mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); - if (mod->klp_info->secstrings == NULL) { - ret = -ENOMEM; - goto free_sechdrs; - } - - /* Elf symbol section index */ - symndx = info->index.sym; - mod->klp_info->symndx = symndx; - - /* - * For livepatch modules, core_kallsyms.symtab is a complete - * copy of the original symbol table. Adjust sh_addr to point - * to core_kallsyms.symtab since the copy of the symtab in module - * init memory is freed at the end of do_init_module(). - */ - mod->klp_info->sechdrs[symndx].sh_addr = \ - (unsigned long) mod->core_kallsyms.symtab; - - return 0; - -free_sechdrs: - kfree(mod->klp_info->sechdrs); -free_info: - kfree(mod->klp_info); - return ret; -} - -static void free_module_elf(struct module *mod) -{ - kfree(mod->klp_info->sechdrs); - kfree(mod->klp_info->secstrings); - kfree(mod->klp_info); -} -#else /* !CONFIG_LIVEPATCH */ -static int copy_module_elf(struct module *mod, struct load_info *info) -{ - return 0; -} - -static void free_module_elf(struct module *mod) -{ -} -#endif /* CONFIG_LIVEPATCH */ - void __weak module_memfree(void *module_region) { /* @@ -2151,8 +1145,6 @@ void __weak module_arch_freeing_init(struct module *mod) { } -static void cfi_cleanup(struct module *mod); - /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -2192,11 +1184,11 @@ static void free_module(struct module *mod) module_bug_cleanup(mod); /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ synchronize_rcu(); + if (try_add_tainted_module(mod)) + pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n", + mod->name); mutex_unlock(&module_mutex); - /* Clean up CFI for the module. */ - cfi_cleanup(mod); - /* This may be empty, but that's OK */ module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); @@ -2204,10 +1196,13 @@ static void free_module(struct module *mod) percpu_modfree(mod); /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); + lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size); /* Finally, free the core (containing the module structure) */ module_memfree(mod->core_layout.base); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + vfree(mod->data_layout.base); +#endif } void *__symbol_get(const char *symbol) @@ -2395,7 +1390,7 @@ unsigned int __weak arch_mod_section_prepend(struct module *mod, } /* Update size with this section: return offset. */ -static long get_offset(struct module *mod, unsigned int *size, +long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr, unsigned int section) { long ret; @@ -2445,30 +1440,32 @@ static void layout_sections(struct module *mod, struct load_info *info) for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; const char *sname = info->secstrings + s->sh_name; + unsigned int *sizep; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL || module_init_layout_section(sname)) continue; - s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); + sizep = m ? &mod->data_layout.size : &mod->core_layout.size; + s->sh_entsize = module_get_offset(mod, sizep, s, i); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.size = strict_align(mod->core_layout.size); mod->core_layout.text_size = mod->core_layout.size; break; case 1: /* RO: text and ro-data */ - mod->core_layout.size = debug_align(mod->core_layout.size); - mod->core_layout.ro_size = mod->core_layout.size; + mod->data_layout.size = strict_align(mod->data_layout.size); + mod->data_layout.ro_size = mod->data_layout.size; break; case 2: /* RO after init */ - mod->core_layout.size = debug_align(mod->core_layout.size); - mod->core_layout.ro_after_init_size = mod->core_layout.size; + mod->data_layout.size = strict_align(mod->data_layout.size); + mod->data_layout.ro_after_init_size = mod->data_layout.size; break; case 4: /* whole core */ - mod->core_layout.size = debug_align(mod->core_layout.size); + mod->data_layout.size = strict_align(mod->data_layout.size); break; } } @@ -2484,17 +1481,17 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || !module_init_layout_section(sname)) continue; - s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) + s->sh_entsize = (module_get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.size = strict_align(mod->init_layout.size); mod->init_layout.text_size = mod->init_layout.size; break; case 1: /* RO: text and ro-data */ - mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.size = strict_align(mod->init_layout.size); mod->init_layout.ro_size = mod->init_layout.size; break; case 2: @@ -2505,7 +1502,7 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; break; case 4: /* whole init */ - mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.size = strict_align(mod->init_layout.size); break; } } @@ -2597,238 +1594,16 @@ static void free_modinfo(struct module *mod) } } -#ifdef CONFIG_KALLSYMS - -/* Lookup exported symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_exported_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - return bsearch(name, start, stop - start, - sizeof(struct kernel_symbol), cmp_name); -} - -static int is_exported(const char *name, unsigned long value, - const struct module *mod) -{ - const struct kernel_symbol *ks; - if (!mod) - ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab); - else - ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms); - - return ks != NULL && kernel_symbol_value(ks) == value; -} - -/* As per nm */ -static char elf_type(const Elf_Sym *sym, const struct load_info *info) -{ - const Elf_Shdr *sechdrs = info->sechdrs; - - if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { - if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) - return 'v'; - else - return 'w'; - } - if (sym->st_shndx == SHN_UNDEF) - return 'U'; - if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) - return 'a'; - if (sym->st_shndx >= SHN_LORESERVE) - return '?'; - if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) - return 't'; - if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC - && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { - if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) - return 'r'; - else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 'g'; - else - return 'd'; - } - if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { - if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) - return 's'; - else - return 'b'; - } - if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, - ".debug")) { - return 'n'; - } - return '?'; -} - -static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum, unsigned int pcpundx) -{ - const Elf_Shdr *sec; - - if (src->st_shndx == SHN_UNDEF - || src->st_shndx >= shnum - || !src->st_name) - return false; - -#ifdef CONFIG_KALLSYMS_ALL - if (src->st_shndx == pcpundx) - return true; -#endif - - sec = sechdrs + src->st_shndx; - if (!(sec->sh_flags & SHF_ALLOC) -#ifndef CONFIG_KALLSYMS_ALL - || !(sec->sh_flags & SHF_EXECINSTR) -#endif - || (sec->sh_entsize & INIT_OFFSET_MASK)) - return false; - - return true; -} - -/* - * We only allocate and copy the strings needed by the parts of symtab - * we keep. This is simple, but has the effect of making multiple - * copies of duplicates. We could be more sophisticated, see - * linux-kernel thread starting with - * <73defb5e4bca04a6431392cc341112b1@localhost>. - */ -static void layout_symtab(struct module *mod, struct load_info *info) -{ - Elf_Shdr *symsect = info->sechdrs + info->index.sym; - Elf_Shdr *strsect = info->sechdrs + info->index.str; - const Elf_Sym *src; - unsigned int i, nsrc, ndst, strtab_size = 0; - - /* Put symbol section at end of init part of module. */ - symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect, - info->index.sym) | INIT_OFFSET_MASK; - pr_debug("\t%s\n", info->secstrings + symsect->sh_name); - - src = (void *)info->hdr + symsect->sh_offset; - nsrc = symsect->sh_size / sizeof(*src); - - /* Compute total space required for the core symbols' strtab. */ - for (ndst = i = 0; i < nsrc; i++) { - if (i == 0 || is_livepatch_module(mod) || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, - info->index.pcpu)) { - strtab_size += strlen(&info->strtab[src[i].st_name])+1; - ndst++; - } - } - - /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_layout.size += strtab_size; - info->core_typeoffs = mod->core_layout.size; - mod->core_layout.size += ndst * sizeof(char); - mod->core_layout.size = debug_align(mod->core_layout.size); - - /* Put string table section at end of init part of module. */ - strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, - info->index.str) | INIT_OFFSET_MASK; - pr_debug("\t%s\n", info->secstrings + strsect->sh_name); - - /* We'll tack temporary mod_kallsyms on the end. */ - mod->init_layout.size = ALIGN(mod->init_layout.size, - __alignof__(struct mod_kallsyms)); - info->mod_kallsyms_init_off = mod->init_layout.size; - mod->init_layout.size += sizeof(struct mod_kallsyms); - info->init_typeoffs = mod->init_layout.size; - mod->init_layout.size += nsrc * sizeof(char); - mod->init_layout.size = debug_align(mod->init_layout.size); -} - -/* - * We use the full symtab and strtab which layout_symtab arranged to - * be appended to the init section. Later we switch to the cut-down - * core-only ones. - */ -static void add_kallsyms(struct module *mod, const struct load_info *info) -{ - unsigned int i, ndst; - const Elf_Sym *src; - Elf_Sym *dst; - char *s; - Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - - /* Set up to point into init section. */ - mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; - - mod->kallsyms->symtab = (void *)symsec->sh_addr; - mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); - /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; - mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; - - /* - * Now populate the cut down core kallsyms for after init - * and set types up while we still have access to sections. - */ - mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; - mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; - mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; - src = mod->kallsyms->symtab; - for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { - mod->kallsyms->typetab[i] = elf_type(src + i, info); - if (i == 0 || is_livepatch_module(mod) || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, - info->index.pcpu)) { - mod->core_kallsyms.typetab[ndst] = - mod->kallsyms->typetab[i]; - dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_kallsyms.strtab; - s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], - KSYM_NAME_LEN) + 1; - } - } - mod->core_kallsyms.num_symtab = ndst; -} -#else -static inline void layout_symtab(struct module *mod, struct load_info *info) -{ -} - -static void add_kallsyms(struct module *mod, const struct load_info *info) -{ -} -#endif /* CONFIG_KALLSYMS */ - -#if IS_ENABLED(CONFIG_KALLSYMS) && IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) -static void init_build_id(struct module *mod, const struct load_info *info) -{ - const Elf_Shdr *sechdr; - unsigned int i; - - for (i = 0; i < info->hdr->e_shnum; i++) { - sechdr = &info->sechdrs[i]; - if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE && - !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id, - sechdr->sh_size)) - break; - } -} -#else -static void init_build_id(struct module *mod, const struct load_info *info) -{ -} -#endif - -static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num) +static void dynamic_debug_setup(struct module *mod, struct _ddebug_info *dyndbg) { - if (!debug) + if (!dyndbg->num_descs) return; - ddebug_add_module(debug, num, mod->name); + ddebug_add_module(dyndbg, mod->name); } -static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) +static void dynamic_debug_remove(struct module *mod, struct _ddebug_info *dyndbg) { - if (debug) + if (dyndbg->num_descs) ddebug_remove_module(mod->name); } @@ -2849,97 +1624,6 @@ bool __weak module_exit_section(const char *name) return strstarts(name, ".exit"); } -#ifdef CONFIG_DEBUG_KMEMLEAK -static void kmemleak_load_module(const struct module *mod, - const struct load_info *info) -{ - unsigned int i; - - /* only scan the sections containing data */ - kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - - for (i = 1; i < info->hdr->e_shnum; i++) { - /* Scan all writable sections that's not executable */ - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || - !(info->sechdrs[i].sh_flags & SHF_WRITE) || - (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) - continue; - - kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, - info->sechdrs[i].sh_size, GFP_KERNEL); - } -} -#else -static inline void kmemleak_load_module(const struct module *mod, - const struct load_info *info) -{ -} -#endif - -#ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info, int flags) -{ - int err = -ENODATA; - const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; - const char *reason; - const void *mod = info->hdr; - bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | - MODULE_INIT_IGNORE_VERMAGIC); - /* - * Do not allow mangled modules as a module with version information - * removed is no longer the module that was signed. - */ - if (!mangled_module && - info->len > markerlen && - memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { - /* We truncate the module to discard the signature */ - info->len -= markerlen; - err = mod_verify_sig(mod, info); - if (!err) { - info->sig_ok = true; - return 0; - } - } - - /* - * We don't permit modules to be loaded into the trusted kernels - * without a valid signature on them, but if we're not enforcing, - * certain errors are non-fatal. - */ - switch (err) { - case -ENODATA: - reason = "unsigned module"; - break; - case -ENOPKG: - reason = "module with unsupported crypto"; - break; - case -ENOKEY: - reason = "module with unavailable key"; - break; - - default: - /* - * All other errors are fatal, including lack of memory, - * unparseable signatures, and signature check failures -- - * even if signatures aren't required. - */ - return err; - } - - if (is_module_sig_enforced()) { - pr_notice("Loading of %s is rejected\n", reason); - return -EKEYREJECTED; - } - - return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); -} -#else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info, int flags) -{ - return 0; -} -#endif /* !CONFIG_MODULE_SIG */ - static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr) { #if defined(CONFIG_64BIT) @@ -3033,6 +1717,10 @@ static int elf_validity_check(struct load_info *info) * strings in the section safe. */ info->secstrings = (void *)info->hdr + strhdr->sh_offset; + if (strhdr->sh_size == 0) { + pr_err("empty section name table\n"); + goto no_exec; + } if (info->secstrings[strhdr->sh_size - 1] != '\0') { pr_err("ELF Spec violation: section name table isn't null terminated\n"); goto no_exec; @@ -3107,30 +1795,23 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l return 0; } -#ifdef CONFIG_LIVEPATCH static int check_modinfo_livepatch(struct module *mod, struct load_info *info) { - if (get_modinfo(info, "livepatch")) { - mod->klp = true; + if (!get_modinfo(info, "livepatch")) + /* Nothing more to do */ + return 0; + + if (set_livepatch_module(mod)) { add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK); pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n", - mod->name); - } - - return 0; -} -#else /* !CONFIG_LIVEPATCH */ -static int check_modinfo_livepatch(struct module *mod, struct load_info *info) -{ - if (get_modinfo(info, "livepatch")) { - pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", - mod->name); - return -ENOEXEC; + mod->name); + return 0; } - return 0; + pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", + mod->name); + return -ENOEXEC; } -#endif /* CONFIG_LIVEPATCH */ static void check_modinfo_retpoline(struct module *mod, struct load_info *info) { @@ -3308,6 +1989,13 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) /* Set up license info based on the info section */ set_license(mod, get_modinfo(info, "license")); + if (get_modinfo(info, "test")) { + if (!test_taint(TAINT_TEST)) + pr_warn("%s: loading test module taints kernel.\n", + mod->name); + add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK); + } + return 0; } @@ -3407,14 +2095,22 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->static_call_sites), &mod->num_static_call_sites); #endif +#if IS_ENABLED(CONFIG_KUNIT) + mod->kunit_suites = section_objs(info, ".kunit_test_suites", + sizeof(*mod->kunit_suites), + &mod->num_kunit_suites); +#endif + mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); if (section_addr(info, "__obsparm")) pr_warn("%s: Ignoring obsolete parameters\n", mod->name); - info->debug = section_objs(info, "__dyndbg", - sizeof(*info->debug), &info->num_debug); + info->dyndbg.descs = section_objs(info, "__dyndbg", + sizeof(*info->dyndbg.descs), &info->dyndbg.num_descs); + info->dyndbg.classes = section_objs(info, "__dyndbg_classes", + sizeof(*info->dyndbg.classes), &info->dyndbg.num_classes); return 0; } @@ -3456,6 +2152,23 @@ static int move_module(struct module *mod, struct load_info *info) } else mod->init_layout.base = NULL; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + /* Do the allocs. */ + ptr = vzalloc(mod->data_layout.size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. Just mark it as not being a + * leak. + */ + kmemleak_not_leak(ptr); + if (!ptr) { + module_memfree(mod->core_layout.base); + module_memfree(mod->init_layout.base); + return -ENOMEM; + } + + mod->data_layout.base = ptr; +#endif /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); for (i = 0; i < info->hdr->e_shnum; i++) { @@ -3468,6 +2181,8 @@ static int move_module(struct module *mod, struct load_info *info) if (shdr->sh_entsize & INIT_OFFSET_MASK) dest = mod->init_layout.base + (shdr->sh_entsize & ~INIT_OFFSET_MASK); + else if (!(shdr->sh_flags & SHF_EXECINSTR)) + dest = mod->data_layout.base + shdr->sh_entsize; else dest = mod->core_layout.base + shdr->sh_entsize; @@ -3629,6 +2344,9 @@ static void module_deallocate(struct module *mod, struct load_info *info) module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); module_memfree(mod->core_layout.base); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + vfree(mod->data_layout.base); +#endif } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3707,6 +2425,12 @@ static void do_free_init(struct work_struct *w) } } +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "module." +/* Default value for module->async_probe_requested */ +static bool async_probe; +module_param(async_probe, bool, 0644); + /* * This is where the real work happens. * @@ -3876,8 +2600,12 @@ static int complete_formation(struct module *mod, struct load_info *info) if (err < 0) goto out; - /* This relies on module_mutex for list integrity. */ + /* These rely on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); + module_cfi_finalize(info->hdr, info->sechdrs, mod); + + if (module_check_misalignment(mod)) + goto out_misaligned; module_enable_ro(mod, false); module_enable_nx(mod); @@ -3892,6 +2620,8 @@ static int complete_formation(struct module *mod, struct load_info *info) return 0; +out_misaligned: + err = -EINVAL; out: mutex_unlock(&module_mutex); return err; @@ -3922,7 +2652,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, int ret; if (strcmp(param, "async_probe") == 0) { - mod->async_probe_requested = true; + if (strtobool(val, &mod->async_probe_requested)) + mod->async_probe_requested = true; return 0; } @@ -3933,8 +2664,6 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, return 0; } -static void cfi_init(struct module *mod); - /* * Allocate and load the module: note that size of section 0 is always * zero, and we rely on this for optional sections. @@ -4064,9 +2793,6 @@ static int load_module(struct load_info *info, const char __user *uargs, flush_module_icache(mod); - /* Setup CFI for the module. */ - cfi_init(mod); - /* Now copy in args */ mod->args = strndup_user(uargs, ~0UL >> 1); if (IS_ERR(mod->args)) { @@ -4075,7 +2801,7 @@ static int load_module(struct load_info *info, const char __user *uargs, } init_build_id(mod, info); - dynamic_debug_setup(mod, info->debug, info->num_debug); + dynamic_debug_setup(mod, &info->dyndbg); /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ ftrace_module_init(mod); @@ -4089,6 +2815,8 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto bug_cleanup; + mod->async_probe_requested = async_probe; + /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, mod, @@ -4137,11 +2865,10 @@ static int load_module(struct load_info *info, const char __user *uargs, ddebug_cleanup: ftrace_release_mod(mod); - dynamic_debug_remove(mod, info->debug); + dynamic_debug_remove(mod, &info->dyndbg); synchronize_rcu(); kfree(mod->args); free_arch_cleanup: - cfi_cleanup(mod); module_arch_cleanup(mod); free_modinfo: free_modinfo(mod); @@ -4158,7 +2885,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_unlock(&module_mutex); free_module: /* Free lock-classes; relies on the preceding sync_rcu() */ - lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); + lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size); module_deallocate(mod, info); free_copy: @@ -4227,445 +2954,33 @@ static inline int within(unsigned long addr, void *start, unsigned long size) return ((void *)addr >= start && (void *)addr < start + size); } -#ifdef CONFIG_KALLSYMS -/* - * This ignores the intensely annoying "mapping symbols" found - * in ARM ELF files: $a, $t and $d. - */ -static inline int is_arm_mapping_symbol(const char *str) -{ - if (str[0] == '.' && str[1] == 'L') - return true; - return str[0] == '$' && strchr("axtd", str[1]) - && (str[2] == '\0' || str[2] == '.'); -} - -static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum) -{ - return kallsyms->strtab + kallsyms->symtab[symnum].st_name; -} - -/* - * Given a module and address, find the corresponding symbol and return its name - * while providing its size and offset if needed. - */ -static const char *find_kallsyms_symbol(struct module *mod, - unsigned long addr, - unsigned long *size, - unsigned long *offset) -{ - unsigned int i, best = 0; - unsigned long nextval, bestval; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - - /* At worse, next value is at end of module */ - if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size; - else - nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; - - bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); - - /* - * Scan for closest preceding symbol, and next symbol. (ELF - * starts real symbols at 1). - */ - for (i = 1; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - unsigned long thisval = kallsyms_symbol_value(sym); - - if (sym->st_shndx == SHN_UNDEF) - continue; - - /* - * We ignore unnamed symbols: they're uninformative - * and inserted at a whim. - */ - if (*kallsyms_symbol_name(kallsyms, i) == '\0' - || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) - continue; - - if (thisval <= addr && thisval > bestval) { - best = i; - bestval = thisval; - } - if (thisval > addr && thisval < nextval) - nextval = thisval; - } - - if (!best) - return NULL; - - if (size) - *size = nextval - bestval; - if (offset) - *offset = addr - bestval; - - return kallsyms_symbol_name(kallsyms, best); -} - -void * __weak dereference_module_function_descriptor(struct module *mod, - void *ptr) -{ - return ptr; -} - -/* - * For kallsyms to ask for address resolution. NULL means not found. Careful - * not to lock to avoid deadlock on oopses, simply disable preemption. - */ -const char *module_address_lookup(unsigned long addr, - unsigned long *size, - unsigned long *offset, - char **modname, - const unsigned char **modbuildid, - char *namebuf) -{ - const char *ret = NULL; - struct module *mod; - - preempt_disable(); - mod = __module_address(addr); - if (mod) { - if (modname) - *modname = mod->name; - if (modbuildid) { -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) - *modbuildid = mod->build_id; -#else - *modbuildid = NULL; -#endif - } - - ret = find_kallsyms_symbol(mod, addr, size, offset); - } - /* Make a copy in here where it's safe */ - if (ret) { - strncpy(namebuf, ret, KSYM_NAME_LEN - 1); - ret = namebuf; - } - preempt_enable(); - - return ret; -} - -int lookup_module_symbol_name(unsigned long addr, char *symname) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - const char *sym; - - sym = find_kallsyms_symbol(mod, addr, NULL, NULL); - if (!sym) - goto out; - - strlcpy(symname, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, - unsigned long *offset, char *modname, char *name) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if (within_module(addr, mod)) { - const char *sym; - - sym = find_kallsyms_symbol(mod, addr, size, offset); - if (!sym) - goto out; - if (modname) - strlcpy(modname, mod->name, MODULE_NAME_LEN); - if (name) - strlcpy(name, sym, KSYM_NAME_LEN); - preempt_enable(); - return 0; - } - } -out: - preempt_enable(); - return -ERANGE; -} - -int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *name, char *module_name, int *exported) -{ - struct module *mod; - - preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) { - struct mod_kallsyms *kallsyms; - - if (mod->state == MODULE_STATE_UNFORMED) - continue; - kallsyms = rcu_dereference_sched(mod->kallsyms); - if (symnum < kallsyms->num_symtab) { - const Elf_Sym *sym = &kallsyms->symtab[symnum]; - - *value = kallsyms_symbol_value(sym); - *type = kallsyms->typetab[symnum]; - strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); - strlcpy(module_name, mod->name, MODULE_NAME_LEN); - *exported = is_exported(name, *value, mod); - preempt_enable(); - return 0; - } - symnum -= kallsyms->num_symtab; - } - preempt_enable(); - return -ERANGE; -} - -/* Given a module and name of symbol, find and return the symbol's value */ -static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) -{ - unsigned int i; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - - for (i = 0; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - - if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 && - sym->st_shndx != SHN_UNDEF) - return kallsyms_symbol_value(sym); - } - return 0; -} - -/* Look for this name: can be of form module:name. */ -unsigned long module_kallsyms_lookup_name(const char *name) -{ - struct module *mod; - char *colon; - unsigned long ret = 0; - - /* Don't lock: we're in enough trouble already. */ - preempt_disable(); - if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { - if ((mod = find_module_all(name, colon - name, false)) != NULL) - ret = find_kallsyms_symbol_value(mod, colon+1); - } else { - list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) - continue; - if ((ret = find_kallsyms_symbol_value(mod, name)) != 0) - break; - } - } - preempt_enable(); - return ret; -} - -#ifdef CONFIG_LIVEPATCH -int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, - struct module *, unsigned long), - void *data) -{ - struct module *mod; - unsigned int i; - int ret = 0; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) { - /* We hold module_mutex: no need for rcu_dereference_sched */ - struct mod_kallsyms *kallsyms = mod->kallsyms; - - if (mod->state == MODULE_STATE_UNFORMED) - continue; - for (i = 0; i < kallsyms->num_symtab; i++) { - const Elf_Sym *sym = &kallsyms->symtab[i]; - - if (sym->st_shndx == SHN_UNDEF) - continue; - - ret = fn(data, kallsyms_symbol_name(kallsyms, i), - mod, kallsyms_symbol_value(sym)); - if (ret != 0) - goto out; - - cond_resched(); - } - } -out: - mutex_unlock(&module_mutex); - return ret; -} -#endif /* CONFIG_LIVEPATCH */ -#endif /* CONFIG_KALLSYMS */ - -static void cfi_init(struct module *mod) -{ -#ifdef CONFIG_CFI_CLANG - initcall_t *init; - exitcall_t *exit; - - rcu_read_lock_sched(); - mod->cfi_check = (cfi_check_fn) - find_kallsyms_symbol_value(mod, "__cfi_check"); - init = (initcall_t *) - find_kallsyms_symbol_value(mod, "__cfi_jt_init_module"); - exit = (exitcall_t *) - find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module"); - rcu_read_unlock_sched(); - - /* Fix init/exit functions to point to the CFI jump table */ - if (init) - mod->init = *init; -#ifdef CONFIG_MODULE_UNLOAD - if (exit) - mod->exit = *exit; -#endif - - cfi_module_add(mod, module_addr_min); -#endif -} - -static void cfi_cleanup(struct module *mod) -{ -#ifdef CONFIG_CFI_CLANG - cfi_module_remove(mod, module_addr_min); -#endif -} - -/* Maximum number of characters written by module_flags() */ -#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) - /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ -static char *module_flags(struct module *mod, char *buf) +char *module_flags(struct module *mod, char *buf, bool show_state) { int bx = 0; BUG_ON(mod->state == MODULE_STATE_UNFORMED); + if (!mod->taints && !show_state) + goto out; if (mod->taints || mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - bx += module_flags_taint(mod, buf + bx); + bx += module_flags_taint(mod->taints, buf + bx); /* Show a - for module-is-being-unloaded */ - if (mod->state == MODULE_STATE_GOING) + if (mod->state == MODULE_STATE_GOING && show_state) buf[bx++] = '-'; /* Show a + for module-is-being-loaded */ - if (mod->state == MODULE_STATE_COMING) + if (mod->state == MODULE_STATE_COMING && show_state) buf[bx++] = '+'; buf[bx++] = ')'; } +out: buf[bx] = '\0'; return buf; } -#ifdef CONFIG_PROC_FS -/* Called by the /proc file system to return a list of modules. */ -static void *m_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&module_mutex); - return seq_list_start(&modules, *pos); -} - -static void *m_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &modules, pos); -} - -static void m_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&module_mutex); -} - -static int m_show(struct seq_file *m, void *p) -{ - struct module *mod = list_entry(p, struct module, list); - char buf[MODULE_FLAGS_BUF_SIZE]; - void *value; - - /* We always ignore unformed modules. */ - if (mod->state == MODULE_STATE_UNFORMED) - return 0; - - seq_printf(m, "%s %u", - mod->name, mod->init_layout.size + mod->core_layout.size); - print_unload_info(m, mod); - - /* Informative for users. */ - seq_printf(m, " %s", - mod->state == MODULE_STATE_GOING ? "Unloading" : - mod->state == MODULE_STATE_COMING ? "Loading" : - "Live"); - /* Used by oprofile and other similar tools. */ - value = m->private ? NULL : mod->core_layout.base; - seq_printf(m, " 0x%px", value); - - /* Taints info */ - if (mod->taints) - seq_printf(m, " %s", module_flags(mod, buf)); - - seq_puts(m, "\n"); - return 0; -} - -/* - * Format: modulename size refcount deps address - * - * Where refcount is a number or -, and deps is a comma-separated list - * of depends or -. - */ -static const struct seq_operations modules_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = m_show -}; - -/* - * This also sets the "private" pointer to non-NULL if the - * kernel pointers should be hidden (so you can just test - * "m->private" to see if you should keep the values private). - * - * We use the same logic as for /proc/kallsyms. - */ -static int modules_open(struct inode *inode, struct file *file) -{ - int err = seq_open(file, &modules_op); - - if (!err) { - struct seq_file *m = file->private_data; - m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul; - } - - return err; -} - -static const struct proc_ops modules_proc_ops = { - .proc_flags = PROC_ENTRY_PERMANENT, - .proc_open = modules_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -}; - -static int __init proc_modules_init(void) -{ - proc_create("modules", 0, NULL, &modules_proc_ops); - return 0; -} -module_init(proc_modules_init); -#endif - /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { @@ -4721,13 +3036,20 @@ bool is_module_address(unsigned long addr) struct module *__module_address(unsigned long addr) { struct module *mod; + struct mod_tree_root *tree; - if (addr < module_addr_min || addr > module_addr_max) + if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max) + tree = &mod_tree; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + else if (addr >= mod_data_tree.addr_min && addr <= mod_data_tree.addr_max) + tree = &mod_data_tree; +#endif + else return NULL; module_assert_mutex_or_preempt(); - mod = mod_find(addr); + mod = mod_find(addr, tree); if (mod) { BUG_ON(!within_module(addr, mod)); if (mod->state == MODULE_STATE_UNFORMED) @@ -4786,25 +3108,13 @@ void print_modules(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - pr_cont(" %s%s", mod->name, module_flags(mod, buf)); + pr_cont(" %s%s", mod->name, module_flags(mod, buf, true)); } + + print_unloaded_tainted_modules(); preempt_enable(); - if (last_unloaded_module[0]) - pr_cont(" [last unloaded: %s]", last_unloaded_module); + if (last_unloaded_module.name[0]) + pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name, + last_unloaded_module.taints); pr_cont("\n"); } - -#ifdef CONFIG_MODVERSIONS -/* - * Generate the signature for all relevant module structures here. - * If these change, we don't want to try to parse the module. - */ -void module_layout(struct module *mod, - struct modversion_info *ver, - struct kernel_param *kp, - struct kernel_symbol *ks, - struct tracepoint * const *tp) -{ -} -EXPORT_SYMBOL(module_layout); -#endif diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c new file mode 100644 index 000000000000..cf5b9f1e6ec4 --- /dev/null +++ b/kernel/module/procfs.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module proc support + * + * Copyright (C) 2008 Alexey Dobriyan + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/mutex.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include "internal.h" + +#ifdef CONFIG_MODULE_UNLOAD +static inline void print_unload_info(struct seq_file *m, struct module *mod) +{ + struct module_use *use; + int printed_something = 0; + + seq_printf(m, " %i ", module_refcount(mod)); + + /* + * Always include a trailing , so userspace can differentiate + * between this and the old multi-field proc format. + */ + list_for_each_entry(use, &mod->source_list, source_list) { + printed_something = 1; + seq_printf(m, "%s,", use->source->name); + } + + if (mod->init && !mod->exit) { + printed_something = 1; + seq_puts(m, "[permanent],"); + } + + if (!printed_something) + seq_puts(m, "-"); +} +#else /* !CONFIG_MODULE_UNLOAD */ +static inline void print_unload_info(struct seq_file *m, struct module *mod) +{ + /* We don't know the usage count, or what modules are using. */ + seq_puts(m, " - -"); +} +#endif /* CONFIG_MODULE_UNLOAD */ + +/* Called by the /proc file system to return a list of modules. */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&module_mutex); + return seq_list_start(&modules, *pos); +} + +static void *m_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &modules, pos); +} + +static void m_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&module_mutex); +} + +static int m_show(struct seq_file *m, void *p) +{ + struct module *mod = list_entry(p, struct module, list); + char buf[MODULE_FLAGS_BUF_SIZE]; + void *value; + unsigned int size; + + /* We always ignore unformed modules. */ + if (mod->state == MODULE_STATE_UNFORMED) + return 0; + + size = mod->init_layout.size + mod->core_layout.size; +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + size += mod->data_layout.size; +#endif + seq_printf(m, "%s %u", mod->name, size); + print_unload_info(m, mod); + + /* Informative for users. */ + seq_printf(m, " %s", + mod->state == MODULE_STATE_GOING ? "Unloading" : + mod->state == MODULE_STATE_COMING ? "Loading" : + "Live"); + /* Used by oprofile and other similar tools. */ + value = m->private ? NULL : mod->core_layout.base; + seq_printf(m, " 0x%px", value); + + /* Taints info */ + if (mod->taints) + seq_printf(m, " %s", module_flags(mod, buf, true)); + + seq_puts(m, "\n"); + return 0; +} + +/* + * Format: modulename size refcount deps address + * + * Where refcount is a number or -, and deps is a comma-separated list + * of depends or -. + */ +static const struct seq_operations modules_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = m_show +}; + +/* + * This also sets the "private" pointer to non-NULL if the + * kernel pointers should be hidden (so you can just test + * "m->private" to see if you should keep the values private). + * + * We use the same logic as for /proc/kallsyms. + */ +static int modules_open(struct inode *inode, struct file *file) +{ + int err = seq_open(file, &modules_op); + + if (!err) { + struct seq_file *m = file->private_data; + + m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul; + } + + return err; +} + +static const struct proc_ops modules_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_open = modules_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +}; + +static int __init proc_modules_init(void) +{ + proc_create("modules", 0, NULL, &modules_proc_ops); + return 0; +} +module_init(proc_modules_init); diff --git a/kernel/module/signing.c b/kernel/module/signing.c new file mode 100644 index 000000000000..a2ff4242e623 --- /dev/null +++ b/kernel/module/signing.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/module_signature.h> +#include <linux/string.h> +#include <linux/verification.h> +#include <linux/security.h> +#include <crypto/public_key.h> +#include <uapi/linux/module.h> +#include "internal.h" + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "module." + +static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); +module_param(sig_enforce, bool_enable_only, 0644); + +/* + * Export sig_enforce kernel cmdline parameter to allow other subsystems rely + * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. + */ +bool is_module_sig_enforced(void) +{ + return sig_enforce; +} +EXPORT_SYMBOL(is_module_sig_enforced); + +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} + +/* + * Verify the signature on a module. + */ +int mod_verify_sig(const void *mod, struct load_info *info) +{ + struct module_signature ms; + size_t sig_len, modlen = info->len; + int ret; + + pr_devel("==>%s(,%zu)\n", __func__, modlen); + + if (modlen <= sizeof(ms)) + return -EBADMSG; + + memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); + + ret = mod_check_sig(&ms, modlen, "module"); + if (ret) + return ret; + + sig_len = be32_to_cpu(ms.sig_len); + modlen -= sig_len + sizeof(ms); + info->len = modlen; + + return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); +} + +int module_sig_check(struct load_info *info, int flags) +{ + int err = -ENODATA; + const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const char *reason; + const void *mod = info->hdr; + bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | + MODULE_INIT_IGNORE_VERMAGIC); + /* + * Do not allow mangled modules as a module with version information + * removed is no longer the module that was signed. + */ + if (!mangled_module && + info->len > markerlen && + memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + /* We truncate the module to discard the signature */ + info->len -= markerlen; + err = mod_verify_sig(mod, info); + if (!err) { + info->sig_ok = true; + return 0; + } + } + + /* + * We don't permit modules to be loaded into the trusted kernels + * without a valid signature on them, but if we're not enforcing, + * certain errors are non-fatal. + */ + switch (err) { + case -ENODATA: + reason = "unsigned module"; + break; + case -ENOPKG: + reason = "module with unsupported crypto"; + break; + case -ENOKEY: + reason = "module with unavailable key"; + break; + + default: + /* + * All other errors are fatal, including lack of memory, + * unparseable signatures, and signature check failures -- + * even if signatures aren't required. + */ + return err; + } + + if (is_module_sig_enforced()) { + pr_notice("Loading of %s is rejected\n", reason); + return -EKEYREJECTED; + } + + return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); +} diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c new file mode 100644 index 000000000000..14fbea66f12f --- /dev/null +++ b/kernel/module/strict_rwx.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module strict rwx + * + * Copyright (C) 2015 Rusty Russell + */ + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/set_memory.h> +#include "internal.h" + +/* + * LKM RO/NX protection: protect module's text/ro-data + * from modification and any data from execution. + * + * General layout of module is: + * [text] [read-only-data] [ro-after-init] [writable data] + * text_size -----^ ^ ^ ^ + * ro_size ------------------------| | | + * ro_after_init_size -----------------------------| | + * size -----------------------------------------------------------| + * + * These values are always page-aligned (as is base) when + * CONFIG_STRICT_MODULE_RWX is set. + */ + +/* + * Since some arches are moving towards PAGE_KERNEL module allocations instead + * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() independent of + * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we + * are strict. + */ +static void frob_text(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + set_memory((unsigned long)layout->base, + PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT); +} + +static void frob_rodata(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + set_memory((unsigned long)layout->base + layout->text_size, + (layout->ro_size - layout->text_size) >> PAGE_SHIFT); +} + +static void frob_ro_after_init(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); +} + +static void frob_writable_data(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + set_memory((unsigned long)layout->base + layout->ro_after_init_size, + (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); +} + +static bool layout_check_misalignment(const struct module_layout *layout) +{ + return WARN_ON(!PAGE_ALIGNED(layout->base)) || + WARN_ON(!PAGE_ALIGNED(layout->text_size)) || + WARN_ON(!PAGE_ALIGNED(layout->ro_size)) || + WARN_ON(!PAGE_ALIGNED(layout->ro_after_init_size)) || + WARN_ON(!PAGE_ALIGNED(layout->size)); +} + +bool module_check_misalignment(const struct module *mod) +{ + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return false; + + return layout_check_misalignment(&mod->core_layout) || + layout_check_misalignment(&mod->data_layout) || + layout_check_misalignment(&mod->init_layout); +} + +void module_enable_x(const struct module *mod) +{ + if (!PAGE_ALIGNED(mod->core_layout.base) || + !PAGE_ALIGNED(mod->init_layout.base)) + return; + + frob_text(&mod->core_layout, set_memory_x); + frob_text(&mod->init_layout, set_memory_x); +} + +void module_enable_ro(const struct module *mod, bool after_init) +{ + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return; +#ifdef CONFIG_STRICT_MODULE_RWX + if (!rodata_enabled) + return; +#endif + + set_vm_flush_reset_perms(mod->core_layout.base); + set_vm_flush_reset_perms(mod->init_layout.base); + frob_text(&mod->core_layout, set_memory_ro); + + frob_rodata(&mod->data_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_rodata(&mod->init_layout, set_memory_ro); + + if (after_init) + frob_ro_after_init(&mod->data_layout, set_memory_ro); +} + +void module_enable_nx(const struct module *mod) +{ + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return; + + frob_rodata(&mod->data_layout, set_memory_nx); + frob_ro_after_init(&mod->data_layout, set_memory_nx); + frob_writable_data(&mod->data_layout, set_memory_nx); + frob_rodata(&mod->init_layout, set_memory_nx); + frob_writable_data(&mod->init_layout, set_memory_nx); +} + +int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR; + int i; + + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + return 0; + + for (i = 0; i < hdr->e_shnum; i++) { + if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { + pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", + mod->name, secstrings + sechdrs[i].sh_name, i); + return -ENOEXEC; + } + } + + return 0; +} diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c new file mode 100644 index 000000000000..ce68f821dcd1 --- /dev/null +++ b/kernel/module/sysfs.c @@ -0,0 +1,436 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module sysfs support + * + * Copyright (C) 2008 Rusty Russell + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/kallsyms.h> +#include <linux/mutex.h> +#include "internal.h" + +/* + * /sys/module/foo/sections stuff + * J. Corbet <corbet@lwn.net> + */ +#ifdef CONFIG_KALLSYMS +struct module_sect_attr { + struct bin_attribute battr; + unsigned long address; +}; + +struct module_sect_attrs { + struct attribute_group grp; + unsigned int nsections; + struct module_sect_attr attrs[]; +}; + +#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) +static ssize_t module_sect_read(struct file *file, struct kobject *kobj, + struct bin_attribute *battr, + char *buf, loff_t pos, size_t count) +{ + struct module_sect_attr *sattr = + container_of(battr, struct module_sect_attr, battr); + char bounce[MODULE_SECT_READ_SIZE + 1]; + size_t wrote; + + if (pos != 0) + return -EINVAL; + + /* + * Since we're a binary read handler, we must account for the + * trailing NUL byte that sprintf will write: if "buf" is + * too small to hold the NUL, or the NUL is exactly the last + * byte, the read will look like it got truncated by one byte. + * Since there is no way to ask sprintf nicely to not write + * the NUL, we have to use a bounce buffer. + */ + wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", + kallsyms_show_value(file->f_cred) + ? (void *)sattr->address : NULL); + count = min(count, wrote); + memcpy(buf, bounce, count); + + return count; +} + +static void free_sect_attrs(struct module_sect_attrs *sect_attrs) +{ + unsigned int section; + + for (section = 0; section < sect_attrs->nsections; section++) + kfree(sect_attrs->attrs[section].battr.attr.name); + kfree(sect_attrs); +} + +static void add_sect_attrs(struct module *mod, const struct load_info *info) +{ + unsigned int nloaded = 0, i, size[2]; + struct module_sect_attrs *sect_attrs; + struct module_sect_attr *sattr; + struct bin_attribute **gattr; + + /* Count loaded sections and allocate structures */ + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i])) + nloaded++; + size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), + sizeof(sect_attrs->grp.bin_attrs[0])); + size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); + sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); + if (!sect_attrs) + return; + + /* Setup section attributes. */ + sect_attrs->grp.name = "sections"; + sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; + + sect_attrs->nsections = 0; + sattr = §_attrs->attrs[0]; + gattr = §_attrs->grp.bin_attrs[0]; + for (i = 0; i < info->hdr->e_shnum; i++) { + Elf_Shdr *sec = &info->sechdrs[i]; + + if (sect_empty(sec)) + continue; + sysfs_bin_attr_init(&sattr->battr); + sattr->address = sec->sh_addr; + sattr->battr.attr.name = + kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); + if (!sattr->battr.attr.name) + goto out; + sect_attrs->nsections++; + sattr->battr.read = module_sect_read; + sattr->battr.size = MODULE_SECT_READ_SIZE; + sattr->battr.attr.mode = 0400; + *(gattr++) = &(sattr++)->battr; + } + *gattr = NULL; + + if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) + goto out; + + mod->sect_attrs = sect_attrs; + return; +out: + free_sect_attrs(sect_attrs); +} + +static void remove_sect_attrs(struct module *mod) +{ + if (mod->sect_attrs) { + sysfs_remove_group(&mod->mkobj.kobj, + &mod->sect_attrs->grp); + /* + * We are positive that no one is using any sect attrs + * at this point. Deallocate immediately. + */ + free_sect_attrs(mod->sect_attrs); + mod->sect_attrs = NULL; + } +} + +/* + * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections. + */ + +struct module_notes_attrs { + struct kobject *dir; + unsigned int notes; + struct bin_attribute attrs[]; +}; + +static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + /* + * The caller checked the pos and count against our size. + */ + memcpy(buf, bin_attr->private + pos, count); + return count; +} + +static void free_notes_attrs(struct module_notes_attrs *notes_attrs, + unsigned int i) +{ + if (notes_attrs->dir) { + while (i-- > 0) + sysfs_remove_bin_file(notes_attrs->dir, + ¬es_attrs->attrs[i]); + kobject_put(notes_attrs->dir); + } + kfree(notes_attrs); +} + +static void add_notes_attrs(struct module *mod, const struct load_info *info) +{ + unsigned int notes, loaded, i; + struct module_notes_attrs *notes_attrs; + struct bin_attribute *nattr; + + /* failed to create section attributes, so can't create notes */ + if (!mod->sect_attrs) + return; + + /* Count notes sections and allocate structures. */ + notes = 0; + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i]) && + info->sechdrs[i].sh_type == SHT_NOTE) + ++notes; + + if (notes == 0) + return; + + notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), + GFP_KERNEL); + if (!notes_attrs) + return; + + notes_attrs->notes = notes; + nattr = ¬es_attrs->attrs[0]; + for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { + if (sect_empty(&info->sechdrs[i])) + continue; + if (info->sechdrs[i].sh_type == SHT_NOTE) { + sysfs_bin_attr_init(nattr); + nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; + nattr->attr.mode = 0444; + nattr->size = info->sechdrs[i].sh_size; + nattr->private = (void *)info->sechdrs[i].sh_addr; + nattr->read = module_notes_read; + ++nattr; + } + ++loaded; + } + + notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); + if (!notes_attrs->dir) + goto out; + + for (i = 0; i < notes; ++i) + if (sysfs_create_bin_file(notes_attrs->dir, + ¬es_attrs->attrs[i])) + goto out; + + mod->notes_attrs = notes_attrs; + return; + +out: + free_notes_attrs(notes_attrs, i); +} + +static void remove_notes_attrs(struct module *mod) +{ + if (mod->notes_attrs) + free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes); +} + +#else /* !CONFIG_KALLSYMS */ +static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { } +static inline void remove_sect_attrs(struct module *mod) { } +static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { } +static inline void remove_notes_attrs(struct module *mod) { } +#endif /* CONFIG_KALLSYMS */ + +static void del_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); + mutex_unlock(&module_mutex); +#endif +} + +static int add_usage_links(struct module *mod) +{ + int ret = 0; +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) { + ret = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + if (ret) + break; + } + mutex_unlock(&module_mutex); + if (ret) + del_usage_links(mod); +#endif + return ret; +} + +static void module_remove_modinfo_attrs(struct module *mod, int end) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { + if (end >= 0 && i > end) + break; + /* pick a field to test for end of list */ + if (!attr->attr.name) + break; + sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); + if (attr->free) + attr->free(mod); + } + kfree(mod->modinfo_attrs); +} + +static int module_add_modinfo_attrs(struct module *mod) +{ + struct module_attribute *attr; + struct module_attribute *temp_attr; + int error = 0; + int i; + + mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) * + (modinfo_attrs_count + 1)), + GFP_KERNEL); + if (!mod->modinfo_attrs) + return -ENOMEM; + + temp_attr = mod->modinfo_attrs; + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (!attr->test || attr->test(mod)) { + memcpy(temp_attr, attr, sizeof(*temp_attr)); + sysfs_attr_init(&temp_attr->attr); + error = sysfs_create_file(&mod->mkobj.kobj, + &temp_attr->attr); + if (error) + goto error_out; + ++temp_attr; + } + } + + return 0; + +error_out: + if (i > 0) + module_remove_modinfo_attrs(mod, --i); + else + kfree(mod->modinfo_attrs); + return error; +} + +static void mod_kobject_put(struct module *mod) +{ + DECLARE_COMPLETION_ONSTACK(c); + + mod->mkobj.kobj_completion = &c; + kobject_put(&mod->mkobj.kobj); + wait_for_completion(&c); +} + +static int mod_sysfs_init(struct module *mod) +{ + int err; + struct kobject *kobj; + + if (!module_sysfs_initialized) { + pr_err("%s: module sysfs not initialized\n", mod->name); + err = -EINVAL; + goto out; + } + + kobj = kset_find_obj(module_kset, mod->name); + if (kobj) { + pr_err("%s: module is already loaded\n", mod->name); + kobject_put(kobj); + err = -EINVAL; + goto out; + } + + mod->mkobj.mod = mod; + + memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); + mod->mkobj.kobj.kset = module_kset; + err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, + "%s", mod->name); + if (err) + mod_kobject_put(mod); + +out: + return err; +} + +int mod_sysfs_setup(struct module *mod, + const struct load_info *info, + struct kernel_param *kparam, + unsigned int num_params) +{ + int err; + + err = mod_sysfs_init(mod); + if (err) + goto out; + + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); + if (!mod->holders_dir) { + err = -ENOMEM; + goto out_unreg; + } + + err = module_param_sysfs_setup(mod, kparam, num_params); + if (err) + goto out_unreg_holders; + + err = module_add_modinfo_attrs(mod); + if (err) + goto out_unreg_param; + + err = add_usage_links(mod); + if (err) + goto out_unreg_modinfo_attrs; + + add_sect_attrs(mod, info); + add_notes_attrs(mod, info); + + return 0; + +out_unreg_modinfo_attrs: + module_remove_modinfo_attrs(mod, -1); +out_unreg_param: + module_param_sysfs_remove(mod); +out_unreg_holders: + kobject_put(mod->holders_dir); +out_unreg: + mod_kobject_put(mod); +out: + return err; +} + +static void mod_sysfs_fini(struct module *mod) +{ + remove_notes_attrs(mod); + remove_sect_attrs(mod); + mod_kobject_put(mod); +} + +void mod_sysfs_teardown(struct module *mod) +{ + del_usage_links(mod); + module_remove_modinfo_attrs(mod, -1); + module_param_sysfs_remove(mod); + kobject_put(mod->mkobj.drivers_dir); + kobject_put(mod->holders_dir); + mod_sysfs_fini(mod); +} + +void init_param_lock(struct module *mod) +{ + mutex_init(&mod->param_lock); +} diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c new file mode 100644 index 000000000000..7f8133044d09 --- /dev/null +++ b/kernel/module/tracking.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module taint unload tracking support + * + * Copyright (C) 2022 Aaron Tomlin + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include "internal.h" + +static LIST_HEAD(unloaded_tainted_modules); + +int try_add_tainted_module(struct module *mod) +{ + struct mod_unload_taint *mod_taint; + + module_assert_mutex_or_preempt(); + + list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list, + lockdep_is_held(&module_mutex)) { + if (!strcmp(mod_taint->name, mod->name) && + mod_taint->taints & mod->taints) { + mod_taint->count++; + goto out; + } + } + + mod_taint = kmalloc(sizeof(*mod_taint), GFP_KERNEL); + if (unlikely(!mod_taint)) + return -ENOMEM; + strscpy(mod_taint->name, mod->name, MODULE_NAME_LEN); + mod_taint->taints = mod->taints; + list_add_rcu(&mod_taint->list, &unloaded_tainted_modules); + mod_taint->count = 1; +out: + return 0; +} + +void print_unloaded_tainted_modules(void) +{ + struct mod_unload_taint *mod_taint; + char buf[MODULE_FLAGS_BUF_SIZE]; + + if (!list_empty(&unloaded_tainted_modules)) { + printk(KERN_DEFAULT "Unloaded tainted modules:"); + list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, + list) { + size_t l; + + l = module_flags_taint(mod_taint->taints, buf); + buf[l++] = '\0'; + pr_cont(" %s(%s):%llu", mod_taint->name, buf, + mod_taint->count); + } + } +} diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c new file mode 100644 index 000000000000..8ec5cfd60496 --- /dev/null +++ b/kernel/module/tree_lookup.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Modules tree lookup + * + * Copyright (C) 2015 Peter Zijlstra + * Copyright (C) 2015 Rusty Russell + */ + +#include <linux/module.h> +#include <linux/rbtree_latch.h> +#include "internal.h" + +/* + * Use a latched RB-tree for __module_address(); this allows us to use + * RCU-sched lookups of the address from any context. + * + * This is conditional on PERF_EVENTS || TRACING because those can really hit + * __module_address() hard by doing a lot of stack unwinding; potentially from + * NMI context. + */ + +static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) +{ + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); + + return (unsigned long)layout->base; +} + +static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) +{ + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); + + return (unsigned long)layout->size; +} + +static __always_inline bool +mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) +{ + return __mod_tree_val(a) < __mod_tree_val(b); +} + +static __always_inline int +mod_tree_comp(void *key, struct latch_tree_node *n) +{ + unsigned long val = (unsigned long)key; + unsigned long start, end; + + start = __mod_tree_val(n); + if (val < start) + return -1; + + end = start + __mod_tree_size(n); + if (val >= end) + return 1; + + return 0; +} + +static const struct latch_tree_ops mod_tree_ops = { + .less = mod_tree_less, + .comp = mod_tree_comp, +}; + +static noinline void __mod_tree_insert(struct mod_tree_node *node, struct mod_tree_root *tree) +{ + latch_tree_insert(&node->node, &tree->root, &mod_tree_ops); +} + +static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root *tree) +{ + latch_tree_erase(&node->node, &tree->root, &mod_tree_ops); +} + +/* + * These modifications: insert, remove_init and remove; are serialized by the + * module_mutex. + */ +void mod_tree_insert(struct module *mod) +{ + mod->core_layout.mtn.mod = mod; + mod->init_layout.mtn.mod = mod; + + __mod_tree_insert(&mod->core_layout.mtn, &mod_tree); + if (mod->init_layout.size) + __mod_tree_insert(&mod->init_layout.mtn, &mod_tree); + +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + mod->data_layout.mtn.mod = mod; + __mod_tree_insert(&mod->data_layout.mtn, &mod_data_tree); +#endif +} + +void mod_tree_remove_init(struct module *mod) +{ + if (mod->init_layout.size) + __mod_tree_remove(&mod->init_layout.mtn, &mod_tree); +} + +void mod_tree_remove(struct module *mod) +{ + __mod_tree_remove(&mod->core_layout.mtn, &mod_tree); + mod_tree_remove_init(mod); +#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC + __mod_tree_remove(&mod->data_layout.mtn, &mod_data_tree); +#endif +} + +struct module *mod_find(unsigned long addr, struct mod_tree_root *tree) +{ + struct latch_tree_node *ltn; + + ltn = latch_tree_find((void *)addr, &tree->root, &mod_tree_ops); + if (!ltn) + return NULL; + + return container_of(ltn, struct mod_tree_node, node)->mod; +} diff --git a/kernel/module/version.c b/kernel/module/version.c new file mode 100644 index 000000000000..53f43ac5a73e --- /dev/null +++ b/kernel/module/version.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Module version support + * + * Copyright (C) 2008 Rusty Russell + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/printk.h> +#include "internal.h" + +int check_version(const struct load_info *info, + const char *symname, + struct module *mod, + const s32 *crc) +{ + Elf_Shdr *sechdrs = info->sechdrs; + unsigned int versindex = info->index.vers; + unsigned int i, num_versions; + struct modversion_info *versions; + + /* Exporting module didn't supply crcs? OK, we're already tainted. */ + if (!crc) + return 1; + + /* No versions at all? modprobe --force does this. */ + if (versindex == 0) + return try_to_force_load(mod, symname) == 0; + + versions = (void *)sechdrs[versindex].sh_addr; + num_versions = sechdrs[versindex].sh_size + / sizeof(struct modversion_info); + + for (i = 0; i < num_versions; i++) { + u32 crcval; + + if (strcmp(versions[i].name, symname) != 0) + continue; + + crcval = *crc; + if (versions[i].crc == crcval) + return 1; + pr_debug("Found checksum %X vs module %lX\n", + crcval, versions[i].crc); + goto bad_version; + } + + /* Broken toolchain. Warn once, then let it go.. */ + pr_warn_once("%s: no symbol version for %s\n", info->name, symname); + return 1; + +bad_version: + pr_warn("%s: disagrees about version of symbol %s\n", info->name, symname); + return 0; +} + +int check_modstruct_version(const struct load_info *info, + struct module *mod) +{ + struct find_symbol_arg fsa = { + .name = "module_layout", + .gplok = true, + }; + + /* + * Since this should be found in kernel (which can't be removed), no + * locking is necessary -- use preempt_disable() to placate lockdep. + */ + preempt_disable(); + if (!find_symbol(&fsa)) { + preempt_enable(); + BUG(); + } + preempt_enable(); + return check_version(info, "module_layout", mod, fsa.crc); +} + +/* First part is kernel version, which we ignore if module has crcs. */ +int same_magic(const char *amagic, const char *bmagic, + bool has_crcs) +{ + if (has_crcs) { + amagic += strcspn(amagic, " "); + bmagic += strcspn(bmagic, " "); + } + return strcmp(amagic, bmagic) == 0; +} + +/* + * Generate the signature for all relevant module structures here. + * If these change, we don't want to try to parse the module. + */ +void module_layout(struct module *mod, + struct modversion_info *ver, + struct kernel_param *kp, + struct kernel_symbol *ks, + struct tracepoint * const *tp) +{ +} +EXPORT_SYMBOL(module_layout); diff --git a/kernel/module_signing.c b/kernel/module_signing.c deleted file mode 100644 index 8723ae70ea1f..000000000000 --- a/kernel/module_signing.c +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Module signature checker - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/module.h> -#include <linux/module_signature.h> -#include <linux/string.h> -#include <linux/verification.h> -#include <crypto/public_key.h> -#include "module-internal.h" - -/* - * Verify the signature on a module. - */ -int mod_verify_sig(const void *mod, struct load_info *info) -{ - struct module_signature ms; - size_t sig_len, modlen = info->len; - int ret; - - pr_devel("==>%s(,%zu)\n", __func__, modlen); - - if (modlen <= sizeof(ms)) - return -EBADMSG; - - memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); - - ret = mod_check_sig(&ms, modlen, "module"); - if (ret) - return ret; - - sig_len = be32_to_cpu(ms.sig_len); - modlen -= sig_len + sizeof(ms); - info->len = modlen; - - return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, - VERIFY_USE_SECONDARY_KEYRING, - VERIFYING_MODULE_SIGNATURE, - NULL, NULL); -} diff --git a/kernel/notifier.c b/kernel/notifier.c index ba005ebf4730..0d5bd62c480e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -20,7 +20,8 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); */ static int notifier_chain_register(struct notifier_block **nl, - struct notifier_block *n) + struct notifier_block *n, + bool unique_priority) { while ((*nl) != NULL) { if (unlikely((*nl) == n)) { @@ -30,6 +31,8 @@ static int notifier_chain_register(struct notifier_block **nl, } if (n->priority > (*nl)->priority) break; + if (n->priority == (*nl)->priority && unique_priority) + return -EBUSY; nl = &((*nl)->next); } n->next = *nl; @@ -144,13 +147,36 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, int ret; spin_lock_irqsave(&nh->lock, flags); - ret = notifier_chain_register(&nh->head, n); + ret = notifier_chain_register(&nh->head, n, false); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); /** + * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an atomic notifier chain if there is no other + * notifier registered using the same priority. + * + * Returns 0 on success, %-EEXIST or %-EBUSY on error. + */ +int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh, + struct notifier_block *n) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_register(&nh->head, n, true); + spin_unlock_irqrestore(&nh->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio); + +/** * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: Entry to remove from notifier chain @@ -204,23 +230,27 @@ int atomic_notifier_call_chain(struct atomic_notifier_head *nh, EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); +/** + * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty + * @nh: Pointer to head of the atomic notifier chain + * + * Checks whether notifier chain is empty. + * + * Returns true is notifier chain is empty, false otherwise. + */ +bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh) +{ + return !rcu_access_pointer(nh->head); +} + /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. */ -/** - * blocking_notifier_chain_register - Add notifier to a blocking notifier chain - * @nh: Pointer to head of the blocking notifier chain - * @n: New entry in notifier chain - * - * Adds a notifier to a blocking notifier chain. - * Must be called in process context. - * - * Returns 0 on success, %-EEXIST on error. - */ -int blocking_notifier_chain_register(struct blocking_notifier_head *nh, - struct notifier_block *n) +static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh, + struct notifier_block *n, + bool unique_priority) { int ret; @@ -230,16 +260,49 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); + return notifier_chain_register(&nh->head, n, unique_priority); down_write(&nh->rwsem); - ret = notifier_chain_register(&nh->head, n); + ret = notifier_chain_register(&nh->head, n, unique_priority); up_write(&nh->rwsem); return ret; } + +/** + * blocking_notifier_chain_register - Add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a blocking notifier chain. + * Must be called in process context. + * + * Returns 0 on success, %-EEXIST on error. + */ +int blocking_notifier_chain_register(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + return __blocking_notifier_chain_register(nh, n, false); +} EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** + * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an blocking notifier chain if there is no other + * notifier registered using the same priority. + * + * Returns 0 on success, %-EEXIST or %-EBUSY on error. + */ +int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + return __blocking_notifier_chain_register(nh, n, true); +} +EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio); + +/** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: Entry to remove from notifier chain @@ -341,7 +404,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) { - return notifier_chain_register(&nh->head, n); + return notifier_chain_register(&nh->head, n, false); } EXPORT_SYMBOL_GPL(raw_notifier_chain_register); @@ -420,10 +483,10 @@ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) - return notifier_chain_register(&nh->head, n); + return notifier_chain_register(&nh->head, n, false); mutex_lock(&nh->mutex); - ret = notifier_chain_register(&nh->head, n); + ret = notifier_chain_register(&nh->head, n, false); mutex_unlock(&nh->mutex); return ret; } diff --git a/kernel/panic.c b/kernel/panic.c index eb4dfb932c85..da323209f583 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -43,12 +43,14 @@ * Should we dump all CPUs backtraces in an oops event? * Defaults to 0, can be changed via sysctl. */ -unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +#else +#define sysctl_oops_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask = - IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; + IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -73,6 +75,28 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); +#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL) +static struct ctl_table kern_panic_table[] = { + { + .procname = "oops_all_cpu_backtrace", + .data = &sysctl_oops_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; + +static __init int kernel_panic_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_panic_table); + return 0; +} +late_initcall(kernel_panic_sysctls_init); +#endif + static long no_blink(int state) { return 0; @@ -305,9 +329,6 @@ void panic(const char *fmt, ...) if (_crash_kexec_post_notifiers) __crash_kexec(NULL); -#ifdef CONFIG_VT - unblank_screen(); -#endif console_unblank(); /* @@ -404,6 +425,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { [ TAINT_LIVEPATCH ] = { 'K', ' ', true }, [ TAINT_AUX ] = { 'X', ' ', true }, [ TAINT_RANDSTRUCT ] = { 'T', ' ', true }, + [ TAINT_TEST ] = { 'N', ' ', true }, }; /** diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a46a3723bc66..f4f8cb0435b4 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -52,7 +52,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) /* Name collision forces to do allocation under mutex. */ if (!*pkc) *pkc = kmem_cache_create(name, len, 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0); + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a12779650f15..60a1d3051cc7 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -143,6 +143,26 @@ config PM_AUTOSLEEP Allow the kernel to trigger a system transition into a global sleep state automatically whenever there are no active wakeup sources. +config PM_USERSPACE_AUTOSLEEP + bool "Userspace opportunistic sleep" + depends on PM_SLEEP + help + Notify kernel of aggressive userspace autosleep power management policy. + + This option changes the behavior of various sleep-sensitive code to deal + with frequent userspace-initiated transitions into a global sleep state. + + Saying Y here, disables code paths that most users really should keep + enabled. In particular, only enable this if it is very common to be + asleep/awake for very short periods of time (<= 2 seconds). + + Only platforms, such as Android, that implement opportunistic sleep from + a userspace power manager service should enable this option; and not + other machines. Therefore, you should say N here, unless you are + extremely certain that this is what you want. The option otherwise has + bad, undesirable effects, and should not be enabled just for fun. + + config PM_WAKELOCKS bool "User space wakeup sources interface" depends on PM_SLEEP diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 5899260a8bef..874ad834dc8d 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG +ifeq ($(CONFIG_DYNAMIC_DEBUG), y) +CFLAGS_swap.o := -DDEBUG +CFLAGS_snapshot.o := -DDEBUG +CFLAGS_energy_model.o := -DDEBUG +endif KASAN_SANITIZE_snapshot.o := n diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0153b0ca7b23..f82111837b8d 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -54,28 +54,15 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); -static int em_debug_units_show(struct seq_file *s, void *unused) +static int em_debug_flags_show(struct seq_file *s, void *unused) { struct em_perf_domain *pd = s->private; - char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ? - "milliWatts" : "bogoWatts"; - seq_printf(s, "%s\n", units); + seq_printf(s, "%#lx\n", pd->flags); return 0; } -DEFINE_SHOW_ATTRIBUTE(em_debug_units); - -static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused) -{ - struct em_perf_domain *pd = s->private; - int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0; - - seq_printf(s, "%d\n", enabled); - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies); +DEFINE_SHOW_ATTRIBUTE(em_debug_flags); static void em_debug_create_pd(struct device *dev) { @@ -89,9 +76,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, &em_debug_cpus_fops); - debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); - debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd, - &em_debug_skip_inefficiencies_fops); + debugfs_create_file("flags", 0444, d, dev->em_pd, + &em_debug_flags_fops); /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) @@ -121,7 +107,8 @@ static void em_debug_remove_pd(struct device *dev) {} #endif static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, - int nr_states, struct em_data_callback *cb) + int nr_states, struct em_data_callback *cb, + unsigned long flags) { unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; struct em_perf_state *table; @@ -139,7 +126,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, dev); + ret = cb->active_power(dev, &power, &freq); if (ret) { dev_err(dev, "EM: invalid perf. state: %d\n", ret); @@ -158,7 +145,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* * The power returned by active_state() is expected to be - * positive and to fit into 16 bits. + * positive and be in range. */ if (!power || power > EM_MAX_POWER) { dev_err(dev, "EM: invalid power: %lu\n", @@ -173,10 +160,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = nr_states - 1; i >= 0; i--) { - unsigned long power_res = em_scale_power(table[i].power); + unsigned long power_res, cost; + + if (flags & EM_PERF_DOMAIN_ARTIFICIAL) { + ret = cb->get_cost(dev, table[i].frequency, &cost); + if (ret || !cost || cost > EM_MAX_POWER) { + dev_err(dev, "EM: invalid cost %lu %d\n", + cost, ret); + goto free_ps_table; + } + } else { + power_res = table[i].power; + cost = div64_u64(fmax * power_res, table[i].frequency); + } + + table[i].cost = cost; - table[i].cost = div64_u64(fmax * power_res, - table[i].frequency); if (table[i].cost >= prev_cost) { table[i].flags = EM_PERF_STATE_INEFFICIENT; dev_dbg(dev, "EM: OPP:%lu is inefficient\n", @@ -197,13 +196,22 @@ free_ps_table: } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus) + struct em_data_callback *cb, cpumask_t *cpus, + unsigned long flags) { struct em_perf_domain *pd; struct device *cpu_dev; - int cpu, ret; + int cpu, ret, num_cpus; if (_is_cpu_device(dev)) { + num_cpus = cpumask_weight(cpus); + + /* Prevent max possible energy calculation to not overflow */ + if (num_cpus > EM_MAX_NUM_CPUS) { + dev_err(dev, "EM: too many CPUs, overflow possible\n"); + return -EINVAL; + } + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); if (!pd) return -ENOMEM; @@ -215,7 +223,7 @@ static int em_create_pd(struct device *dev, int nr_states, return -ENOMEM; } - ret = em_create_perf_table(dev, pd, nr_states, cb); + ret = em_create_perf_table(dev, pd, nr_states, cb, flags); if (ret) { kfree(pd); return ret; @@ -259,6 +267,8 @@ static void em_cpufreq_update_efficiencies(struct device *dev) found++; } + cpufreq_cpu_put(policy); + if (!found) return; @@ -312,13 +322,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * @cpus : Pointer to cpumask_t, which in case of a CPU device is * obligatory. It can be taken from i.e. 'policy->cpus'. For other * type of devices this should be set to NULL. - * @milliwatts : Flag indicating that the power values are in milliWatts or + * @microwatts : Flag indicating that the power values are in micro-Watts or * in some other scale. It must be set properly. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. * - * The @milliwatts is important to set with correct value. Some kernel + * The @microwatts is important to set with correct value. Some kernel * sub-systems might rely on this flag and check if all devices in the EM are * using the same scale. * @@ -329,9 +339,10 @@ EXPORT_SYMBOL_GPL(em_cpu_get); */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, struct em_data_callback *cb, cpumask_t *cpus, - bool milliwatts) + bool microwatts) { unsigned long cap, prev_cap = 0; + unsigned long flags = 0; int cpu, ret; if (!dev || !nr_states || !cb) @@ -378,12 +389,16 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, } } - ret = em_create_pd(dev, nr_states, cb, cpus); + if (microwatts) + flags |= EM_PERF_DOMAIN_MICROWATTS; + else if (cb->get_cost) + flags |= EM_PERF_DOMAIN_ARTIFICIAL; + + ret = em_create_pd(dev, nr_states, cb, cpus, flags); if (ret) goto unlock; - if (milliwatts) - dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; + dev->em_pd->flags |= flags; em_cpufreq_update_efficiencies(dev); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 938d5c78b421..89c71fce225d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -83,7 +83,7 @@ bool hibernation_available(void) { return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION) && - !secretmem_active(); + !secretmem_active() && !cxl_mem_active(); } /** @@ -665,7 +665,7 @@ static void power_down(void) hibernation_platform_enter(); fallthrough; case HIBERNATION_SHUTDOWN: - if (pm_power_off) + if (kernel_can_power_off()) kernel_power_off(); break; } diff --git a/kernel/power/main.c b/kernel/power/main.c index 7e646079fbeb..e3694034b753 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -127,7 +127,9 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *s = buf; suspend_state_t i; - for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { + if (i >= PM_SUSPEND_MEM && cxl_mem_active()) + continue; if (mem_sleep_states[i]) { const char *label = mem_sleep_states[i]; @@ -136,6 +138,7 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, else s += sprintf(s, "%s ", label); } + } /* Convert the last space to a newline if needed. */ if (s != buf) @@ -545,35 +548,6 @@ static int __init pm_debug_messages_setup(char *str) } __setup("pm_debug_messages", pm_debug_messages_setup); -/** - * __pm_pr_dbg - Print a suspend debug message to the kernel log. - * @defer: Whether or not to use printk_deferred() to print the message. - * @fmt: Message format. - * - * The message will be emitted if enabled through the pm_debug_messages - * sysfs attribute. - */ -void __pm_pr_dbg(bool defer, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - if (!pm_debug_messages_on) - return; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - if (defer) - printk_deferred(KERN_DEBUG "PM: %pV", &vaf); - else - printk(KERN_DEBUG "PM: %pV", &vaf); - - va_end(args); -} - #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ diff --git a/kernel/power/process.c b/kernel/power/process.c index 11b570fcf049..3068601e585a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -6,9 +6,6 @@ * Originally from swsusp. */ - -#undef DEBUG - #include <linux/interrupt.h> #include <linux/oom.h> #include <linux/suspend.h> diff --git a/kernel/power/qos.c b/kernel/power/qos.c index ec7e1e85923e..af51ed6d45ef 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -531,7 +531,7 @@ int freq_qos_add_request(struct freq_constraints *qos, { int ret; - if (IS_ERR_OR_NULL(qos) || !req) + if (IS_ERR_OR_NULL(qos) || !req || value < 0) return -EINVAL; if (WARN(freq_qos_request_active(req), @@ -563,7 +563,7 @@ EXPORT_SYMBOL_GPL(freq_qos_add_request); */ int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) { - if (!req) + if (!req || new_value < 0) return -EINVAL; if (WARN(!freq_qos_request_active(req), diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 330d49937692..2a406753af90 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -326,7 +326,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -/** +/* * Data types related to memory bitmaps. * * Memory bitmap is a structure consisting of many linked lists of @@ -427,6 +427,10 @@ struct memory_bitmap { /** * alloc_rtree_node - Allocate a new node and add it to the radix tree. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages not used before hibernation (restore only) + * @ca: Pointer to a linked list of pages ("a chain") to allocate from + * @list: Radix Tree node to add. * * This function is used to allocate inner nodes as well as the * leave nodes of the radix tree. It also adds the node to the @@ -902,7 +906,7 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /** - * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap. + * memory_bm_next_pfn - Find the next set bit in a memory bitmap. * @bm: Memory bitmap. * * Starting from the last returned position this function searches for the next @@ -1937,7 +1941,7 @@ static inline int get_highmem_buffer(int safe_needed) } /** - * alloc_highmem_image_pages - Allocate some highmem pages for the image. + * alloc_highmem_pages - Allocate some highmem pages for the image. * * Try to allocate as many pages as needed, but if the number of free highmem * pages is less than that, allocate them all. @@ -2224,7 +2228,7 @@ static int check_header(struct swsusp_info *info) } /** - * load header - Check the image header and copy the data from it. + * load_header - Check the image header and copy the data from it. */ static int load_header(struct swsusp_info *info) { diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6fcdee7e87a5..c6272d466e58 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -136,6 +136,9 @@ static void s2idle_loop(void) break; } + if (s2idle_ops && s2idle_ops->check) + s2idle_ops->check(); + s2idle_enter(); } @@ -236,7 +239,8 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem); static bool sleep_state_supported(suspend_state_t state) { - return state == PM_SUSPEND_TO_IDLE || valid_state(state); + return state == PM_SUSPEND_TO_IDLE || + (valid_state(state) && !cxl_mem_active()); } static int platform_suspend_prepare(suspend_state_t state) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 91fffdd2c7fb..277434b6c0bf 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -269,15 +269,14 @@ static void hib_end_io(struct bio *bio) bio_put(bio); } -static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, - struct hib_bio_batch *hb) +static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, + struct hib_bio_batch *hb) { struct page *page = virt_to_page(addr); struct bio *bio; int error = 0; - bio = bio_alloc(hib_resume_bdev, 1, op | op_flags, - GFP_NOIO | __GFP_HIGH); + bio = bio_alloc(hib_resume_bdev, 1, opf, GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { @@ -317,8 +316,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, - swsusp_header, NULL); + hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -331,7 +329,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, + error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { pr_err("Swap header not found!\n"); @@ -408,7 +406,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) } else { src = buf; } - return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb); + return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -1003,7 +1001,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL); + error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ -1027,7 +1025,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb); + error = hib_submit_io(REQ_OP_READ, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1526,8 +1524,7 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_submit_io(REQ_OP_READ, 0, - swsusp_resume_block, + error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -1535,7 +1532,7 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, + error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { @@ -1586,11 +1583,11 @@ int swsusp_unmark(void) { int error; - hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, - swsusp_header, NULL); + hib_submit_io(REQ_OP_READ, swsusp_resume_block, + swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, + error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { diff --git a/kernel/power/user.c b/kernel/power/user.c index ad241b4ff64c..d43c2aa583b2 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -26,6 +26,7 @@ #include "power.h" +static bool need_wait; static struct snapshot_data { struct snapshot_handle handle; @@ -78,7 +79,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) * Resuming. We may need to wait for the image device to * appear. */ - wait_for_device_probe(); + need_wait = true; data->swap = -1; data->mode = O_WRONLY; @@ -168,6 +169,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, ssize_t res; loff_t pg_offp = *offp & ~PAGE_MASK; + if (need_wait) { + wait_for_device_probe(); + need_wait = false; + } + lock_system_sleep(); data = filp->private_data; @@ -244,6 +250,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, loff_t size; sector_t offset; + if (need_wait) { + wait_for_device_probe(); + need_wait = false; + } + if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) return -ENOTTY; if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index da03c15ecc89..a1a81fd9889b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -281,11 +281,6 @@ static bool panic_in_progress(void) static int console_locked, console_suspended; /* - * If exclusive_console is non-NULL then only this console is to be printed to. - */ -static struct console *exclusive_console; - -/* * Array of consoles built from command line options (console=) */ @@ -374,12 +369,6 @@ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; -/* All 3 protected by @console_sem. */ -/* the next printk record to write to the console */ -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; - struct latched_seq { seqcount_latch_t latch; u64 val[2]; @@ -405,6 +394,9 @@ static struct latched_seq clear_seq = { /* the maximum size of a formatted record (i.e. with prefix added per line) */ #define CONSOLE_LOG_MAX 1024 +/* the maximum size for a dropped text message */ +#define DROPPED_TEXT_MAX 64 + /* the maximum size allowed to be reserved for a record */ #define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) @@ -746,8 +738,19 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, goto out; } + /* + * Guarantee this task is visible on the waitqueue before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * prepare_to_wait_event() pairs with the full memory barrier + * within wq_has_sleeper(). + * + * This pairs with __wake_up_klogd:A. + */ ret = wait_event_interruptible(log_wait, - prb_read_valid(prb, atomic64_read(&user->seq), r)); + prb_read_valid(prb, + atomic64_read(&user->seq), r)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } @@ -1513,7 +1516,18 @@ static int syslog_print(char __user *buf, int size) seq = syslog_seq; mutex_unlock(&syslog_lock); - len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); + /* + * Guarantee this task is visible on the waitqueue before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * prepare_to_wait_event() pairs with the full memory barrier + * within wq_has_sleeper(). + * + * This pairs with __wake_up_klogd:A. + */ + len = wait_event_interruptible(log_wait, + prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) @@ -1911,47 +1925,24 @@ static int console_trylock_spinning(void) } /* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_lock must be held. + * Call the specified console driver, asking it to write out the specified + * text and length. If @dropped_text is non-NULL and any records have been + * dropped, a dropped message will be written out first. */ -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) +static void call_console_driver(struct console *con, const char *text, size_t len, + char *dropped_text) { - static char dropped_text[64]; - size_t dropped_len = 0; - struct console *con; + size_t dropped_len; - trace_console_rcuidle(text, len); - - if (!console_drivers) - return; - - if (console_dropped) { - dropped_len = snprintf(dropped_text, sizeof(dropped_text), + if (con->dropped && dropped_text) { + dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX, "** %lu printk messages dropped **\n", - console_dropped); - console_dropped = 0; + con->dropped); + con->dropped = 0; + con->write(con, dropped_text, dropped_len); } - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if (!(con->flags & CON_ENABLED)) - continue; - if (!con->write) - continue; - if (!cpu_online(smp_processor_id()) && - !(con->flags & CON_ANYTIME)) - continue; - if (con->flags & CON_EXTENDED) - con->write(con, ext_text, ext_len); - else { - if (dropped_len) - con->write(con, dropped_text, dropped_len); - con->write(con, text, len); - } - } + con->write(con, text, len); } /* @@ -2026,8 +2017,10 @@ static u8 *__printk_recursion_counter(void) int printk_delay_msec __read_mostly; -static inline void printk_delay(void) +static inline void printk_delay(int level) { + boot_delay_msec(level); + if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; @@ -2041,7 +2034,7 @@ static inline void printk_delay(void) static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : - 0x80000000 + raw_smp_processor_id(); + 0x80000000 + smp_processor_id(); } /** @@ -2115,6 +2108,8 @@ static u16 printk_sprint(char *text, u16 size, int facility, } } + trace_console_rcuidle(text, text_len); + return text_len; } @@ -2123,7 +2118,6 @@ int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { - const u32 caller_id = printk_caller_id(); struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; @@ -2133,10 +2127,14 @@ int vprintk_store(int facility, int level, u8 *recursion_ptr; u16 reserve_size; va_list args2; + u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; + if (!printk_enter_irqsave(recursion_ptr, irqflags)) + return 0; + /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is @@ -2145,8 +2143,7 @@ int vprintk_store(int facility, int level, */ ts_nsec = local_clock(); - if (!printk_enter_irqsave(recursion_ptr, irqflags)) - return 0; + caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be @@ -2250,23 +2247,25 @@ asmlinkage int vprintk_emit(int facility, int level, in_sched = true; } - boot_delay_msec(level); - printk_delay(); + printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console + * The caller may be holding system-critical or + * timing-sensitive locks. Disable preemption during + * printing of all remaining records to all consoles so that + * this context can return as soon as possible. Hopefully + * another printk() caller will take over the printing. */ preempt_disable(); /* * Try to acquire and then immediately release the console - * semaphore. The release will print out buffers and wake up - * /dev/kmsg and syslog() users. + * semaphore. The release will print out buffers. With the + * spinning variant, this context tries to take over the + * printing from another printing context. */ if (console_trylock_spinning()) console_unlock(); @@ -2297,18 +2296,19 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); + #else /* CONFIG_PRINTK */ #define CONSOLE_LOG_MAX 0 +#define DROPPED_TEXT_MAX 0 #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 +#define prb_next_seq(rb) 0 static u64 syslog_seq; -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; static size_t record_print_text(const struct printk_record *r, bool syslog, bool time) @@ -2325,9 +2325,12 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) {} +static void call_console_driver(struct console *con, const char *text, size_t len, + char *dropped_text) +{ +} static bool suppress_message_printing(int level) { return false; } +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ @@ -2515,6 +2518,7 @@ void suspend_console(void) if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); + pr_flush(1000, true); console_lock(); console_suspended = 1; up_console_sem(); @@ -2527,6 +2531,7 @@ void resume_console(void) down_console_sem(); console_suspended = 0; console_unlock(); + pr_flush(1000, true); } /** @@ -2597,22 +2602,6 @@ int is_console_locked(void) EXPORT_SYMBOL(is_console_locked); /* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. - */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if ((con->flags & CON_ENABLED) && - (con->flags & CON_ANYTIME)) - return 1; - - return 0; -} - -/* * Return true when this CPU should unlock console_sem without pushing all * messages to the console. This reduces the chance that the console is * locked when the panic CPU tries to use it. @@ -2632,15 +2621,201 @@ static bool abandon_console_lock_in_panic(void) } /* - * Can we actually use the console at this time on this cpu? + * Check if the given console is currently capable and allowed to print + * records. * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. + * Requires the console_lock. */ -static inline int can_use_console(void) +static inline bool console_is_usable(struct console *con) +{ + if (!(con->flags & CON_ENABLED)) + return false; + + if (!con->write) + return false; + + /* + * Console drivers may assume that per-cpu resources have been + * allocated. So unless they're explicitly marked as being able to + * cope (CON_ANYTIME) don't call them until this CPU is officially up. + */ + if (!cpu_online(raw_smp_processor_id()) && + !(con->flags & CON_ANYTIME)) + return false; + + return true; +} + +static void __console_unlock(void) { - return cpu_online(raw_smp_processor_id()) || have_callable_console(); + console_locked = 0; + up_console_sem(); +} + +/* + * Print one record for the given console. The record printed is whatever + * record is the next available record for the given console. + * + * @text is a buffer of size CONSOLE_LOG_MAX. + * + * If extended messages should be printed, @ext_text is a buffer of size + * CONSOLE_EXT_LOG_MAX. Otherwise @ext_text must be NULL. + * + * If dropped messages should be printed, @dropped_text is a buffer of size + * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL. + * + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. Otherwise it is set to false. + * + * Returns false if the given console has no next record to print, otherwise + * true. + * + * Requires the console_lock. + */ +static bool console_emit_next_record(struct console *con, char *text, char *ext_text, + char *dropped_text, bool *handover) +{ + static int panic_console_dropped; + struct printk_info info; + struct printk_record r; + unsigned long flags; + char *write_text; + size_t len; + + prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + + *handover = false; + + if (!prb_read_valid(prb, con->seq, &r)) + return false; + + if (con->seq != r.info->seq) { + con->dropped += r.info->seq - con->seq; + con->seq = r.info->seq; + if (panic_in_progress() && panic_console_dropped++ > 10) { + suppress_panic_printk = 1; + pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); + } + } + + /* Skip record that has level above the console loglevel. */ + if (suppress_message_printing(r.info->level)) { + con->seq++; + goto skip; + } + + if (ext_text) { + write_text = ext_text; + len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info); + len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len, + &r.text_buf[0], r.info->text_len, &r.info->dev_info); + } else { + write_text = text; + len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); + } + + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); + + stop_critical_timings(); /* don't trace print latency */ + call_console_driver(con, write_text, len, dropped_text); + start_critical_timings(); + + con->seq++; + + *handover = console_lock_spinning_disable_and_check(); + printk_safe_exit_irqrestore(flags); +skip: + return true; +} + +/* + * Print out all remaining records to all consoles. + * + * @do_cond_resched is set by the caller. It can be true only in schedulable + * context. + * + * @next_seq is set to the sequence number after the last available record. + * The value is valid only when this function returns true. It means that all + * usable consoles are completely flushed. + * + * @handover will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding the + * console_lock. Otherwise it is set to false. + * + * Returns true when there was at least one usable console and all messages + * were flushed to all usable consoles. A returned false informs the caller + * that everything was not flushed (either there were no usable consoles or + * another context has taken over printing or it is a panic situation and this + * is not the panic CPU). Regardless the reason, the caller should assume it + * is not useful to immediately try again. + * + * Requires the console_lock. + */ +static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) +{ + static char dropped_text[DROPPED_TEXT_MAX]; + static char ext_text[CONSOLE_EXT_LOG_MAX]; + static char text[CONSOLE_LOG_MAX]; + bool any_usable = false; + struct console *con; + bool any_progress; + + *next_seq = 0; + *handover = false; + + do { + any_progress = false; + + for_each_console(con) { + bool progress; + + if (!console_is_usable(con)) + continue; + any_usable = true; + + if (con->flags & CON_EXTENDED) { + /* Extended consoles do not print "dropped messages". */ + progress = console_emit_next_record(con, &text[0], + &ext_text[0], NULL, + handover); + } else { + progress = console_emit_next_record(con, &text[0], + NULL, &dropped_text[0], + handover); + } + if (*handover) + return false; + + /* Track the next of the highest seq flushed. */ + if (con->seq > *next_seq) + *next_seq = con->seq; + + if (!progress) + continue; + any_progress = true; + + /* Allow panic_cpu to take over the consoles safely. */ + if (abandon_console_lock_in_panic()) + return false; + + if (do_cond_resched) + cond_resched(); + } + } while (any_progress); + + return any_usable; } /** @@ -2653,28 +2828,20 @@ static inline int can_use_console(void) * by printk(). If this is the case, console_unlock(); emits * the output prior to releasing the lock. * - * If there is output waiting, we wake /dev/kmsg and syslog() users. - * * console_unlock(); may be called from any context. */ void console_unlock(void) { - static char ext_text[CONSOLE_EXT_LOG_MAX]; - static char text[CONSOLE_LOG_MAX]; - static int panic_console_dropped; - unsigned long flags; - bool do_cond_resched, retry; - struct printk_info info; - struct printk_record r; - u64 __maybe_unused next_seq; + bool do_cond_resched; + bool handover; + bool flushed; + u64 next_seq; if (console_suspended) { up_console_sem(); return; } - prb_rec_init_rd(&r, &info, text, sizeof(text)); - /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @@ -2683,125 +2850,34 @@ void console_unlock(void) * between lines if allowable. Not doing so can cause a very long * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more - * messages practically incapacitating the system. - * - * console_trylock() is not able to detect the preemptive - * context reliably. Therefore the value must be stored before - * and cleared after the "again" goto label. + * messages practically incapacitating the system. Therefore, create + * a local to use for the printing loop. */ do_cond_resched = console_may_schedule; -again: - console_may_schedule = 0; - - /* - * We released the console_sem lock, so we need to recheck if - * cpu is online and (if not) is there at least one CON_ANYTIME - * console. - */ - if (!can_use_console()) { - console_locked = 0; - up_console_sem(); - return; - } - - for (;;) { - size_t ext_len = 0; - int handover; - size_t len; - -skip: - if (!prb_read_valid(prb, console_seq, &r)) - break; - if (console_seq != r.info->seq) { - console_dropped += r.info->seq - console_seq; - console_seq = r.info->seq; - if (panic_in_progress() && panic_console_dropped++ > 10) { - suppress_panic_printk = 1; - pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); - } - } - - if (suppress_message_printing(r.info->level)) { - /* - * Skip record we have buffered and already printed - * directly to the console when we received it, and - * record that has level above the console loglevel. - */ - console_seq++; - goto skip; - } + do { + console_may_schedule = 0; - /* Output to all consoles once old messages replayed. */ - if (unlikely(exclusive_console && - console_seq >= exclusive_console_stop_seq)) { - exclusive_console = NULL; - } + flushed = console_flush_all(do_cond_resched, &next_seq, &handover); + if (!handover) + __console_unlock(); /* - * Handle extended console text first because later - * record_print_text() will modify the record buffer in-place. + * Abort if there was a failure to flush all messages to all + * usable consoles. Either it is not possible to flush (in + * which case it would be an infinite loop of retrying) or + * another context has taken over printing. */ - if (nr_ext_console_drivers) { - ext_len = info_print_ext_header(ext_text, - sizeof(ext_text), - r.info); - ext_len += msg_print_ext_body(ext_text + ext_len, - sizeof(ext_text) - ext_len, - &r.text_buf[0], - r.info->text_len, - &r.info->dev_info); - } - len = record_print_text(&r, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time); - console_seq++; + if (!flushed) + break; /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). + * Some context may have added new records after + * console_flush_all() but before unlocking the console. + * Re-check if there is a new record to flush. If the trylock + * fails, another context is already handling the printing. */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); - - stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(ext_text, ext_len, text, len); - start_critical_timings(); - - handover = console_lock_spinning_disable_and_check(); - printk_safe_exit_irqrestore(flags); - if (handover) - return; - - /* Allow panic_cpu to take over the consoles safely */ - if (abandon_console_lock_in_panic()) - break; - - if (do_cond_resched) - cond_resched(); - } - - /* Get consistent value of the next-to-be-used sequence number. */ - next_seq = console_seq; - - console_locked = 0; - up_console_sem(); - - /* - * Someone could have filled up the buffer again, so re-check if there's - * something to flush. In case we cannot trylock the console_sem again, - * there's a new owner and the console_unlock() from them will do the - * flush, no worries. - */ - retry = prb_read_valid(prb, next_seq, NULL); - if (retry && !abandon_console_lock_in_panic() && console_trylock()) - goto again; + } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } EXPORT_SYMBOL(console_unlock); @@ -2841,6 +2917,9 @@ void console_unblank(void) if ((c->flags & CON_ENABLED) && c->unblank) c->unblank(); console_unlock(); + + if (!oops_in_progress) + pr_flush(1000, true); } /** @@ -2861,8 +2940,14 @@ void console_flush_on_panic(enum con_flush_mode mode) console_trylock(); console_may_schedule = 0; - if (mode == CONSOLE_REPLAY_ALL) - console_seq = prb_first_valid_seq(prb); + if (mode == CONSOLE_REPLAY_ALL) { + struct console *c; + u64 seq; + + seq = prb_first_valid_seq(prb); + for_each_console(c) + c->seq = seq; + } console_unlock(); } @@ -2893,6 +2978,7 @@ struct tty_driver *console_device(int *index) */ void console_stop(struct console *console) { + __pr_flush(console, 1000, true); console_lock(); console->flags &= ~CON_ENABLED; console_unlock(); @@ -2904,6 +2990,7 @@ void console_start(struct console *console) console_lock(); console->flags |= CON_ENABLED; console_unlock(); + __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); @@ -2990,6 +3077,11 @@ static void try_enable_default_console(struct console *newcon) newcon->flags |= CON_CONSDEV; } +#define con_printk(lvl, con, fmt, ...) \ + printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \ + (con->flags & CON_BOOT) ? "boot" : "", \ + con->name, con->index, ##__VA_ARGS__) + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -3097,26 +3189,15 @@ void register_console(struct console *newcon) if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; + newcon->dropped = 0; if (newcon->flags & CON_PRINTBUFFER) { - /* - * console_unlock(); will print out the buffered messages - * for us. - * - * We're about to replay the log buffer. Only do this to the - * just-registered console to avoid excessive message spam to - * the already-registered consoles. - * - * Set exclusive_console with disabled interrupts to reduce - * race window with eventual console_flush_on_panic() that - * ignores console_lock. - */ - exclusive_console = newcon; - exclusive_console_stop_seq = console_seq; - /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); - console_seq = syslog_seq; + newcon->seq = syslog_seq; mutex_unlock(&syslog_lock); + } else { + /* Begin with next message. */ + newcon->seq = prb_next_seq(prb); } console_unlock(); console_sysfs_notify(); @@ -3128,9 +3209,7 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - pr_info("%sconsole [%s%d] enabled\n", - (newcon->flags & CON_BOOT) ? "boot" : "" , - newcon->name, newcon->index); + con_printk(KERN_INFO, newcon, "enabled\n"); if (bootcon_enabled && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { @@ -3149,9 +3228,7 @@ int unregister_console(struct console *console) struct console *con; int res; - pr_info("%sconsole [%s%d] disabled\n", - (console->flags & CON_BOOT) ? "boot" : "" , - console->name, console->index); + con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); if (res < 0) @@ -3285,6 +3362,88 @@ static int __init printk_late_init(void) late_initcall(printk_late_init); #if defined CONFIG_PRINTK +/* If @con is specified, only wait for that console. Otherwise wait for all. */ +static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) +{ + int remaining = timeout_ms; + struct console *c; + u64 last_diff = 0; + u64 printk_seq; + u64 diff; + u64 seq; + + might_sleep(); + + seq = prb_next_seq(prb); + + for (;;) { + diff = 0; + + console_lock(); + + for_each_console(c) { + if (con && con != c) + continue; + if (!console_is_usable(c)) + continue; + printk_seq = c->seq; + if (printk_seq < seq) + diff += seq - printk_seq; + } + + /* + * If consoles are suspended, it cannot be expected that they + * make forward progress, so timeout immediately. @diff is + * still used to return a valid flush status. + */ + if (console_suspended) + remaining = 0; + else if (diff != last_diff && reset_on_progress) + remaining = timeout_ms; + + console_unlock(); + + if (diff == 0 || remaining == 0) + break; + + if (remaining < 0) { + /* no timeout limit */ + msleep(100); + } else if (remaining < 100) { + msleep(remaining); + remaining = 0; + } else { + msleep(100); + remaining -= 100; + } + + last_diff = diff; + } + + return (diff == 0); +} + +/** + * pr_flush() - Wait for printing threads to catch up. + * + * @timeout_ms: The maximum time (in ms) to wait. + * @reset_on_progress: Reset the timeout if forward progress is seen. + * + * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 + * represents infinite waiting. + * + * If @reset_on_progress is true, the timeout will be reset whenever any + * printer has been seen to make some forward progress. + * + * Context: Process context. May sleep while acquiring console lock. + * Return: true if all enabled printers are caught up. + */ +bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return __pr_flush(NULL, timeout_ms, reset_on_progress); +} +EXPORT_SYMBOL(pr_flush); + /* * Delayed printk version, for scheduler-internal messages: */ @@ -3310,28 +3469,43 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); -void wake_up_klogd(void) +static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; preempt_disable(); - if (waitqueue_active(&log_wait)) { - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + /* + * Guarantee any new records can be seen by tasks preparing to wait + * before this context checks if the wait queue is empty. + * + * The full memory barrier within wq_has_sleeper() pairs with the full + * memory barrier within set_current_state() of + * prepare_to_wait_event(), which is called after ___wait_event() adds + * the waiter but before it has checked the wait condition. + * + * This pairs with devkmsg_read:A and syslog_print:A. + */ + if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ + (val & PRINTK_PENDING_OUTPUT)) { + this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } -void defer_console_output(void) +void wake_up_klogd(void) { - if (!printk_percpu_data_ready()) - return; + __wake_up_klogd(PRINTK_PENDING_WAKEUP); +} - preempt_disable(); - this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); - irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); - preempt_enable(); +void defer_console_output(void) +{ + /* + * New messages may have been added directly to the ringbuffer + * using vprintk_store(), so wake any waiters as well. + */ + __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) @@ -3667,26 +3841,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif #ifdef CONFIG_SMP -static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); -static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); +static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); +static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); /** - * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant - * spinning lock is not owned by any CPU. + * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant + * spinning lock is not owned by any CPU. * * Context: Any context. */ -void __printk_wait_on_cpu_lock(void) +void __printk_cpu_sync_wait(void) { do { cpu_relax(); - } while (atomic_read(&printk_cpulock_owner) != -1); + } while (atomic_read(&printk_cpu_sync_owner) != -1); } -EXPORT_SYMBOL(__printk_wait_on_cpu_lock); +EXPORT_SYMBOL(__printk_cpu_sync_wait); /** - * __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant - * spinning lock. + * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant + * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the @@ -3695,7 +3869,7 @@ EXPORT_SYMBOL(__printk_wait_on_cpu_lock); * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ -int __printk_cpu_trylock(void) +int __printk_cpu_sync_try_get(void) { int cpu; int old; @@ -3705,79 +3879,80 @@ int __printk_cpu_trylock(void) /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with - * __printk_cpu_unlock:B. + * __printk_cpu_sync_put:B. * * Memory barrier involvement: * - * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then - * __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B. + * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, + * then __printk_cpu_sync_put:A can never read from + * __printk_cpu_sync_try_get:B. * * Relies on: * - * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching - * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B - * of this CPU + * ACQUIRE from __printk_cpu_sync_try_get:A to + * __printk_cpu_sync_try_get:B of this CPU */ - old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1, - cpu); /* LMM(__printk_cpu_trylock:A) */ + old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, + cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing - * data: LMM(__printk_cpu_trylock:B) + * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ - atomic_inc(&printk_cpulock_nested); + atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } -EXPORT_SYMBOL(__printk_cpu_trylock); +EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** - * __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock. + * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ -void __printk_cpu_unlock(void) +void __printk_cpu_sync_put(void) { - if (atomic_read(&printk_cpulock_nested)) { - atomic_dec(&printk_cpulock_nested); + if (atomic_read(&printk_cpu_sync_nested)) { + atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: - * LMM(__printk_cpu_unlock:A) + * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs - * with __printk_cpu_trylock:A. + * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * - * If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, - * then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A. + * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, + * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * - * RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B + * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching - * ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B - * of the next CPU + * ACQUIRE from __printk_cpu_sync_try_get:A to + * __printk_cpu_sync_try_get:B of the next CPU */ - atomic_set_release(&printk_cpulock_owner, - -1); /* LMM(__printk_cpu_unlock:B) */ + atomic_set_release(&printk_cpu_sync_owner, + -1); /* LMM(__printk_cpu_sync_put:B) */ } -EXPORT_SYMBOL(__printk_cpu_unlock); +EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */ diff --git a/kernel/profile.c b/kernel/profile.c index 37640a0bd8a3..7ea01ba30e75 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -109,6 +109,13 @@ int __ref profile_init(void) /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; + + if (!prof_len) { + pr_warn("profiling shift: %u too large\n", prof_shift); + prof_on = 0; + return -EINVAL; + } + buffer_bytes = prof_len*sizeof(atomic_t); if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) @@ -418,6 +425,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) return read; } +/* default is to not implement this call */ +int __weak setup_profiling_timer(unsigned mult) +{ + return -EINVAL; +} + /* * Writing to /proc/profile resets the counters * @@ -428,8 +441,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { #ifdef CONFIG_SMP - extern int setup_profiling_timer(unsigned int multiplier); - if (count == sizeof(int)) { unsigned int multiplier; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ccc4b465775b..1893d909e45c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -185,7 +185,12 @@ static bool looks_like_a_spurious_pid(struct task_struct *task) return true; } -/* Ensure that nothing can wake it up, even SIGKILL */ +/* + * Ensure that nothing can wake it up, even SIGKILL + * + * A task is switched to this state while a ptrace operation is in progress; + * such that the ptrace operation is uninterruptible. + */ static bool ptrace_freeze_traced(struct task_struct *task) { bool ret = false; @@ -197,7 +202,7 @@ static bool ptrace_freeze_traced(struct task_struct *task) spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) { - WRITE_ONCE(task->__state, __TASK_TRACED); + task->jobctl |= JOBCTL_PTRACE_FROZEN; ret = true; } spin_unlock_irq(&task->sighand->siglock); @@ -207,23 +212,21 @@ static bool ptrace_freeze_traced(struct task_struct *task) static void ptrace_unfreeze_traced(struct task_struct *task) { - if (READ_ONCE(task->__state) != __TASK_TRACED) - return; - - WARN_ON(!task->ptrace || task->parent != current); + unsigned long flags; /* - * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely. - * Recheck state under the lock to close this race. + * The child may be awake and may have cleared + * JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will + * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew. */ - spin_lock_irq(&task->sighand->siglock); - if (READ_ONCE(task->__state) == __TASK_TRACED) { - if (__fatal_signal_pending(task)) + if (lock_task_sighand(task, &flags)) { + task->jobctl &= ~JOBCTL_PTRACE_FROZEN; + if (__fatal_signal_pending(task)) { + task->jobctl &= ~JOBCTL_TRACED; wake_up_state(task, __TASK_TRACED); - else - WRITE_ONCE(task->__state, TASK_TRACED); + } + unlock_task_sighand(task, &flags); } - spin_unlock_irq(&task->sighand->siglock); } /** @@ -256,7 +259,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) */ read_lock(&tasklist_lock); if (child->ptrace && child->parent == current) { - WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED); /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). @@ -266,17 +268,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) } read_unlock(&tasklist_lock); - if (!ret && !ignore_state) { - if (!wait_task_inactive(child, __TASK_TRACED)) { - /* - * This can only happen if may_ptrace_stop() fails and - * ptrace_stop() changes ->state back to TASK_RUNNING, - * so we should not worry about leaking __TASK_TRACED. - */ - WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED); - ret = -ESRCH; - } - } + if (!ret && !ignore_state && + WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED))) + ret = -ESRCH; return ret; } @@ -475,8 +469,10 @@ static int ptrace_attach(struct task_struct *task, long request, * in and out of STOPPED are protected by siglock. */ if (task_is_stopped(task) && - task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) + task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) { + task->jobctl &= ~JOBCTL_STOPPED; signal_wake_up_state(task, __TASK_STOPPED); + } spin_unlock(&task->sighand->siglock); @@ -829,11 +825,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, } #endif -#ifdef PTRACE_SINGLESTEP #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) -#else -#define is_singlestep(request) 0 -#endif #ifdef PTRACE_SINGLEBLOCK #define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) @@ -850,8 +842,6 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, static int ptrace_resume(struct task_struct *child, long request, unsigned long data) { - bool need_siglock; - if (!valid_signal(data)) return -EIO; @@ -887,18 +877,12 @@ static int ptrace_resume(struct task_struct *child, long request, * Note that we need siglock even if ->exit_code == data and/or this * status was not reported yet, the new status must not be cleared by * wait_task_stopped() after resume. - * - * If data == 0 we do not care if wait_task_stopped() reports the old - * status and clears the code too; this can't race with the tracee, it - * takes siglock after resume. */ - need_siglock = data && !thread_group_empty(current); - if (need_siglock) - spin_lock_irq(&child->sighand->siglock); + spin_lock_irq(&child->sighand->siglock); child->exit_code = data; + child->jobctl &= ~JOBCTL_TRACED; wake_up_state(child, __TASK_TRACED); - if (need_siglock) - spin_unlock_irq(&child->sighand->siglock); + spin_unlock_irq(&child->sighand->siglock); return 0; } @@ -1221,9 +1205,7 @@ int ptrace_request(struct task_struct *child, long request, } #endif -#ifdef PTRACE_SINGLESTEP case PTRACE_SINGLESTEP: -#endif #ifdef PTRACE_SINGLEBLOCK case PTRACE_SINGLEBLOCK: #endif @@ -1236,9 +1218,8 @@ int ptrace_request(struct task_struct *child, long request, return ptrace_resume(child, request, data); case PTRACE_KILL: - if (child->exit_state) /* already dead */ - return 0; - return ptrace_resume(child, request, SIGKILL); + send_sig_info(SIGKILL, SEND_SIG_NOINFO, child); + return 0; #ifdef CONFIG_HAVE_ARCH_TRACEHOOK case PTRACE_GETREGSET: @@ -1285,10 +1266,6 @@ int ptrace_request(struct task_struct *child, long request, return ret; } -#ifndef arch_ptrace_attach -#define arch_ptrace_attach(child) do { } while (0) -#endif - SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, unsigned long, data) { @@ -1297,8 +1274,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, if (request == PTRACE_TRACEME) { ret = ptrace_traceme(); - if (!ret) - arch_ptrace_attach(current); goto out; } @@ -1310,12 +1285,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { ret = ptrace_attach(child, request, addr, data); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. - */ - if (!ret) - arch_ptrace_attach(child); goto out_put_task_struct; } @@ -1455,12 +1424,6 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { ret = ptrace_attach(child, request, addr, data); - /* - * Some architectures need to do book-keeping after - * a ptrace attach. - */ - if (!ret) - arch_ptrace_attach(child); goto out_put_task_struct; } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bf8e341e75b4..d471d22a5e21 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -8,6 +8,8 @@ menu "RCU Subsystem" config TREE_RCU bool default y if SMP + # Dynticks-idle tracking + select CONTEXT_TRACKING_IDLE help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -77,31 +79,56 @@ config TASKS_RCU_GENERIC This option enables generic infrastructure code supporting task-based RCU implementations. Not for manual selection. +config FORCE_TASKS_RCU + bool "Force selection of TASKS_RCU" + depends on RCU_EXPERT + select TASKS_RCU + default n + help + This option force-enables a task-based RCU implementation + that uses only voluntary context switch (not preemption!), + idle, and user-mode execution as quiescent states. Not for + manual selection in most cases. + config TASKS_RCU - def_bool PREEMPTION + bool + default n + select IRQ_WORK + +config FORCE_TASKS_RUDE_RCU + bool "Force selection of Tasks Rude RCU" + depends on RCU_EXPERT + select TASKS_RUDE_RCU + default n help - This option enables a task-based RCU implementation that uses - only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. Not for manual selection. + This option force-enables a task-based RCU implementation + that uses only context switch (including preemption) and + user-mode execution as quiescent states. It forces IPIs and + context switches on all online CPUs, including idle ones, + so use with caution. Not for manual selection in most cases. config TASKS_RUDE_RCU - def_bool 0 + bool + default n + select IRQ_WORK + +config FORCE_TASKS_TRACE_RCU + bool "Force selection of Tasks Trace RCU" + depends on RCU_EXPERT + select TASKS_TRACE_RCU + default n help This option enables a task-based RCU implementation that uses - only context switch (including preemption) and user-mode - execution as quiescent states. It forces IPIs and context - switches on all online CPUs, including idle ones, so use - with caution. + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the + CPU hotplug code paths. It can force IPIs on online CPUs, + including idle ones, so use with caution. Not for manual + selection in most cases. config TASKS_TRACE_RCU - def_bool 0 + bool + default n select IRQ_WORK - help - This option enables a task-based RCU implementation that uses - explicit rcu_read_lock_trace() read-side markers, and allows - these readers to appear in the idle loop as well as on the CPU - hotplug code paths. It can force IPIs on online CPUs, including - idle ones, so use with caution. config RCU_STALL_COMMON def_bool TREE_RCU @@ -195,6 +222,20 @@ config RCU_BOOST_DELAY Accept the default if unsure. +config RCU_EXP_KTHREAD + bool "Perform RCU expedited work in a real-time kthread" + depends on RCU_BOOST && RCU_EXPERT + default !PREEMPT_RT && NR_CPUS <= 32 + help + Use this option to further reduce the latencies of expedited + grace periods at the expense of being more disruptive. + + This option is disabled by default on PREEMPT_RT=y kernels which + disable expedited grace periods after boot by unconditionally + setting rcupdate.rcu_normal_after_boot=1. + + Accept the default if unsure. + config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" depends on TREE_RCU @@ -223,9 +264,38 @@ config RCU_NOCB_CPU Say Y here if you need reduced OS jitter, despite added overhead. Say N here if you are unsure. +config RCU_NOCB_CPU_DEFAULT_ALL + bool "Offload RCU callback processing from all CPUs by default" + depends on RCU_NOCB_CPU + default n + help + Use this option to offload callback processing from all CPUs + by default, in the absence of the rcu_nocbs or nohz_full boot + parameter. This also avoids the need to use any boot parameters + to achieve the effect of offloading all CPUs on boot. + + Say Y here if you want offload all CPUs by default on boot. + Say N here if you are unsure. + +config RCU_NOCB_CPU_CB_BOOST + bool "Offload RCU callback from real-time kthread" + depends on RCU_NOCB_CPU && RCU_BOOST + default y if PREEMPT_RT + help + Use this option to invoke offloaded callbacks as SCHED_FIFO + to avoid starvation by heavy SCHED_OTHER background load. + Of course, running as SCHED_FIFO during callback floods will + cause the rcuo[ps] kthreads to monopolize the CPU for hundreds + of milliseconds or more. Therefore, when enabling this option, + it is your responsibility to ensure that latency-sensitive + tasks either run with higher priority or run on some other CPU. + + Say Y here if you want to set RT priority for offloading kthreads. + Say N here if you are building a !PREEMPT_RT kernel and are unsure. + config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" - depends on RCU_EXPERT + depends on RCU_EXPERT && TASKS_TRACE_RCU default PREEMPT_RT || NR_CPUS < 8 help Use this option to further reduce the number of IPIs sent diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 4fd64999300f..1b0c41d490f0 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -28,9 +28,6 @@ config RCU_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance @@ -47,9 +44,6 @@ config RCU_TORTURE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs torture tests @@ -66,9 +60,6 @@ config RCU_REF_SCALE_TEST depends on DEBUG_KERNEL select TORTURE_TEST select SRCU - select TASKS_RCU - select TASKS_RUDE_RCU - select TASKS_TRACE_RCU default n help This option provides a kernel module that runs performance tests @@ -91,6 +82,19 @@ config RCU_CPU_STALL_TIMEOUT RCU grace period persists, additional CPU stall warnings are printed at more widely spaced intervals. +config RCU_EXP_CPU_STALL_TIMEOUT + int "Expedited RCU CPU stall timeout in milliseconds" + depends on RCU_STALL_COMMON + range 0 21000 + default 0 + help + If a given expedited RCU grace period extends more than the + specified number of milliseconds, a CPU stall warning is printed. + If the RCU grace period persists, additional CPU stall warnings + are printed at more widely spaced intervals. A value of zero + says to use the RCU_CPU_STALL_TIMEOUT value converted from + seconds to milliseconds. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL @@ -116,7 +120,7 @@ config RCU_EQS_DEBUG config RCU_STRICT_GRACE_PERIOD bool "Provide debug RCU implementation with short grace periods" - depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 + depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU default n select PREEMPT_COUNT if PREEMPT=n help diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 24b5f2c2de87..be5979da07f5 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -12,10 +12,6 @@ #include <trace/events/rcu.h> -/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ -#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) - - /* * Grace-period counter management. */ @@ -23,6 +19,11 @@ #define RCU_SEQ_CTR_SHIFT 2 #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) +/* Low-order bit definition for polled grace-period APIs. */ +#define RCU_GET_STATE_COMPLETED 0x1 + +extern int sysctl_sched_rt_runtime; + /* * Return the counter portion of a sequence number previously returned * by rcu_seq_snap() or rcu_seq_current(). @@ -118,6 +119,18 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) } /* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred, but do not allow the + * (ULONG_MAX / 2) safety-factor/guard-band. + */ +static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) +{ + unsigned long cur_s = READ_ONCE(*sp); + + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); +} + +/* * Has a grace period completed since the time the old gp_seq was collected? */ static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new) @@ -210,7 +223,9 @@ static inline bool rcu_stall_is_suppressed_at_boot(void) extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; +extern int rcu_exp_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); +int rcu_exp_jiffies_till_stall_check(void); static inline bool rcu_stall_is_suppressed(void) { @@ -523,6 +538,8 @@ static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { ret static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } +static inline void rcu_gp_slow_register(atomic_t *rgssp) { } +static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); @@ -534,14 +551,19 @@ int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +#ifdef CONFIG_RCU_EXP_KTHREAD +extern struct kthread_worker *rcu_exp_gp_kworker; +extern struct kthread_worker *rcu_exp_par_gp_kworker; +#else /* !CONFIG_RCU_EXP_KTHREAD */ extern struct workqueue_struct *rcu_par_gp_wq; +#endif /* CONFIG_RCU_EXP_KTHREAD */ +void rcu_gp_slow_register(atomic_t *rgssp); +void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU -bool rcu_is_nocb_cpu(int cpu); void rcu_bind_current_to_nocb(void); #else -static inline bool rcu_is_nocb_cpu(int cpu) { return false; } static inline void rcu_bind_current_to_nocb(void) { } #endif diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 81145c3ece25..c54ea2b6a36b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -505,10 +505,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]); /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. + * Callbacks moved, so there might be an empty RCU_WAIT_TAIL + * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the + * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap + * created by the now-ready-to-invoke segments. */ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 5e4f1f83d38e..3ef02d4a8108 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -268,6 +268,8 @@ static struct rcu_scale_ops srcud_ops = { .name = "srcud" }; +#ifdef CONFIG_TASKS_RCU + /* * Definitions for RCU-tasks scalability testing. */ @@ -295,6 +297,16 @@ static struct rcu_scale_ops tasks_ops = { .name = "tasks" }; +#define TASKS_OPS &tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for RCU-tasks-trace scalability testing. */ @@ -324,6 +336,14 @@ static struct rcu_scale_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -399,6 +419,7 @@ rcu_scale_writer(void *arg) VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + current->flags |= PF_NO_SETAFFINITY; sched_set_fifo_low(current); if (holdoff) @@ -797,7 +818,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS }; if (!torture_init_begin(scale_type, verbose)) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 55d049c39608..503c2aa845a4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -75,62 +75,52 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@ torture_param(int, extendables, RCUTORTURE_MAX_EXTEND, "Extend readers by disabling bh (1), irqs (2), or preempt (4)"); -torture_param(int, fqs_duration, 0, - "Duration of fqs bursts (us), 0 to disable"); +torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); -torture_param(int, fwd_progress, 1, "Test grace-period forward progress"); +torture_param(int, fwd_progress, 1, "Number of grace-period forward progress tasks (0 to disable)"); torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait"); -torture_param(int, fwd_progress_holdoff, 60, - "Time between forward-progress tests (s)"); -torture_param(bool, fwd_progress_need_resched, 1, - "Hide cond_resched() behind need_resched()"); +torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)"); +torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()"); torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); +torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives"); +torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives"); +torture_param(bool, gp_cond_exp_full, false, + "Use conditional/async full-stateexpedited GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); -torture_param(bool, gp_normal, false, - "Use normal (non-expedited) GP wait primitives"); +torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); +torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); +torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives"); +torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); -torture_param(int, n_barrier_cbs, 0, - "# of callbacks/kthreads for barrier testing"); +torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); -torture_param(int, object_debug, 0, - "Enable debug-object double call_rcu() testing"); +torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); -torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); -torture_param(int, read_exit_delay, 13, - "Delay between read-then-exit episodes (s)"); -torture_param(int, read_exit_burst, 16, - "# of read-then-exit bursts per episode, zero to disable"); +torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); +torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); -torture_param(int, stall_cpu_holdoff, 10, - "Time to wait before starting stall (s)."); -torture_param(bool, stall_no_softlockup, false, - "Avoid softlockup warning during cpu stall."); +torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); +torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); -torture_param(int, stall_gp_kthread, 0, - "Grace-period kthread stall duration (s)."); -torture_param(int, stat_interval, 60, - "Number of seconds between stats printk()s"); +torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s)."); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -torture_param(int, test_boost_duration, 4, - "Duration of each boost test, seconds."); -torture_param(int, test_boost_interval, 7, - "Interval between boost tests, seconds."); -torture_param(bool, test_no_idle_hz, true, - "Test support for tickless idle CPUs"); -torture_param(int, verbose, 1, - "Enable verbose debugging printk()s"); +torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); +torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); +torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); @@ -209,12 +199,24 @@ static int rcu_torture_writer_state; #define RTWS_DEF_FREE 3 #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 -#define RTWS_COND_SYNC 6 -#define RTWS_POLL_GET 7 -#define RTWS_POLL_WAIT 8 -#define RTWS_SYNC 9 -#define RTWS_STUTTER 10 -#define RTWS_STOPPING 11 +#define RTWS_COND_GET_FULL 6 +#define RTWS_COND_GET_EXP 7 +#define RTWS_COND_GET_EXP_FULL 8 +#define RTWS_COND_SYNC 9 +#define RTWS_COND_SYNC_FULL 10 +#define RTWS_COND_SYNC_EXP 11 +#define RTWS_COND_SYNC_EXP_FULL 12 +#define RTWS_POLL_GET 13 +#define RTWS_POLL_GET_FULL 14 +#define RTWS_POLL_GET_EXP 15 +#define RTWS_POLL_GET_EXP_FULL 16 +#define RTWS_POLL_WAIT 17 +#define RTWS_POLL_WAIT_FULL 18 +#define RTWS_POLL_WAIT_EXP 19 +#define RTWS_POLL_WAIT_EXP_FULL 20 +#define RTWS_SYNC 21 +#define RTWS_STUTTER 22 +#define RTWS_STOPPING 23 static const char * const rcu_torture_writer_state_names[] = { "RTWS_FIXED_DELAY", "RTWS_DELAY", @@ -222,9 +224,21 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_DEF_FREE", "RTWS_EXP_SYNC", "RTWS_COND_GET", + "RTWS_COND_GET_FULL", + "RTWS_COND_GET_EXP", + "RTWS_COND_GET_EXP_FULL", "RTWS_COND_SYNC", + "RTWS_COND_SYNC_FULL", + "RTWS_COND_SYNC_EXP", + "RTWS_COND_SYNC_EXP_FULL", "RTWS_POLL_GET", + "RTWS_POLL_GET_FULL", + "RTWS_POLL_GET_EXP", + "RTWS_POLL_GET_EXP_FULL", "RTWS_POLL_WAIT", + "RTWS_POLL_WAIT_FULL", + "RTWS_POLL_WAIT_EXP", + "RTWS_POLL_WAIT_EXP_FULL", "RTWS_SYNC", "RTWS_STUTTER", "RTWS_STOPPING", @@ -337,10 +351,23 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + unsigned long (*get_gp_state_exp)(void); + unsigned long (*start_gp_poll_exp)(void); + void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp); + bool (*poll_gp_state_exp)(unsigned long oldstate); + void (*cond_sync_exp)(unsigned long oldstate); + void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*get_gp_state)(void); + void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp); + unsigned long (*get_gp_completed)(void); + void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*start_gp_poll)(void); + void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp); bool (*poll_gp_state)(unsigned long oldstate); + bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp); + bool (*poll_need_2gp)(bool poll, bool poll_full); void (*cond_sync)(unsigned long oldstate); + void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp); call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); @@ -491,6 +518,11 @@ static void rcu_sync_torture_init(void) INIT_LIST_HEAD(&rcu_torture_removed); } +static bool rcu_poll_need_2gp(bool poll, bool poll_full) +{ + return poll; +} + static struct rcu_torture_ops rcu_ops = { .ttype = RCU_FLAVOR, .init = rcu_sync_torture_init, @@ -504,9 +536,21 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .get_gp_state = get_state_synchronize_rcu, + .get_gp_state_full = get_state_synchronize_rcu_full, + .get_gp_completed = get_completed_synchronize_rcu, + .get_gp_completed_full = get_completed_synchronize_rcu_full, .start_gp_poll = start_poll_synchronize_rcu, + .start_gp_poll_full = start_poll_synchronize_rcu_full, .poll_gp_state = poll_state_synchronize_rcu, + .poll_gp_state_full = poll_state_synchronize_rcu_full, + .poll_need_2gp = rcu_poll_need_2gp, .cond_sync = cond_synchronize_rcu, + .cond_sync_full = cond_synchronize_rcu_full, + .get_gp_state_exp = get_state_synchronize_rcu, + .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, + .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, + .poll_gp_state_exp = poll_state_synchronize_rcu, + .cond_sync_exp = cond_synchronize_rcu_expedited, .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, @@ -706,6 +750,9 @@ static struct rcu_torture_ops srcud_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .get_gp_state = srcu_torture_get_gp_state, + .start_gp_poll = srcu_torture_start_gp_poll, + .poll_gp_state = srcu_torture_poll_gp_state, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -738,6 +785,50 @@ static struct rcu_torture_ops busted_srcud_ops = { }; /* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not necessarily work well with CPU hotplug. + */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .readlock_held = torture_readlock_not_held, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "trivial" +}; + +#ifdef CONFIG_TASKS_RCU + +/* * Definitions for RCU-tasks torture testing. */ @@ -780,47 +871,16 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -/* - * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. - * This implementation does not necessarily work well with CPU hotplug. - */ +#define TASKS_OPS &tasks_ops, -static void synchronize_rcu_trivial(void) -{ - int cpu; +#else // #ifdef CONFIG_TASKS_RCU - for_each_online_cpu(cpu) { - rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); - WARN_ON_ONCE(raw_smp_processor_id() != cpu); - } -} +#define TASKS_OPS -static int rcu_torture_read_lock_trivial(void) __acquires(RCU) -{ - preempt_disable(); - return 0; -} +#endif // #else #ifdef CONFIG_TASKS_RCU -static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) -{ - preempt_enable(); -} -static struct rcu_torture_ops trivial_ops = { - .ttype = RCU_TRIVIAL_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock_trivial, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_torture_read_unlock_trivial, - .readlock_held = torture_readlock_not_held, - .get_gp_seq = rcu_no_completed, - .sync = synchronize_rcu_trivial, - .exp_sync = synchronize_rcu_trivial, - .fqs = NULL, - .stats = NULL, - .irq_capable = 1, - .name = "trivial" -}; +#ifdef CONFIG_TASKS_RUDE_RCU /* * Definitions for rude RCU-tasks torture testing. @@ -851,6 +911,17 @@ static struct rcu_torture_ops tasks_rude_ops = { .name = "tasks-rude" }; +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU + + +#ifdef CONFIG_TASKS_TRACE_RCU + /* * Definitions for tracing RCU-tasks torture testing. */ @@ -893,6 +964,15 @@ static struct rcu_torture_ops tasks_tracing_ops = { .name = "tasks-tracing" }; +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU + + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -1103,9 +1183,8 @@ rcu_torture_fqs(void *arg) return 0; } -// Used by writers to randomly choose from the available grace-period -// primitives. The only purpose of the initialization is to size the array. -static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; +// Used by writers to randomly choose from the available grace-period primitives. +static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { }; static int nsynctypes; /* @@ -1113,18 +1192,60 @@ static int nsynctypes; */ static void rcu_torture_write_types(void) { - bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; - bool gp_poll1 = gp_poll, gp_sync1 = gp_sync; + bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full; + bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp; + bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll; + bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync; /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1) - gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true; + if (!gp_cond1 && + !gp_cond_exp1 && + !gp_cond_full1 && + !gp_cond_exp_full1 && + !gp_exp1 && + !gp_poll_exp1 && + !gp_poll_exp_full1 && + !gp_normal1 && + !gp_poll1 && + !gp_poll_full1 && + !gp_sync1) { + gp_cond1 = true; + gp_cond_exp1 = true; + gp_cond_full1 = true; + gp_cond_exp_full1 = true; + gp_exp1 = true; + gp_poll_exp1 = true; + gp_poll_exp_full1 = true; + gp_normal1 = true; + gp_poll1 = true; + gp_poll_full1 = true; + gp_sync1 = true; + } if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) { pr_alert("%s: gp_cond without primitives.\n", __func__); } + if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) { + synctype[nsynctypes++] = RTWS_COND_GET_EXP; + pr_info("%s: Testing conditional expedited GPs.\n", __func__); + } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) { + pr_alert("%s: gp_cond_exp without primitives.\n", __func__); + } + if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) { + synctype[nsynctypes++] = RTWS_COND_GET_FULL; + pr_info("%s: Testing conditional full-state GPs.\n", __func__); + } else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) { + pr_alert("%s: gp_cond_full without primitives.\n", __func__); + } + if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) { + synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL; + pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__); + } else if (gp_cond_exp_full && + (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) { + pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__); + } if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; pr_info("%s: Testing expedited GPs.\n", __func__); @@ -1143,6 +1264,25 @@ static void rcu_torture_write_types(void) } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { pr_alert("%s: gp_poll without primitives.\n", __func__); } + if (gp_poll_full1 && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) { + synctype[nsynctypes++] = RTWS_POLL_GET_FULL; + pr_info("%s: Testing polling full-state GPs.\n", __func__); + } else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) { + pr_alert("%s: gp_poll_full without primitives.\n", __func__); + } + if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) { + synctype[nsynctypes++] = RTWS_POLL_GET_EXP; + pr_info("%s: Testing polling expedited GPs.\n", __func__); + } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) { + pr_alert("%s: gp_poll_exp without primitives.\n", __func__); + } + if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) { + synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL; + pr_info("%s: Testing polling full-state expedited GPs.\n", __func__); + } else if (gp_poll_exp_full && + (!cur_ops->start_gp_poll_exp_full || !cur_ops->poll_gp_state_full)) { + pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__); + } if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; pr_info("%s: Testing normal GPs.\n", __func__); @@ -1152,6 +1292,40 @@ static void rcu_torture_write_types(void) } /* + * Do the specified rcu_torture_writer() synchronous grace period, + * while also testing out the polled APIs. Note well that the single-CPU + * grace-period optimizations must be accounted for. + */ +static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void)) +{ + unsigned long cookie; + struct rcu_gp_oldstate cookie_full; + bool dopoll; + bool dopoll_full; + unsigned long r = torture_random(trsp); + + dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300); + dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00); + if (dopoll || dopoll_full) + cpus_read_lock(); + if (dopoll) + cookie = cur_ops->get_gp_state(); + if (dopoll_full) + cur_ops->get_gp_state_full(&cookie_full); + if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full)) + sync(); + sync(); + WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie), + "%s: Cookie check 3 failed %pS() online %*pbl.", + __func__, sync, cpumask_pr_args(cpu_online_mask)); + WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full), + "%s: Cookie check 4 failed %pS() online %*pbl", + __func__, sync, cpumask_pr_args(cpu_online_mask)); + if (dopoll || dopoll_full) + cpus_read_unlock(); +} + +/* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure * after a series of grace periods (the "pipeline"). @@ -1162,8 +1336,10 @@ rcu_torture_writer(void *arg) bool boot_ended; bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); unsigned long cookie; + struct rcu_gp_oldstate cookie_full; int expediting = 0; unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap_full; int i; int idx; int oldnice = task_nice(current); @@ -1178,7 +1354,7 @@ rcu_torture_writer(void *arg) " GP expediting controlled from boot/sysfs for %s.\n", torture_type, cur_ops->name); if (WARN_ONCE(nsynctypes == 0, - "rcu_torture_writer: No update-side primitives.\n")) { + "%s: No update-side primitives.\n", __func__)) { /* * No updates primitives, so don't try updating. * The resulting test won't be testing much, hence the @@ -1186,6 +1362,7 @@ rcu_torture_writer(void *arg) */ rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); + return 0; } do { @@ -1210,16 +1387,36 @@ rcu_torture_writer(void *arg) atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(old_rp->rtort_pipe_count, old_rp->rtort_pipe_count + 1); + + // Make sure readers block polled grace periods. if (cur_ops->get_gp_state && cur_ops->poll_gp_state) { idx = cur_ops->readlock(); cookie = cur_ops->get_gp_state(); - WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && - cur_ops->poll_gp_state(cookie), + WARN_ONCE(cur_ops->poll_gp_state(cookie), "%s: Cookie check 1 failed %s(%d) %lu->%lu\n", __func__, rcu_torture_writer_state_getname(), rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); + if (cur_ops->get_gp_completed) { + cookie = cur_ops->get_gp_completed(); + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); + } + cur_ops->readunlock(idx); + } + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) { + idx = cur_ops->readlock(); + cur_ops->get_gp_state_full(&cookie_full); + WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), + "%s: Cookie check 5 failed %s(%d) online %*pbl\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cpumask_pr_args(cpu_online_mask)); + if (cur_ops->get_gp_completed_full) { + cur_ops->get_gp_completed_full(&cookie_full); + WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full)); + } cur_ops->readunlock(idx); } switch (synctype[torture_random(&rand) % nsynctypes]) { @@ -1229,7 +1426,7 @@ rcu_torture_writer(void *arg) break; case RTWS_EXP_SYNC: rcu_torture_writer_state = RTWS_EXP_SYNC; - cur_ops->exp_sync(); + do_rtws_sync(&rand, cur_ops->exp_sync); rcu_torture_pipe_update(old_rp); break; case RTWS_COND_GET: @@ -1240,6 +1437,30 @@ rcu_torture_writer(void *arg) cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_COND_GET_EXP: + rcu_torture_writer_state = RTWS_COND_GET_EXP; + gp_snap = cur_ops->get_gp_state_exp(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_EXP; + cur_ops->cond_sync_exp(gp_snap); + rcu_torture_pipe_update(old_rp); + break; + case RTWS_COND_GET_FULL: + rcu_torture_writer_state = RTWS_COND_GET_FULL; + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_FULL; + cur_ops->cond_sync_full(&gp_snap_full); + rcu_torture_pipe_update(old_rp); + break; + case RTWS_COND_GET_EXP_FULL: + rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL; + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL; + cur_ops->cond_sync_exp_full(&gp_snap_full); + rcu_torture_pipe_update(old_rp); + break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; gp_snap = cur_ops->start_gp_poll(); @@ -1249,9 +1470,36 @@ rcu_torture_writer(void *arg) &rand); rcu_torture_pipe_update(old_rp); break; + case RTWS_POLL_GET_FULL: + rcu_torture_writer_state = RTWS_POLL_GET_FULL; + cur_ops->start_gp_poll_full(&gp_snap_full); + rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + rcu_torture_pipe_update(old_rp); + break; + case RTWS_POLL_GET_EXP: + rcu_torture_writer_state = RTWS_POLL_GET_EXP; + gp_snap = cur_ops->start_gp_poll_exp(); + rcu_torture_writer_state = RTWS_POLL_WAIT_EXP; + while (!cur_ops->poll_gp_state_exp(gp_snap)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + rcu_torture_pipe_update(old_rp); + break; + case RTWS_POLL_GET_EXP_FULL: + rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL; + cur_ops->start_gp_poll_exp_full(&gp_snap_full); + rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL; + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + rcu_torture_pipe_update(old_rp); + break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; - cur_ops->sync(); + do_rtws_sync(&rand, cur_ops->sync); rcu_torture_pipe_update(old_rp); break; default: @@ -1287,8 +1535,9 @@ rcu_torture_writer(void *arg) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { - rcu_ftrace_dump(DUMP_ALL); + tracing_off(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); + rcu_ftrace_dump(DUMP_ALL); } if (stutter_waited) sched_set_normal(current, oldnice); @@ -1317,11 +1566,23 @@ static int rcu_torture_fakewriter(void *arg) { unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap_full; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, MAX_NICE); + if (WARN_ONCE(nsynctypes == 0, + "%s: No update-side primitives.\n", __func__)) { + /* + * No updates primitives, so don't try updating. + * The resulting test won't be testing much, hence the + * above WARN_ONCE(). + */ + torture_kthread_stopping("rcu_torture_fakewriter"); + return 0; + } + do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && @@ -1339,6 +1600,21 @@ rcu_torture_fakewriter(void *arg) torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); cur_ops->cond_sync(gp_snap); break; + case RTWS_COND_GET_EXP: + gp_snap = cur_ops->get_gp_state_exp(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_exp(gp_snap); + break; + case RTWS_COND_GET_FULL: + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_full(&gp_snap_full); + break; + case RTWS_COND_GET_EXP_FULL: + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_exp_full(&gp_snap_full); + break; case RTWS_POLL_GET: gp_snap = cur_ops->start_gp_poll(); while (!cur_ops->poll_gp_state(gp_snap)) { @@ -1346,6 +1622,27 @@ rcu_torture_fakewriter(void *arg) &rand); } break; + case RTWS_POLL_GET_FULL: + cur_ops->start_gp_poll_full(&gp_snap_full); + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; + case RTWS_POLL_GET_EXP: + gp_snap = cur_ops->start_gp_poll_exp(); + while (!cur_ops->poll_gp_state_exp(gp_snap)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; + case RTWS_POLL_GET_EXP_FULL: + cur_ops->start_gp_poll_exp_full(&gp_snap_full); + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; case RTWS_SYNC: cur_ops->sync(); break; @@ -1609,7 +1906,9 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, */ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) { + bool checkpolling = !(torture_random(trsp) & 0xfff); unsigned long cookie; + struct rcu_gp_oldstate cookie_full; int i; unsigned long started; unsigned long completed; @@ -1625,8 +1924,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - cookie = cur_ops->get_gp_state(); + if (checkpolling) { + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) + cur_ops->get_gp_state_full(&cookie_full); + } started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, @@ -1660,13 +1963,22 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ONCE(cur_ops->poll_gp_state(cookie), - "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", - __func__, - rcu_torture_writer_state_getname(), - rcu_torture_writer_state, - cookie, cur_ops->get_gp_state()); + if (checkpolling) { + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ONCE(cur_ops->poll_gp_state(cookie), + "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) + WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), + "%s: Cookie check 6 failed %s(%d) online %*pbl\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cpumask_pr_args(cpu_online_mask)); + } rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially @@ -1823,7 +2135,7 @@ rcu_torture_stats_print(void) batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) break; } @@ -1945,7 +2257,13 @@ static void rcu_torture_mem_dump_obj(void) static int z; kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL); + if (WARN_ON_ONCE(!kcp)) + return; rhp = kmem_cache_alloc(kcp, GFP_KERNEL); + if (WARN_ON_ONCE(!rhp)) { + kmem_cache_destroy(kcp); + return; + } pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z); pr_alert("mem_dump_obj(ZERO_SIZE_PTR):"); mem_dump_obj(ZERO_SIZE_PTR); @@ -1962,6 +2280,8 @@ static void rcu_torture_mem_dump_obj(void) kmem_cache_free(kcp, rhp); kmem_cache_destroy(kcp); rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (WARN_ON_ONCE(!rhp)) + return; pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); pr_alert("mem_dump_obj(kmalloc %px):", rhp); mem_dump_obj(rhp); @@ -1969,6 +2289,8 @@ static void rcu_torture_mem_dump_obj(void) mem_dump_obj(&rhp->func); kfree(rhp); rhp = vmalloc(4096); + if (WARN_ON_ONCE(!rhp)) + return; pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); pr_alert("mem_dump_obj(vmalloc %px):", rhp); mem_dump_obj(rhp); @@ -2030,6 +2352,19 @@ static int rcutorture_booster_init(unsigned int cpu) if (boost_tasks[cpu] != NULL) return 0; /* Already created, nothing more to do. */ + // Testing RCU priority boosting requires rcutorture do + // some serious abuse. Counter this by running ksoftirqd + // at higher priority. + if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) { + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(ksoftirqd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); rcu_torture_disable_rt_throttle(); @@ -2471,12 +2806,12 @@ static int rcutorture_oom_notify(struct notifier_block *self, for (i = 0; i < fwd_progress; i++) ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); - rcu_barrier(); + cur_ops->cb_barrier(); ncbs = 0; for (i = 0; i < fwd_progress; i++) ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); - rcu_barrier(); + cur_ops->cb_barrier(); ncbs = 0; for (i = 0; i < fwd_progress; i++) ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); @@ -2828,7 +3163,6 @@ static int rcu_torture_read_exit_child(void *trsp_in) // Parent kthread which creates and destroys read-exit child kthreads. static int rcu_torture_read_exit(void *unused) { - int count = 0; bool errexit = false; int i; struct task_struct *tsp; @@ -2840,34 +3174,28 @@ static int rcu_torture_read_exit(void *unused) // Each pass through this loop does one read-exit episode. do { - if (++count > read_exit_burst) { - VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); - rcu_barrier(); // Wait for task_struct free, avoid OOM. - for (i = 0; i < read_exit_delay; i++) { - schedule_timeout_uninterruptible(HZ); - if (READ_ONCE(read_exit_child_stop)) - break; + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); + for (i = 0; i < read_exit_burst; i++) { + if (READ_ONCE(read_exit_child_stop)) + break; + stutter_wait("rcu_torture_read_exit"); + // Spawn child. + tsp = kthread_run(rcu_torture_read_exit_child, + &trs, "%s", "rcu_torture_read_exit_child"); + if (IS_ERR(tsp)) { + TOROUT_ERRSTRING("out of memory"); + errexit = true; + break; } - if (!READ_ONCE(read_exit_child_stop)) - VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); - count = 0; - } - if (READ_ONCE(read_exit_child_stop)) - break; - // Spawn child. - tsp = kthread_run(rcu_torture_read_exit_child, - &trs, "%s", - "rcu_torture_read_exit_child"); - if (IS_ERR(tsp)) { - TOROUT_ERRSTRING("out of memory"); - errexit = true; - tsp = NULL; - break; + cond_resched(); + kthread_stop(tsp); + n_read_exits++; } - cond_resched(); - kthread_stop(tsp); - n_read_exits ++; - stutter_wait("rcu_torture_read_exit"); + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); + rcu_barrier(); // Wait for task_struct free, avoid OOM. + i = 0; + for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++) + schedule_timeout_uninterruptible(HZ); } while (!errexit && !READ_ONCE(read_exit_child_stop)); // Clean up and exit. @@ -2916,10 +3244,12 @@ rcu_torture_cleanup(void) pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); } + rcu_gp_slow_unregister(NULL); return; } if (!cur_ops) { torture_cleanup_end(); + rcu_gp_slow_unregister(NULL); return; } @@ -3016,6 +3346,7 @@ rcu_torture_cleanup(void) else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); torture_cleanup_end(); + rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay); } #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -3074,6 +3405,7 @@ static void rcu_test_debug_objects(void) pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); + kfree(rhp); #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ @@ -3096,9 +3428,9 @@ rcu_torture_init(void) int flags = 0; unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, - &tasks_tracing_ops, &trivial_ops, + &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, + TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS + &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) @@ -3281,21 +3613,6 @@ rcu_torture_init(void) rcutor_hp = firsterr; if (torture_init_error(firsterr)) goto unwind; - - // Testing RCU priority boosting requires rcutorture do - // some serious abuse. Counter this by running ksoftirqd - // at higher priority. - if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) { - for_each_online_cpu(cpu) { - struct sched_param sp; - struct task_struct *t; - - t = per_cpu(ksoftirqd, cpu); - WARN_ON_ONCE(!t); - sp.sched_priority = 2; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - } - } } shutdown_jiffies = jiffies + shutdown_secs * HZ; firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); @@ -3320,6 +3637,7 @@ rcu_torture_init(void) if (object_debug) rcu_test_debug_objects(); torture_init_end(); + rcu_gp_slow_register(&rcu_fwd_cb_nodelay); return 0; unwind: diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 5489ff7f478e..435c884c02b5 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -207,6 +207,8 @@ static struct ref_scale_ops srcu_ops = { .name = "srcu" }; +#ifdef CONFIG_TASKS_RCU + // Definitions for RCU Tasks ref scale testing: Empty read markers. // These definitions also work for RCU Rude readers. static void rcu_tasks_ref_scale_read_section(const int nloops) @@ -232,6 +234,16 @@ static struct ref_scale_ops rcu_tasks_ops = { .name = "rcu-tasks" }; +#define RCU_TASKS_OPS &rcu_tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define RCU_TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for RCU Tasks Trace ref scale testing. static void rcu_trace_ref_scale_read_section(const int nloops) { @@ -261,6 +273,14 @@ static struct ref_scale_ops rcu_trace_ops = { .name = "rcu-trace" }; +#define RCU_TRACE_OPS &rcu_trace_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define RCU_TRACE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + // Definitions for reference count static atomic_t refcnt; @@ -365,7 +385,7 @@ static struct ref_scale_ops rwsem_ops = { }; // Definitions for global spinlock -static DEFINE_SPINLOCK(test_lock); +static DEFINE_RAW_SPINLOCK(test_lock); static void ref_lock_section(const int nloops) { @@ -373,8 +393,8 @@ static void ref_lock_section(const int nloops) preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock(&test_lock); - spin_unlock(&test_lock); + raw_spin_lock(&test_lock); + raw_spin_unlock(&test_lock); } preempt_enable(); } @@ -385,9 +405,9 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock(&test_lock); + raw_spin_lock(&test_lock); un_delay(udl, ndl); - spin_unlock(&test_lock); + raw_spin_unlock(&test_lock); } preempt_enable(); } @@ -407,8 +427,8 @@ static void ref_lock_irq_section(const int nloops) preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock_irqsave(&test_lock, flags); - spin_unlock_irqrestore(&test_lock, flags); + raw_spin_lock_irqsave(&test_lock, flags); + raw_spin_unlock_irqrestore(&test_lock, flags); } preempt_enable(); } @@ -420,9 +440,9 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock_irqsave(&test_lock, flags); + raw_spin_lock_irqsave(&test_lock, flags); un_delay(udl, ndl); - spin_unlock_irqrestore(&test_lock, flags); + raw_spin_unlock_irqrestore(&test_lock, flags); } preempt_enable(); } @@ -790,7 +810,7 @@ ref_scale_init(void) long i; int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, + &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 92c002d65482..33adafdad261 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -117,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) + if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) return; /* Already running or nothing to do. */ /* Remove recently arrived callbacks and wait for readers. */ @@ -150,17 +150,17 @@ void srcu_drive_gp(struct work_struct *wp) * straighten that out. */ WRITE_ONCE(ssp->srcu_gp_running, false); - if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) + if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { - unsigned short cookie; + unsigned long cookie; cookie = get_state_synchronize_srcu(ssp); - if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) + if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) return; WRITE_ONCE(ssp->srcu_idx_max, cookie); if (!READ_ONCE(ssp->srcu_gp_running)) { @@ -215,7 +215,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) barrier(); ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1; barrier(); - return ret & USHRT_MAX; + return ret; } EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); @@ -240,10 +240,10 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { - bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie); + unsigned long cur_s = READ_ONCE(ssp->srcu_idx); barrier(); - return ret; + return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6833d8887181..1c304fec89c0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -24,6 +24,7 @@ #include <linux/smp.h> #include <linux/delay.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/srcu.h> #include "rcu.h" @@ -38,6 +39,35 @@ module_param(exp_holdoff, ulong, 0444); static ulong counter_wrap_check = (ULONG_MAX >> 2); module_param(counter_wrap_check, ulong, 0444); +/* + * Control conversion to SRCU_SIZE_BIG: + * 0: Don't convert at all. + * 1: Convert at init_srcu_struct() time. + * 2: Convert when rcutorture invokes srcu_torture_stats_print(). + * 3: Decide at boot time based on system shape (default). + * 0x1x: Convert when excessive contention encountered. + */ +#define SRCU_SIZING_NONE 0 +#define SRCU_SIZING_INIT 1 +#define SRCU_SIZING_TORTURE 2 +#define SRCU_SIZING_AUTO 3 +#define SRCU_SIZING_CONTEND 0x10 +#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x) +#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE)) +#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT)) +#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE)) +#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND) +static int convert_to_big = SRCU_SIZING_AUTO; +module_param(convert_to_big, int, 0444); + +/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */ +static int big_cpu_lim __read_mostly = 128; +module_param(big_cpu_lim, int, 0444); + +/* Contention events per jiffy to initiate transition to big. */ +static int small_contention_lim __read_mostly = 100; +module_param(small_contention_lim, int, 0444); + /* Early-boot callback-management, so early that no lock is required! */ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; @@ -48,39 +78,90 @@ static void process_srcu(struct work_struct *work); static void srcu_delay_timer(struct timer_list *t); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ -#define spin_lock_rcu_node(p) \ -do { \ - spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_rcu_node(p) \ +do { \ + spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) #define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irq_rcu_node(p) \ -do { \ - spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irq_rcu_node(p) \ +do { \ + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irq_rcu_node(p) \ +#define spin_unlock_irq_rcu_node(p) \ spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irqsave_rcu_node(p, flags) \ -do { \ - spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irqrestore_rcu_node(p, flags) \ - spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ +#define spin_trylock_irqsave_rcu_node(p, flags) \ +({ \ + bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + +#define spin_unlock_irqrestore_rcu_node(p, flags) \ + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ /* - * Initialize SRCU combining tree. Note that statically allocated + * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and * srcu_read_unlock() running against them. So if the is_static parameter * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. */ -static void init_srcu_struct_nodes(struct srcu_struct *ssp) +static void init_srcu_struct_data(struct srcu_struct *ssp) +{ + int cpu; + struct srcu_data *sdp; + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; + sdp->mynode = NULL; + sdp->cpu = cpu; + INIT_WORK(&sdp->work, srcu_invoke_callbacks); + timer_setup(&sdp->delay_work, srcu_delay_timer, 0); + sdp->ssp = ssp; + } +} + +/* Invalid seq state, used during snp node initialization */ +#define SRCU_SNP_INIT_SEQ 0x2 + +/* + * Check whether sequence number corresponding to snp node, + * is invalid. + */ +static inline bool srcu_invl_snp_seq(unsigned long s) +{ + return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ; +} + +/* + * Allocated and initialize SRCU combining tree. Returns @true if + * allocation succeeded and @false otherwise. + */ +static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) { int cpu; int i; @@ -92,6 +173,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); + ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags); + if (!ssp->node) + return false; /* Work out the overall tree geometry. */ ssp->level[0] = &ssp->node[0]; @@ -105,10 +189,10 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { - snp->srcu_have_cbs[i] = 0; + snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; snp->srcu_data_have_cbs[i] = 0; } - snp->srcu_gp_seq_needed_exp = 0; + snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; snp->grplo = -1; snp->grphi = -1; if (snp == &ssp->node[0]) { @@ -129,39 +213,31 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); level = rcu_num_lvls - 1; snp_first = ssp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); - rcu_segcblist_init(&sdp->srcu_cblist); - sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) snp->grplo = cpu; snp->grphi = cpu; } - sdp->cpu = cpu; - INIT_WORK(&sdp->work, srcu_invoke_callbacks); - timer_setup(&sdp->delay_work, srcu_delay_timer, 0); - sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); } + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); + return true; } /* * Initialize non-compile-time initialized fields, including the - * associated srcu_node and srcu_data structures. The is_static - * parameter is passed through to init_srcu_struct_nodes(), and - * also tells us that ->sda has already been wired up to srcu_data. + * associated srcu_node and srcu_data structures. The is_static parameter + * tells us that ->sda has already been wired up to srcu_data. */ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { + ssp->srcu_size_state = SRCU_SIZE_SMALL; + ssp->node = NULL; mutex_init(&ssp->srcu_cb_mutex); mutex_init(&ssp->srcu_gp_mutex); ssp->srcu_idx = 0; @@ -170,13 +246,25 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_barrier_mutex); atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->work, process_srcu); + ssp->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); if (!ssp->sda) return -ENOMEM; - init_srcu_struct_nodes(ssp); + init_srcu_struct_data(ssp); ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { + if (!ssp->sda_is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + return -ENOMEM; + } + } else { + WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG); + } + } smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ return 0; } @@ -214,6 +302,86 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* + * Initiate a transition to SRCU_SIZE_BIG with lock held. + */ +static void __srcu_transition_to_big(struct srcu_struct *ssp) +{ + lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); + smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); +} + +/* + * Initiate an idempotent transition to SRCU_SIZE_BIG. + */ +static void srcu_transition_to_big(struct srcu_struct *ssp) +{ + unsigned long flags; + + /* Double-checked locking on ->srcu_size-state. */ + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) + return; + spin_lock_irqsave_rcu_node(ssp, flags); + if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) { + spin_unlock_irqrestore_rcu_node(ssp, flags); + return; + } + __srcu_transition_to_big(ssp); + spin_unlock_irqrestore_rcu_node(ssp, flags); +} + +/* + * Check to see if the just-encountered contention event justifies + * a transition to SRCU_SIZE_BIG. + */ +static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) +{ + unsigned long j; + + if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) + return; + j = jiffies; + if (ssp->srcu_size_jiffies != j) { + ssp->srcu_size_jiffies = j; + ssp->srcu_n_lock_retries = 0; + } + if (++ssp->srcu_n_lock_retries <= small_contention_lim) + return; + __srcu_transition_to_big(ssp); +} + +/* + * Acquire the specified srcu_data structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) +{ + struct srcu_struct *ssp = sdp->ssp; + + if (spin_trylock_irqsave_rcu_node(sdp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); + spin_unlock_irqrestore_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(sdp, *flags); +} + +/* + * Acquire the specified srcu_struct structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +{ + if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_check_contention(ssp); +} + +/* * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added @@ -343,7 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp) return sum; } -#define SRCU_INTERVAL 1 +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below, boot time configurable) to allow SRCU readers to exit + * their read-side critical sections. If there are still some readers + * after one jiffy, we repeatedly block for one jiffy time periods. + * The blocking time is increased as the grace-period age increases, + * with max blocking time capped at 10 jiffies. + */ +#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5 + +static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY; +module_param(srcu_retry_check_delay, ulong, 0444); + +#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. +#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. + +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase + // no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase + // no-delay instances. + +#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low)) +#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high)) +#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high)) +// per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto +// one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay() +// called from process_srcu(). +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \ + (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY) + +// Maximum per-GP-phase consecutive no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY_PHASE \ + SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \ + SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \ + SRCU_DEFAULT_MAX_NODELAY_PHASE_HI) + +static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE; +module_param(srcu_max_nodelay_phase, ulong, 0444); + +// Maximum consecutive no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \ + SRCU_DEFAULT_MAX_NODELAY_PHASE : 100) + +static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY; +module_param(srcu_max_nodelay, ulong, 0444); /* * Return grace-period delay, zero if there are expedited grace @@ -351,10 +564,24 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), - READ_ONCE(ssp->srcu_gp_seq_needed_exp))) - return 0; - return SRCU_INTERVAL; + unsigned long gpstart; + unsigned long j; + unsigned long jbase = SRCU_INTERVAL; + + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + jbase = 0; + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) { + j = jiffies - 1; + gpstart = READ_ONCE(ssp->srcu_gp_start); + if (time_after(j, gpstart)) + jbase += j - gpstart; + if (!jbase) { + WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) + jbase = 1; + } + } + return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase; } /** @@ -382,13 +609,20 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) return; /* Forgot srcu_barrier(), so just leak it! */ } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { - pr_info("%s: Active srcu_struct %p state: %d\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))); + pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", + __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)), + rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); return; /* Caller forgot to stop doing call_srcu()? */ } - free_percpu(ssp->sda); - ssp->sda = NULL; + if (!ssp->sda_is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } + kfree(ssp->node); + ssp->node = NULL; + ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -421,22 +655,17 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* - * We use an adaptive strategy for synchronize_srcu() and especially for - * synchronize_srcu_expedited(). We spin for a fixed time period - * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after a few microseconds, - * we repeatedly block for 1-millisecond time periods. - */ -#define SRCU_RETRY_CHECK_DELAY 5 - -/* * Start an SRCU grace period. */ static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp = this_cpu_ptr(ssp->sda); + struct srcu_data *sdp; int state; + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = this_cpu_ptr(ssp->sda); lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ @@ -445,6 +674,8 @@ static void srcu_gp_start(struct srcu_struct *ssp) (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&ssp->srcu_gp_seq)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ + WRITE_ONCE(ssp->srcu_gp_start, jiffies); + WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&ssp->srcu_gp_seq); state = rcu_seq_state(ssp->srcu_gp_seq); @@ -508,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp */ static void srcu_gp_end(struct srcu_struct *ssp) { - unsigned long cbdelay; + unsigned long cbdelay = 1; bool cbs; bool last_lvl; int cpu; @@ -517,7 +748,9 @@ static void srcu_gp_end(struct srcu_struct *ssp) int idx; unsigned long mask; struct srcu_data *sdp; + unsigned long sgsne; struct srcu_node *snp; + int ss_state; /* Prevent more than one additional grace period. */ mutex_lock(&ssp->srcu_cb_mutex); @@ -526,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(ssp); idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = srcu_get_delay(ssp); + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + cbdelay = 0; + WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); @@ -537,38 +772,45 @@ static void srcu_gp_end(struct srcu_struct *ssp) /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_irq_rcu_node(snp); - cbs = false; - last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; - if (last_lvl) - cbs = snp->srcu_have_cbs[idx] == gpseq; - snp->srcu_have_cbs[idx] = gpseq; - rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); - if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); - mask = snp->srcu_data_have_cbs[idx]; - snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq_rcu_node(snp); - if (cbs) - srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); - - /* Occasionally prevent srcu_data counter wrap. */ - if (!(gpseq & counter_wrap_check) && last_lvl) - for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed + 100)) - sdp->srcu_gp_seq_needed = gpseq; - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed_exp + 100)) - sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); - } + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_BARRIER) { + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay); + } else { + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + srcu_for_each_node_breadth_first(ssp, snp) { + spin_lock_irq_rcu_node(snp); + cbs = false; + last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; + if (last_lvl) + cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + sgsne = snp->srcu_gp_seq_needed_exp; + if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq)) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); + if (ss_state < SRCU_SIZE_BIG) + mask = ~0; + else + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; + spin_unlock_irq_rcu_node(snp); + if (cbs) + srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); + } } + /* Occasionally prevent srcu_data counter wrap. */ + if (!(gpseq & counter_wrap_check)) + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_irqsave_rcu_node(sdp, flags); + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irqrestore_rcu_node(sdp, flags); + } + /* Callback initiation done, allow grace periods after next. */ mutex_unlock(&ssp->srcu_cb_mutex); @@ -583,6 +825,14 @@ static void srcu_gp_end(struct srcu_struct *ssp) } else { spin_unlock_irq_rcu_node(ssp); } + + /* Transition to big if needed. */ + if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) { + if (ss_state == SRCU_SIZE_ALLOC) + init_srcu_struct_nodes(ssp, GFP_KERNEL); + else + smp_store_release(&ssp->srcu_size_state, ss_state + 1); + } } /* @@ -596,20 +846,24 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp unsigned long s) { unsigned long flags; + unsigned long sgsne; - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) || - ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) - return; - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + if (snp) + for (; snp != NULL; snp = snp->srcu_parent) { + sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); + if (rcu_seq_done(&ssp->srcu_gp_seq, s) || + (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) + return; + spin_lock_irqsave_rcu_node(snp, flags); + sgsne = snp->srcu_gp_seq_needed_exp; + if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) { + spin_unlock_irqrestore_rcu_node(snp, flags); + return; + } + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(snp, flags); - return; } - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); spin_unlock_irqrestore_rcu_node(ssp, flags); @@ -630,39 +884,47 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, { unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); - struct srcu_node *snp = sdp->mynode; + unsigned long sgsne; + struct srcu_node *snp; + struct srcu_node *snp_leaf; unsigned long snp_seq; - /* Each pass through the loop does one level of the srcu_node tree. */ - for (; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode) - return; /* GP already done and CBs recorded. */ - spin_lock_irqsave_rcu_node(snp, flags); - if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { + /* Ensure that snp node tree is fully initialized before traversing it */ + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + snp_leaf = NULL; + else + snp_leaf = sdp->mynode; + + if (snp_leaf) + /* Each pass through the loop does one level of the srcu_node tree. */ + for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) + return; /* GP already done and CBs recorded. */ + spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; - if (snp == sdp->mynode && snp_seq == s) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore_rcu_node(snp, flags); - if (snp == sdp->mynode && snp_seq != s) { - srcu_schedule_cbs_sdp(sdp, do_norm - ? SRCU_INTERVAL - : 0); + if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) { + if (snp == snp_leaf && snp_seq == s) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + spin_unlock_irqrestore_rcu_node(snp, flags); + if (snp == snp_leaf && snp_seq != s) { + srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); + return; + } + if (!do_norm) + srcu_funnel_exp_start(ssp, snp, s); return; } - if (!do_norm) - srcu_funnel_exp_start(ssp, snp, s); - return; + snp->srcu_have_cbs[idx] = s; + if (snp == snp_leaf) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + sgsne = snp->srcu_gp_seq_needed_exp; + if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s))) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore_rcu_node(snp, flags); } - snp->srcu_have_cbs[idx] = s; - if (snp == sdp->mynode) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); - } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_rcu_node(ssp, flags); + spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -678,9 +940,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); srcu_gp_start(ssp); + + // And how can that list_add() in the "else" clause + // possibly be safe for concurrent execution? Well, + // it isn't. And it does not have to be. After all, it + // can only be executed during early boot when there is only + // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) queue_delayed_work(rcu_gp_wq, &ssp->work, - srcu_get_delay(ssp)); + !!srcu_get_delay(ssp)); else if (list_empty(&ssp->work.work.entry)) list_add(&ssp->work.work.entry, &srcu_boot_list); } @@ -694,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, */ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { + unsigned long curdelay; + + curdelay = !srcu_get_delay(ssp); + for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) return true; - if (--trycount + !srcu_get_delay(ssp) <= 0) + if ((--trycount + curdelay) <= 0) return false; - udelay(SRCU_RETRY_CHECK_DELAY); + udelay(srcu_retry_check_delay); } } @@ -814,11 +1086,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, bool needgp = false; unsigned long s; struct srcu_data *sdp; + struct srcu_node *sdp_mynode; + int ss_state; check_init_srcu_struct(ssp); idx = srcu_read_lock(ssp); - sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); + ss_state = smp_load_acquire(&ssp->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_CALL) + sdp = per_cpu_ptr(ssp->sda, 0); + else + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, @@ -834,10 +1112,17 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, needexp = true; } spin_unlock_irqrestore_rcu_node(sdp, flags); + + /* Ensure that snp node tree is fully initialized before traversing it */ + if (ss_state < SRCU_SIZE_WAIT_BARRIER) + sdp_mynode = NULL; + else + sdp_mynode = sdp->mynode; + if (needgp) srcu_funnel_gp_start(ssp, sdp, s, do_norm); else if (needexp) - srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_funnel_exp_start(ssp, sdp_mynode, s); srcu_read_unlock(ssp, idx); return s; } @@ -1097,6 +1382,28 @@ static void srcu_barrier_cb(struct rcu_head *rhp) complete(&ssp->srcu_barrier_completion); } +/* + * Enqueue an srcu_barrier() callback on the specified srcu_data + * structure's ->cblist. but only if that ->cblist already has at least one + * callback enqueued. Note that if a CPU already has callbacks enqueue, + * it must have already registered the need for a future grace period, + * so all we need do is enqueue a callback that will use the same grace + * period as the last callback already in the queue. + */ +static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) +{ + spin_lock_irq_rcu_node(sdp); + atomic_inc(&ssp->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); + atomic_dec(&ssp->srcu_barrier_cpu_cnt); + } + spin_unlock_irq_rcu_node(sdp); +} + /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @ssp: srcu_struct on which to wait for in-flight callbacks. @@ -1104,7 +1411,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp) void srcu_barrier(struct srcu_struct *ssp) { int cpu; - struct srcu_data *sdp; + int idx; unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); check_init_srcu_struct(ssp); @@ -1120,27 +1427,13 @@ void srcu_barrier(struct srcu_struct *ssp) /* Initial count prevents reaching zero until all CBs are posted. */ atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); - /* - * Each pass through this loop enqueues a callback, but only - * on CPUs already having callbacks enqueued. Note that if - * a CPU already has callbacks enqueue, it must have already - * registered the need for a future grace period, so all we - * need do is enqueue a callback that will use the same - * grace period as the last callback already in the queue. - */ - for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irq_rcu_node(sdp); - atomic_inc(&ssp->srcu_barrier_cpu_cnt); - sdp->srcu_barrier_head.func = srcu_barrier_cb; - debug_rcu_head_queue(&sdp->srcu_barrier_head); - if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head)) { - debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&ssp->srcu_barrier_cpu_cnt); - } - spin_unlock_irq_rcu_node(sdp); - } + idx = srcu_read_lock(ssp); + if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); + else + for_each_possible_cpu(cpu) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); + srcu_read_unlock(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) @@ -1214,6 +1507,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) srcu_flip(ssp); spin_lock_irq_rcu_node(ssp); rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + ssp->srcu_n_exp_nodelay = 0; spin_unlock_irq_rcu_node(ssp); } @@ -1228,6 +1522,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) mutex_unlock(&ssp->srcu_gp_mutex); return; /* readers present, retry later. */ } + ssp->srcu_n_exp_nodelay = 0; srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1318,12 +1613,28 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) */ static void process_srcu(struct work_struct *work) { + unsigned long curdelay; + unsigned long j; struct srcu_struct *ssp; ssp = container_of(work, struct srcu_struct, work.work); srcu_advance_state(ssp); - srcu_reschedule(ssp, srcu_get_delay(ssp)); + curdelay = srcu_get_delay(ssp); + if (curdelay) { + WRITE_ONCE(ssp->reschedule_count, 0); + } else { + j = jiffies; + if (READ_ONCE(ssp->reschedule_jiffies) == j) { + WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); + if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay) + curdelay = 1; + } else { + WRITE_ONCE(ssp->reschedule_count, 1); + WRITE_ONCE(ssp->reschedule_jiffies, j); + } + } + srcu_reschedule(ssp, curdelay); } void srcutorture_get_gp_data(enum rcutorture_type test_type, @@ -1337,43 +1648,69 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +static const char * const srcu_size_state_name[] = { + "SRCU_SIZE_SMALL", + "SRCU_SIZE_ALLOC", + "SRCU_SIZE_WAIT_BARRIER", + "SRCU_SIZE_WAIT_CALL", + "SRCU_SIZE_WAIT_CBS1", + "SRCU_SIZE_WAIT_CBS2", + "SRCU_SIZE_WAIT_CBS3", + "SRCU_SIZE_WAIT_CBS4", + "SRCU_SIZE_BIG", + "SRCU_SIZE_???", +}; + void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int cpu; int idx; unsigned long s0 = 0, s1 = 0; + int ss_state = READ_ONCE(ssp->srcu_size_state); + int ss_state_idx = ss_state; idx = ssp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", - tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - long c0, c1; - struct srcu_data *sdp; - - sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(sdp->srcu_unlock_count[!idx]); - u1 = data_race(sdp->srcu_unlock_count[idx]); - - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); - - l0 = data_race(sdp->srcu_lock_count[!idx]); - l1 = data_race(sdp->srcu_lock_count[idx]); - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld %c)", - cpu, c0, c1, - "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); - s0 += c0; - s1 += c1; + if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) + ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; + pr_alert("%s%s Tree SRCU g%ld state %d (%s)", + tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, + srcu_size_state_name[ss_state_idx]); + if (!ssp->sda) { + // Called after cleanup_srcu_struct(), perhaps. + pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n"); + } else { + pr_cont(" per-CPU(idx=%d):", idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *sdp; + + sdp = per_cpu_ptr(ssp->sda, cpu); + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld %c)", + cpu, c0, c1, + "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); + s0 += c0; + s1 += c1; + } + pr_cont(" T(%ld,%ld)\n", s0, s1); } - pr_cont(" T(%ld,%ld)\n", s0, s1); + if (SRCU_SIZING_IS_TORTURE()) + srcu_transition_to_big(ssp); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); @@ -1382,6 +1719,11 @@ static int __init srcu_bootup_announce(void) pr_info("Hierarchical SRCU implementation.\n"); if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); + if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY) + pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay); + if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY) + pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay); + pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase); return 0; } early_initcall(srcu_bootup_announce); @@ -1390,6 +1732,17 @@ void __init srcu_init(void) { struct srcu_struct *ssp; + /* Decide on srcu_struct-size strategy. */ + if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { + if (nr_cpu_ids >= big_cpu_lim) { + convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention. + pr_info("%s: Setting srcu_struct sizes to big.\n", __func__); + } else { + convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND; + pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__); + } + } + /* * Once that is set, call_srcu() can follow the normal path and * queue delayed work. This must follow RCU workqueues creation @@ -1400,6 +1753,8 @@ void __init srcu_init(void) ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, work.work.entry); list_del_init(&ssp->work.work.entry); + if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL) + ssp->srcu_size_state = SRCU_SIZE_ALLOC; queue_work(rcu_gp_wq, &ssp->work.work); } } diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index 33d896d85902..5cefc702158f 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -111,7 +111,7 @@ static void rcu_sync_func(struct rcu_head *rhp) * a slowpath during the update. After this function returns, all * subsequent calls to rcu_sync_is_idle() will return false, which * tells readers to stay off their fastpaths. A later call to - * rcu_sync_exit() re-enables reader slowpaths. + * rcu_sync_exit() re-enables reader fastpaths. * * When called in isolation, rcu_sync_enter() must wait for a grace * period, however, closely spaced calls to rcu_sync_enter() can diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 99cf3a13954c..f5bf6fb430da 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -14,7 +14,7 @@ struct rcu_tasks; typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); -typedef void (*pregp_func_t)(void); +typedef void (*pregp_func_t)(struct list_head *hop); typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); typedef void (*postscan_func_t)(struct list_head *hop); typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); @@ -29,6 +29,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_work: Work queue for invoking callbacks. * @rtp_irq_work: IRQ work queue for deferred wakeups. * @barrier_q_head: RCU callback for barrier operation. + * @rtp_blkd_tasks: List of tasks blocked as readers. * @cpu: CPU number corresponding to this entry. * @rtpp: Pointer to the rcu_tasks structure. */ @@ -40,14 +41,16 @@ struct rcu_tasks_percpu { struct work_struct rtp_work; struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; + struct list_head rtp_blkd_tasks; int cpu; struct rcu_tasks *rtpp; }; /** * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. - * @cbs_wq: Wait queue allowing new callback to get kthread's attention. + * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. + * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). @@ -77,8 +80,9 @@ struct rcu_tasks_percpu { * @kname: This flavor's kthread name. */ struct rcu_tasks { - struct wait_queue_head cbs_wq; + struct rcuwait cbs_wait; raw_spinlock_t cbs_gbl_lock; + struct mutex tasks_gp_mutex; int gp_state; int gp_sleep; int init_fract; @@ -113,12 +117,13 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp); #define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ - .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \ + .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \ }; \ static struct rcu_tasks rt_name = \ { \ - .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ + .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \ .gp_func = gp, \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ @@ -140,9 +145,15 @@ static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; module_param(rcu_task_ipi_delay, int, 0644); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30) #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +#define RCU_TASK_STALL_INFO (HZ * 10) +static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO; +module_param(rcu_task_stall_info, int, 0644); +static int rcu_task_stall_info_mult __read_mostly = 3; +module_param(rcu_task_stall_info_mult, int, 0444); static int rcu_task_enqueue_lim __read_mostly = -1; module_param(rcu_task_enqueue_lim, int, 0444); @@ -248,6 +259,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; + if (!rtpcp->rtp_blkd_tasks.next) + INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); @@ -261,14 +274,16 @@ static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work); rtp = rtpcp->rtpp; - wake_up(&rtp->cbs_wq); + rcuwait_wake_up(&rtp->cbs_wait); } // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) { + int chosen_cpu; unsigned long flags; + int ideal_cpu; unsigned long j; bool needadjust = false; bool needwake; @@ -278,8 +293,9 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rhp->func = func; local_irq_save(flags); rcu_read_lock(); - rtpcp = per_cpu_ptr(rtp->rtpcpu, - smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); + ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); + chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. j = jiffies; @@ -315,17 +331,6 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, irq_work_queue(&rtpcp->rtp_irq_work); } -// Wait for a grace period for the specified flavor of Tasks RCU. -static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) -{ - /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); - - /* Wait for the grace period. */ - wait_rcu_gp(rtp->call_func); -} - // RCU callback function for rcu_barrier_tasks_generic(). static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) { @@ -431,6 +436,11 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) WRITE_ONCE(rtp->percpu_dequeue_lim, 1); pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); } + for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); + } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); } @@ -460,7 +470,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu } } - if (rcu_segcblist_empty(&rtpcp->cblist)) + if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu)) return; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); @@ -489,10 +499,41 @@ static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp) rcu_tasks_invoke_cbs(rtp, rtpcp); } -/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ -static int __noreturn rcu_tasks_kthread(void *arg) +// Wait for one grace period. +static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) { int needgpcb; + + mutex_lock(&rtp->tasks_gp_mutex); + + // If there were none, wait a bit and start over. + if (unlikely(midboot)) { + needgpcb = 0x2; + } else { + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); + rcuwait_wait_event(&rtp->cbs_wait, + (needgpcb = rcu_tasks_need_gpcb(rtp)), + TASK_IDLE); + } + + if (needgpcb & 0x2) { + // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; + rcu_seq_start(&rtp->tasks_gp_seq); + rtp->gp_func(rtp); + rcu_seq_end(&rtp->tasks_gp_seq); + } + + // Invoke callbacks. + set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); + rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0)); + mutex_unlock(&rtp->tasks_gp_mutex); +} + +// RCU-tasks kthread that detects grace periods and invokes callbacks. +static int __noreturn rcu_tasks_kthread(void *arg) +{ struct rcu_tasks *rtp = arg; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ @@ -506,27 +547,28 @@ static int __noreturn rcu_tasks_kthread(void *arg) * This loop is terminated by the system going down. ;-) */ for (;;) { - set_tasks_gp_state(rtp, RTGS_WAIT_CBS); - - /* If there were none, wait a bit and start over. */ - wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp))); + // Wait for one grace period and invoke any callbacks + // that are ready. + rcu_tasks_one_gp(rtp, false); - if (needgpcb & 0x2) { - // Wait for one grace period. - set_tasks_gp_state(rtp, RTGS_WAIT_GP); - rtp->gp_start = jiffies; - rcu_seq_start(&rtp->tasks_gp_seq); - rtp->gp_func(rtp); - rcu_seq_end(&rtp->tasks_gp_seq); - } + // Paranoid sleep to keep this from entering a tight loop. + schedule_timeout_idle(rtp->gp_sleep); + } +} - /* Invoke callbacks. */ - set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); - rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0)); +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) +{ + /* Complain if the scheduler has not started. */ + WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_rcu_tasks called too soon"); - /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_idle(rtp->gp_sleep); + // If the grace-period kthread is running, use it. + if (READ_ONCE(rtp->kthread_ptr)) { + wait_rcu_gp(rtp->call_func); + return; } + rcu_tasks_one_gp(rtp, true); } /* Spawn RCU-tasks grace-period kthread. */ @@ -548,8 +590,15 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) static void __init rcu_tasks_bootup_oddness(void) { #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + int rtsimc; + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + rtsimc = clamp(rcu_task_stall_info_mult, 1, 10); + if (rtsimc != rcu_task_stall_info_mult) { + pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc); + rcu_task_stall_info_mult = rtsimc; + } #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RCU pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); @@ -568,7 +617,17 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { - struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each... + int cpu; + bool havecbs = false; + + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) { + havecbs = true; + break; + } + } pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), @@ -576,7 +635,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rcu_seq_current(&rtp->tasks_gp_seq)), data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], - ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))], + ".C"[havecbs], s); } #endif // #ifndef CONFIG_TINY_RCU @@ -592,13 +651,18 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t); /* Wait for one RCU-tasks grace period. */ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) { - struct task_struct *g, *t; - unsigned long lastreport; - LIST_HEAD(holdouts); + struct task_struct *g; int fract; + LIST_HEAD(holdouts); + unsigned long j; + unsigned long lastinfo; + unsigned long lastreport; + bool reported = false; + int rtsi; + struct task_struct *t; set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); - rtp->pregp_func(); + rtp->pregp_func(&holdouts); /* * There were callbacks, so we need to wait for an RCU-tasks @@ -607,10 +671,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) * and make a list of them in holdouts. */ set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST); - rcu_read_lock(); - for_each_process_thread(g, t) - rtp->pertask_func(t, &holdouts); - rcu_read_unlock(); + if (rtp->pertask_func) { + rcu_read_lock(); + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); + rcu_read_unlock(); + } set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); rtp->postscan_func(&holdouts); @@ -621,30 +687,50 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) * is empty, we are done. */ lastreport = jiffies; + lastinfo = lastreport; + rtsi = READ_ONCE(rcu_task_stall_info); // Start off with initial wait and slowly back off to 1 HZ wait. fract = rtp->init_fract; while (!list_empty(&holdouts)) { + ktime_t exp; bool firstreport; bool needreport; int rtst; - /* Slowly back off waiting for holdouts */ + // Slowly back off waiting for holdouts set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_idle(fract); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + schedule_timeout_idle(fract); + } else { + exp = jiffies_to_nsecs(fract); + __set_current_state(TASK_IDLE); + schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD); + } if (fract < HZ) fract++; rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); - if (needreport) + if (needreport) { lastreport = jiffies; + reported = true; + } firstreport = true; WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); rtp->holdouts_func(&holdouts, needreport, &firstreport); + + // Print pre-stall informational messages if needed. + j = jiffies; + if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) { + lastinfo = j; + rtsi = rtsi * rcu_task_stall_info_mult; + pr_info("%s: %s grace period %lu is %lu jiffies old.\n", + __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start); + } } set_tasks_gp_state(rtp, RTGS_POST_GP); @@ -708,7 +794,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // disabling. /* Pre-grace-period preparation. */ -static void rcu_tasks_pregp_step(void) +static void rcu_tasks_pregp_step(struct list_head *hop) { /* * Wait for all pre-existing t->on_rq and t->nvcsw transitions @@ -950,6 +1036,9 @@ static void rcu_tasks_be_rude(struct work_struct *work) // Wait for one rude RCU-tasks grace period. static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) { + if (num_online_cpus() <= 1) + return; // Fastpath for only one CPU. + rtp->n_ipis += cpumask_weight(cpu_online_mask); schedule_on_each_cpu(rcu_tasks_be_rude); } @@ -1050,11 +1139,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // 3. Avoids expensive read-side instructions, having overhead similar // to that of Preemptible RCU. // -// There are of course downsides. The grace-period code can send IPIs to -// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. -// It is necessary to scan the full tasklist, much as for Tasks RCU. There -// is a single callback queue guarded by a single lock, again, much as for -// Tasks RCU. If needed, these downsides can be at least partially remedied. +// There are of course downsides. For example, the grace-period code +// can send IPIs to CPUs, even when those CPUs are in the idle loop or +// in nohz_full userspace. If needed, these downsides can be at least +// partially remedied. // // Perhaps most important, this variant of RCU does not affect the vanilla // flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace @@ -1067,38 +1155,30 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // invokes these functions in this order: // // rcu_tasks_trace_pregp_step(): -// Initialize the count of readers and block CPU-hotplug operations. -// rcu_tasks_trace_pertask(), invoked on every non-idle task: -// Initialize per-task state and attempt to identify an immediate -// quiescent state for that task, or, failing that, attempt to -// set that task's .need_qs flag so that task's next outermost -// rcu_read_unlock_trace() will report the quiescent state (in which -// case the count of readers is incremented). If both attempts fail, -// the task is added to a "holdout" list. Note that IPIs are used -// to invoke trc_read_check_handler() in the context of running tasks -// in order to avoid ordering overhead on common-case shared-variable -// accessses. +// Disables CPU hotplug, adds all currently executing tasks to the +// holdout list, then checks the state of all tasks that blocked +// or were preempted within their current RCU Tasks Trace read-side +// critical section, adding them to the holdout list if appropriate. +// Finally, this function re-enables CPU hotplug. +// The ->pertask_func() pointer is NULL, so there is no per-task processing. // rcu_tasks_trace_postscan(): -// Initialize state and attempt to identify an immediate quiescent -// state as above (but only for idle tasks), unblock CPU-hotplug -// operations, and wait for an RCU grace period to avoid races with -// tasks that are in the process of exiting. +// Invokes synchronize_rcu() to wait for late-stage exiting tasks +// to finish exiting. // check_all_holdout_tasks_trace(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the -// corresponding task is removed from the holdout list. +// corresponding task is removed from the holdout list. Once this +// list is empty, the grace period has completed. // rcu_tasks_trace_postgp(): -// Wait for the count of readers do drop to zero, reporting any stalls. -// Also execute full memory barriers to maintain ordering with code -// executing after the grace period. +// Provides the needed full memory barrier and does debug checks. // // The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks. // -// Pre-grace-period update-side code is ordered before the grace -// period via the ->cbs_lock and barriers in rcu_tasks_kthread(). -// Pre-grace-period read-side code is ordered before the grace period by -// atomic_dec_and_test() of the count of readers (for IPIed readers) and by -// scheduler context-switch ordering (for locked-down non-running readers). +// Pre-grace-period update-side code is ordered before the grace period +// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period +// read-side code is ordered before the grace period by atomic operations +// on .b.need_qs flag of each task involved in this process, or by scheduler +// context-switch ordering (for locked-down non-running readers). // The lockdep state must be outside of #ifdef to be useful. #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -1110,9 +1190,6 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map); #ifdef CONFIG_TASKS_TRACE_RCU -static atomic_t trc_n_readers_need_end; // Number of waited-for readers. -static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. - // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); @@ -1121,44 +1198,104 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); static unsigned long n_heavy_reader_attempts; static unsigned long n_heavy_reader_updates; static unsigned long n_heavy_reader_ofl_updates; +static unsigned long n_trc_holdouts; void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); +/* Load from ->trc_reader_special.b.need_qs with proper ordering. */ +static u8 rcu_ld_need_qs(struct task_struct *t) +{ + smp_mb(); // Enforce full grace-period ordering. + return smp_load_acquire(&t->trc_reader_special.b.need_qs); +} + +/* Store to ->trc_reader_special.b.need_qs with proper ordering. */ +static void rcu_st_need_qs(struct task_struct *t, u8 v) +{ + smp_store_release(&t->trc_reader_special.b.need_qs, v); + smp_mb(); // Enforce full grace-period ordering. +} + /* - * This irq_work handler allows rcu_read_unlock_trace() to be invoked - * while the scheduler locks are held. + * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for + * the four-byte operand-size restriction of some platforms. + * Returns the old value, which is often ignored. */ -static void rcu_read_unlock_iw(struct irq_work *iwp) +u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) { - wake_up(&trc_wait); + union rcu_special ret; + union rcu_special trs_old = READ_ONCE(t->trc_reader_special); + union rcu_special trs_new = trs_old; + + if (trs_old.b.need_qs != old) + return trs_old.b.need_qs; + trs_new.b.need_qs = new; + ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s); + return ret.b.need_qs; } -static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); +EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); -/* If we are the last reader, wake up the grace-period kthread. */ +/* + * If we are the last reader, signal the grace-period kthread. + * Also remove from the per-CPU list of blocked tasks. + */ void rcu_read_unlock_trace_special(struct task_struct *t) { - int nq = READ_ONCE(t->trc_reader_special.b.need_qs); + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + union rcu_special trs; + + // Open-coded full-word version of rcu_ld_need_qs(). + smp_mb(); // Enforce full grace-period ordering. + trs = smp_load_acquire(&t->trc_reader_special); - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && - t->trc_reader_special.b.need_mb) + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers. // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. - if (nq) - WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) { + u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS, + TRC_NEED_QS_CHECKED); + + WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result); + } + if (trs.b.blocked) { + rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_del_init(&t->trc_blkd_node); + WRITE_ONCE(t->trc_reader_special.b.blocked, false); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } WRITE_ONCE(t->trc_reader_nesting, 0); - if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) - irq_work_queue(&rcu_tasks_trace_iw); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); +/* Add a newly blocked reader task to its CPU's list. */ +void rcu_tasks_trace_qs_blkd(struct task_struct *t) +{ + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + + local_irq_save(flags); + rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu); + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled + t->trc_blkd_cpu = smp_processor_id(); + if (!rtpcp->rtp_blkd_tasks.next) + INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); + list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); + WRITE_ONCE(t->trc_reader_special.b.blocked, true); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd); + /* Add a task to the holdout list, if it is not already on the list. */ static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) { if (list_empty(&t->trc_holdout_list)) { get_task_struct(t); list_add(&t->trc_holdout_list, bhp); + n_trc_holdouts++; } } @@ -1168,37 +1305,36 @@ static void trc_del_holdout(struct task_struct *t) if (!list_empty(&t->trc_holdout_list)) { list_del_init(&t->trc_holdout_list); put_task_struct(t); + n_trc_holdouts--; } } /* IPI handler to check task state. */ static void trc_read_check_handler(void *t_in) { + int nesting; struct task_struct *t = current; struct task_struct *texp = t_in; // If the task is no longer running on this CPU, leave. - if (unlikely(texp != t)) { + if (unlikely(texp != t)) goto reset_ipi; // Already on holdout list, so will check later. - } // If the task is not in a read-side critical section, and // if this is the last reader, awaken the grace-period kthread. - if (likely(!READ_ONCE(t->trc_reader_nesting))) { - WRITE_ONCE(t->trc_reader_checked, true); + nesting = READ_ONCE(t->trc_reader_nesting); + if (likely(!nesting)) { + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); goto reset_ipi; } // If we are racing with an rcu_read_unlock_trace(), try again later. - if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) + if (unlikely(nesting < 0)) goto reset_ipi; - WRITE_ONCE(t->trc_reader_checked, true); - // Get here if the task is in a read-side critical section. Set - // its state so that it will awaken the grace-period kthread upon - // exit from that critical section. - atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); - WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + // Get here if the task is in a read-side critical section. + // Set its state so that it will update state for the grace-period + // kthread upon exit from that critical section. + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED); reset_ipi: // Allow future IPIs to be sent on CPU and for task. @@ -1209,48 +1345,50 @@ reset_ipi: } /* Callback function for scheduler to check locked-down task. */ -static int trc_inspect_reader(struct task_struct *t, void *arg) +static int trc_inspect_reader(struct task_struct *t, void *bhp_in) { + struct list_head *bhp = bhp_in; int cpu = task_cpu(t); int nesting; bool ofl = cpu_is_offline(cpu); - if (task_curr(t)) { - WARN_ON_ONCE(ofl && !is_idle_task(t)); - + if (task_curr(t) && !ofl) { // If no chance of heavyweight readers, do it the hard way. - if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) return -EINVAL; // If heavyweight readers are enabled on the remote task, // we can inspect its state despite its currently running. // However, we cannot safely change its state. n_heavy_reader_attempts++; - if (!ofl && // Check for "running" idle tasks on offline CPUs. - !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + // Check for "running" idle tasks on offline CPUs. + if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) return -EINVAL; // No quiescent state, do it the hard way. n_heavy_reader_updates++; - if (ofl) - n_heavy_reader_ofl_updates++; nesting = 0; } else { // The task is not running, so C-language access is safe. nesting = t->trc_reader_nesting; + WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t)); + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl) + n_heavy_reader_ofl_updates++; } // If not exiting a read-side critical section, mark as checked // so that the grace-period kthread will remove it from the // holdout list. - t->trc_reader_checked = nesting >= 0; - if (nesting <= 0) - return nesting ? -EINVAL : 0; // If in QS, done, otherwise try again later. + if (!nesting) { + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); + return 0; // In QS, so done. + } + if (nesting < 0) + return -EINVAL; // Reader transitioning, try again later. // The task is in a read-side critical section, so set up its - // state so that it will awaken the grace-period kthread upon exit - // from that critical section. - atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); - WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + // state so that it will update state upon exit from that critical + // section. + if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) + trc_add_holdout(t, bhp); return 0; } @@ -1266,14 +1404,14 @@ static void trc_wait_for_one_reader(struct task_struct *t, // The current task had better be in a quiescent state. if (t == current) { - t->trc_reader_checked = true; + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); return; } // Attempt to nail down the task for inspection. get_task_struct(t); - if (!task_call_func(t, trc_inspect_reader, NULL)) { + if (!task_call_func(t, trc_inspect_reader, bhp)) { put_task_struct(t); return; } @@ -1311,56 +1449,95 @@ static void trc_wait_for_one_reader(struct task_struct *t, } } +/* + * Initialize for first-round processing for the specified task. + * Return false if task is NULL or already taken care of, true otherwise. + */ +static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself) +{ + // During early boot when there is only the one boot CPU, there + // is no idle task for the other CPUs. Also, the grace-period + // kthread is always in a quiescent state. In addition, just return + // if this task is already on the list. + if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list)) + return false; + + rcu_st_need_qs(t, 0); + t->trc_ipi_to_cpu = -1; + return true; +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) +{ + if (rcu_tasks_trace_pertask_prep(t, true)) + trc_wait_for_one_reader(t, hop); +} + /* Initialize for a new RCU-tasks-trace grace period. */ -static void rcu_tasks_trace_pregp_step(void) +static void rcu_tasks_trace_pregp_step(struct list_head *hop) { + LIST_HEAD(blkd_tasks); int cpu; - - // Allow for fast-acting IPIs. - atomic_set(&trc_n_readers_need_end, 1); + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + struct task_struct *t; // There shouldn't be any old IPIs, but... for_each_possible_cpu(cpu) WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); - // Disable CPU hotplug across the tasklist scan. - // This also waits for all readers in CPU-hotplug code paths. + // Disable CPU hotplug across the CPU scan for the benefit of + // any IPIs that might be needed. This also waits for all readers + // in CPU-hotplug code paths. cpus_read_lock(); -} -/* Do first-round processing for the specified task. */ -static void rcu_tasks_trace_pertask(struct task_struct *t, - struct list_head *hop) -{ - // During early boot when there is only the one boot CPU, there - // is no idle task for the other CPUs. Just return. - if (unlikely(t == NULL)) - return; + // These rcu_tasks_trace_pertask_prep() calls are serialized to + // allow safe access to the hop list. + for_each_online_cpu(cpu) { + rcu_read_lock(); + t = cpu_curr_snapshot(cpu); + if (rcu_tasks_trace_pertask_prep(t, true)) + trc_add_holdout(t, hop); + rcu_read_unlock(); + cond_resched_tasks_rcu_qs(); + } - WRITE_ONCE(t->trc_reader_special.b.need_qs, false); - WRITE_ONCE(t->trc_reader_checked, false); - t->trc_ipi_to_cpu = -1; - trc_wait_for_one_reader(t, hop); + // Only after all running tasks have been accounted for is it + // safe to take care of the tasks that have blocked within their + // current RCU tasks trace read-side critical section. + for_each_possible_cpu(cpu) { + rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks); + while (!list_empty(&blkd_tasks)) { + rcu_read_lock(); + t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node); + list_del_init(&t->trc_blkd_node); + list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + rcu_tasks_trace_pertask(t, hop); + rcu_read_unlock(); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + } + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + cond_resched_tasks_rcu_qs(); + } + + // Re-enable CPU hotplug now that the holdout list is populated. + cpus_read_unlock(); } /* - * Do intermediate processing between task and holdout scans and - * pick up the idle tasks. + * Do intermediate processing between task and holdout scans. */ static void rcu_tasks_trace_postscan(struct list_head *hop) { - int cpu; - - for_each_possible_cpu(cpu) - rcu_tasks_trace_pertask(idle_task(cpu), hop); - - // Re-enable CPU hotplug now that the tasklist scan has completed. - cpus_read_unlock(); - // Wait for late-stage exiting tasks to finish exiting. // These might have passed the call to exit_tasks_rcu_finish(). synchronize_rcu(); - // Any tasks that exit after this point will set ->trc_reader_checked. + // Any tasks that exit after this point will set + // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs. } /* Communicate task state back to the RCU tasks trace stall warning request. */ @@ -1374,11 +1551,11 @@ static int trc_check_slow_task(struct task_struct *t, void *arg) { struct trc_stall_chk_rdr *trc_rdrp = arg; - if (task_curr(t)) + if (task_curr(t) && cpu_online(task_cpu(t))) return false; // It is running, so decline to inspect it. trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting); trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu); - trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs); + trc_rdrp->needqs = rcu_ld_need_qs(t); return true; } @@ -1395,18 +1572,21 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) } cpu = task_cpu(t); if (!task_call_func(t, trc_check_slow_task, &trc_rdr)) - pr_alert("P%d: %c\n", + pr_alert("P%d: %c%c\n", t->pid, + ".I"[t->trc_ipi_to_cpu >= 0], ".i"[is_idle_tsk]); else - pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n", t->pid, ".I"[trc_rdr.ipi_to_cpu >= 0], ".i"[is_idle_tsk], ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], + ".B"[!!data_race(t->trc_reader_special.b.blocked)], trc_rdr.nesting, - " N"[!!trc_rdr.needqs], - cpu); + " !CN"[trc_rdr.needqs & 0x3], + " ?"[trc_rdr.needqs > 0x3], + cpu, cpu_online(cpu) ? "" : "(offline)"); sched_show_task(t); } @@ -1426,21 +1606,22 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, { struct task_struct *g, *t; - // Disable CPU hotplug across the holdout list scan. + // Disable CPU hotplug across the holdout list scan for IPIs. cpus_read_lock(); list_for_each_entry_safe(t, g, hop, trc_holdout_list) { // If safe and needed, try to check the current task. if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && - !READ_ONCE(t->trc_reader_checked)) + !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED)) trc_wait_for_one_reader(t, hop); // If check succeeded, remove this task from the list. if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 && - READ_ONCE(t->trc_reader_checked)) + rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED) trc_del_holdout(t); else if (needreport) show_stalled_task_trace(t, firstreport); + cond_resched_tasks_rcu_qs(); } // Re-enable CPU hotplug now that the holdout list scan has completed. @@ -1461,10 +1642,6 @@ static void rcu_tasks_trace_empty_fn(void *unused) static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) { int cpu; - bool firstreport; - struct task_struct *g, *t; - LIST_HEAD(holdouts); - long ret; // Wait for any lingering IPI handlers to complete. Note that // if a CPU has gone offline or transitioned to userspace in the @@ -1475,37 +1652,6 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))) smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1); - // Remove the safety count. - smp_mb__before_atomic(); // Order vs. earlier atomics - atomic_dec(&trc_n_readers_need_end); - smp_mb__after_atomic(); // Order vs. later atomics - - // Wait for readers. - set_tasks_gp_state(rtp, RTGS_WAIT_READERS); - for (;;) { - ret = wait_event_idle_exclusive_timeout( - trc_wait, - atomic_read(&trc_n_readers_need_end) == 0, - READ_ONCE(rcu_task_stall_timeout)); - if (ret) - break; // Count reached zero. - // Stall warning time, so make a list of the offenders. - rcu_read_lock(); - for_each_process_thread(g, t) - if (READ_ONCE(t->trc_reader_special.b.need_qs)) - trc_add_holdout(t, &holdouts); - rcu_read_unlock(); - firstreport = true; - list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) { - if (READ_ONCE(t->trc_reader_special.b.need_qs)) - show_stalled_task_trace(t, &firstreport); - trc_del_holdout(t); // Release task_struct reference. - } - if (firstreport) - pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n"); - show_stalled_ipi_trace(); - pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); - } smp_mb(); // Caller's code must be ordered after wakeup. // Pairs with pretty much every ordering primitive. } @@ -1513,11 +1659,14 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) /* Report any needed quiescent state for this exiting task. */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { - WRITE_ONCE(t->trc_reader_checked, true); + union rcu_special trs = READ_ONCE(t->trc_reader_special); + + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); - WRITE_ONCE(t->trc_reader_nesting, 0); - if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked)) rcu_read_unlock_trace_special(t); + else + WRITE_ONCE(t->trc_reader_nesting, 0); } /** @@ -1591,7 +1740,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void) rcu_tasks_trace.init_fract = 1; } rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; - rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; @@ -1604,7 +1752,8 @@ void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; - sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end), + sprintf(buf, "N%lu h:%lu/%lu/%lu", + data_race(n_trc_holdouts), data_race(n_heavy_reader_ofl_updates), data_race(n_heavy_reader_updates), data_race(n_heavy_reader_attempts)); @@ -1631,23 +1780,24 @@ struct rcu_tasks_test_desc { struct rcu_head rh; const char *name; bool notrun; + unsigned long runstart; }; static struct rcu_tasks_test_desc tests[] = { { .name = "call_rcu_tasks()", /* If not defined, the test is skipped. */ - .notrun = !IS_ENABLED(CONFIG_TASKS_RCU), + .notrun = IS_ENABLED(CONFIG_TASKS_RCU), }, { .name = "call_rcu_tasks_rude()", /* If not defined, the test is skipped. */ - .notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU), + .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU), }, { .name = "call_rcu_tasks_trace()", /* If not defined, the test is skipped. */ - .notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU) + .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU) } }; @@ -1658,46 +1808,85 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) pr_info("Callback from %s invoked.\n", rttd->name); - rttd->notrun = true; + rttd->notrun = false; } static void rcu_tasks_initiate_self_tests(void) { + unsigned long j = jiffies; + pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU + tests[0].runstart = j; synchronize_rcu_tasks(); call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_RUDE_RCU + tests[1].runstart = j; synchronize_rcu_tasks_rude(); call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU + tests[2].runstart = j; synchronize_rcu_tasks_trace(); call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); #endif } +/* + * Return: 0 - test passed + * 1 - test failed, but have not timed out yet + * -1 - test failed and timed out + */ static int rcu_tasks_verify_self_tests(void) { int ret = 0; int i; + unsigned long bst = rcu_task_stall_timeout; + if (bst <= 0 || bst > RCU_TASK_BOOT_STALL_TIMEOUT) + bst = RCU_TASK_BOOT_STALL_TIMEOUT; for (i = 0; i < ARRAY_SIZE(tests); i++) { - if (!tests[i].notrun) { // still hanging. - pr_err("%s has been failed.\n", tests[i].name); - ret = -1; + while (tests[i].notrun) { // still hanging. + if (time_after(jiffies, tests[i].runstart + bst)) { + pr_err("%s has failed boot-time tests.\n", tests[i].name); + ret = -1; + break; + } + ret = 1; + break; } } - - if (ret) - WARN_ON(1); + WARN_ON(ret < 0); return ret; } -late_initcall(rcu_tasks_verify_self_tests); + +/* + * Repeat the rcu_tasks_verify_self_tests() call once every second until the + * test passes or has timed out. + */ +static struct delayed_work rcu_tasks_verify_work; +static void rcu_tasks_verify_work_fn(struct work_struct *work __maybe_unused) +{ + int ret = rcu_tasks_verify_self_tests(); + + if (ret <= 0) + return; + + /* Test fails but not timed out yet, reschedule another check */ + schedule_delayed_work(&rcu_tasks_verify_work, HZ); +} + +static int rcu_tasks_verify_schedule_work(void) +{ + INIT_DELAYED_WORK(&rcu_tasks_verify_work, rcu_tasks_verify_work_fn); + rcu_tasks_verify_work_fn(NULL); + return 0; +} +late_initcall(rcu_tasks_verify_schedule_work); #else /* #ifdef CONFIG_PROVE_RCU */ static void rcu_tasks_initiate_self_tests(void) { } #endif /* #else #ifdef CONFIG_PROVE_RCU */ diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 340b3f8b090d..a33a8d4942c3 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -58,7 +58,7 @@ void rcu_qs(void) rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; raise_softirq_irqoff(RCU_SOFTIRQ); } - WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1); + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); local_irq_restore(flags); } @@ -139,8 +139,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused /* * Wait for a grace period to elapse. But it is illegal to invoke * synchronize_rcu() from within an RCU read-side critical section. - * Therefore, any legal call to synchronize_rcu() is a quiescent - * state, and so on a UP system, synchronize_rcu() need do nothing. + * Therefore, any legal call to synchronize_rcu() is a quiescent state, + * and so on a UP system, synchronize_rcu() need do nothing, other than + * let the polled APIs know that another grace period elapsed. + * * (But Lai Jiangshan points out the benefits of doing might_sleep() * to reduce latency.) * @@ -152,9 +154,14 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); } EXPORT_SYMBOL_GPL(synchronize_rcu); +static void tiny_rcu_leak_callback(struct rcu_head *rhp) +{ +} + /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -162,9 +169,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { + static atomic_t doublefrees; unsigned long flags; - debug_rcu_head_queue(head); + if (debug_rcu_head_queue(head)) { + if (atomic_inc_return(&doublefrees) < 4) { + pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); + mem_dump_obj(head); + } + + if (!__is_kvfree_rcu_offset((unsigned long)head->func)) + WRITE_ONCE(head->func, tiny_rcu_leak_callback); + return; + } + head->func = func; head->next = NULL; @@ -181,6 +199,16 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) EXPORT_SYMBOL_GPL(call_rcu); /* + * Store a grace-period-counter "cookie". For more information, + * see the Tree RCU header comment. + */ +void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full); + +/* * Return a grace-period-counter "cookie". For more information, * see the Tree RCU header comment. */ @@ -213,10 +241,24 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); */ bool poll_state_synchronize_rcu(unsigned long oldstate) { - return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate; + return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate; } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); +#ifdef CONFIG_KASAN_GENERIC +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + if (head) { + void *ptr = (void *) head - (unsigned long) func; + + kasan_record_aux_stack_noalloc(ptr); + } + + __kvfree_call_rcu(head, func); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); +#endif + void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4b8189455d5..6bb8e72bc815 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -62,6 +62,7 @@ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/kasan.h> +#include <linux/context_tracking.h> #include "../time/tick-internal.h" #include "tree.h" @@ -75,9 +76,7 @@ /* Data structures. */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { - .dynticks_nesting = 1, - .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, - .dynticks = ATOMIC_INIT(1), + .gpwrap = true, #ifdef CONFIG_RCU_NOCB_CPU .cblist.flags = SEGCBLIST_RCU_CORE, #endif @@ -154,7 +153,11 @@ static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); -/* rcuc/rcub/rcuop kthread realtime priority */ +/* + * rcuc/rcub/rcuop kthread realtime priority. The "rcuop" + * real-time priority(enabling/disabling) is controlled by + * the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration. + */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; module_param(kthread_prio, int, 0444); @@ -263,56 +266,6 @@ void rcu_softirq_qs(void) } /* - * Increment the current CPU's rcu_data structure's ->dynticks field - * with ordering. Return the new value. - */ -static noinline noinstr unsigned long rcu_dynticks_inc(int incby) -{ - return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks)); -} - -/* - * Record entry into an extended quiescent state. This is only to be - * called when not already in an extended quiescent state, that is, - * RCU is watching prior to the call to this function and is no longer - * watching upon return. - */ -static noinstr void rcu_dynticks_eqs_enter(void) -{ - int seq; - - /* - * CPUs seeing atomic_add_return() must see prior RCU read-side - * critical sections, and we also must force ordering with the - * next idle sojourn. - */ - rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = rcu_dynticks_inc(1); - // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1)); -} - -/* - * Record exit from an extended quiescent state. This is only to be - * called from an extended quiescent state, that is, RCU is not watching - * prior to the call to this function and is watching upon return. - */ -static noinstr void rcu_dynticks_eqs_exit(void) -{ - int seq; - - /* - * CPUs seeing atomic_add_return() must see prior idle sojourns, - * and we also must force ordering with the next RCU read-side - * critical section. - */ - seq = rcu_dynticks_inc(1); - // RCU is now watching. Better not be in an extended quiescent state! - rcu_dynticks_task_trace_exit(); // After ->dynticks update! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1)); -} - -/* * Reset the current CPU's ->dynticks counter to indicate that the * newly onlined CPU is no longer in an extended quiescent state. * This will either leave the counter unchanged, or increment it @@ -324,31 +277,19 @@ static noinstr void rcu_dynticks_eqs_exit(void) */ static void rcu_dynticks_eqs_online(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - if (atomic_read(&rdp->dynticks) & 0x1) + if (ct_dynticks() & RCU_DYNTICKS_IDX) return; - rcu_dynticks_inc(1); -} - -/* - * Is the current CPU in an extended quiescent state? - * - * No ordering, as we are sampling CPU-local information. - */ -static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) -{ - return !(arch_atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1); + ct_state_inc(RCU_DYNTICKS_IDX); } /* * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -static int rcu_dynticks_snap(struct rcu_data *rdp) +static int rcu_dynticks_snap(int cpu) { smp_mb(); // Fundamental RCU ordering guarantee. - return atomic_read_acquire(&rdp->dynticks); + return ct_dynticks_cpu_acquire(cpu); } /* @@ -357,15 +298,13 @@ static int rcu_dynticks_snap(struct rcu_data *rdp) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & 0x1); + return !(snap & RCU_DYNTICKS_IDX); } /* Return true if the specified CPU is currently idle from an RCU viewpoint. */ bool rcu_is_idle_cpu(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - - return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); + return rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)); } /* @@ -375,7 +314,7 @@ bool rcu_is_idle_cpu(int cpu) */ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) { - return snap != rcu_dynticks_snap(rdp); + return snap != rcu_dynticks_snap(rdp->cpu); } /* @@ -384,19 +323,17 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) */ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = atomic_read(&rdp->dynticks) & ~0x1; - + snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX; smp_rmb(); // Order ->dynticks and *vp reads. if (READ_ONCE(*vp)) return false; // Non-zero, so report failure; smp_rmb(); // Order *vp read and ->dynticks re-read. // If still in the same extended quiescent state, we are good! - return snap == atomic_read(&rdp->dynticks); + return snap == ct_dynticks_cpu(cpu); } /* @@ -415,9 +352,9 @@ notrace void rcu_momentary_dyntick_idle(void) int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - seq = rcu_dynticks_inc(2); + seq = ct_state_inc(2 * RCU_DYNTICKS_IDX); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(seq & 0x1)); + WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX)); rcu_preempt_deferred_qs(current); } EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); @@ -442,13 +379,13 @@ static int rcu_is_cpu_rrupt_from_idle(void) lockdep_assert_irqs_disabled(); /* Check for counter underflows */ - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, + RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0, "RCU dynticks_nesting counter underflow!"); - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0, + RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0, "RCU dynticks_nmi_nesting counter underflow/zero!"); /* Are we at first interrupt nesting level? */ - nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting); + nesting = ct_dynticks_nmi_nesting(); if (nesting > 1) return false; @@ -458,7 +395,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) WARN_ON_ONCE(!nesting && !is_idle_task(current)); /* Does CPU appear to be idle from an RCU standpoint? */ - return __this_cpu_read(rcu_data.dynticks_nesting) == 0; + return ct_dynticks_nesting() == 0; } #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) @@ -609,66 +546,7 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -/* - * Enter an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - * - * We crowbar the ->dynticks_nmi_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. - */ -static noinstr void rcu_eqs_enter(bool user) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE); - WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdp->dynticks_nesting == 0); - if (rdp->dynticks_nesting != 1) { - // RCU will still be watching, so just do accounting and leave. - rdp->dynticks_nesting--; - return; - } - - lockdep_assert_irqs_disabled(); - instrumentation_begin(); - trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - rcu_preempt_deferred_qs(current); - - // instrumentation for the noinstr rcu_dynticks_eqs_enter() - instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); - - instrumentation_end(); - WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ - // RCU is watching here ... - rcu_dynticks_eqs_enter(); - // ... but is no longer watching here. - rcu_dynticks_task_enter(); -} - -/** - * rcu_idle_enter - inform RCU that current CPU is entering idle - * - * Enter idle mode, in other words, -leave- the mode in which RCU - * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in idle, a possibility - * handled by irq_enter() and irq_exit().) - * - * If you add or remove a call to rcu_idle_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_idle_enter(void) -{ - lockdep_assert_irqs_disabled(); - rcu_eqs_enter(false); -} -EXPORT_SYMBOL_GPL(rcu_idle_enter); - -#ifdef CONFIG_NO_HZ_FULL - -#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK) +#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on * IRQ tail once IRQs get re-enabled on userspace/guest resume. @@ -690,7 +568,7 @@ static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) = * last resort is to fire a local irq_work that will trigger a reschedule once IRQs * get re-enabled again. */ -noinstr static void rcu_irq_work_resched(void) +noinstr void rcu_irq_work_resched(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -706,114 +584,7 @@ noinstr static void rcu_irq_work_resched(void) } instrumentation_end(); } - -#else -static inline void rcu_irq_work_resched(void) { } -#endif - -/** - * rcu_user_enter - inform RCU that we are resuming userspace. - * - * Enter RCU idle mode right before resuming userspace. No use of RCU - * is permitted between this call and rcu_user_exit(). This way the - * CPU doesn't need to maintain the tick for RCU maintenance purposes - * when the CPU runs in userspace. - * - * If you add or remove a call to rcu_user_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -noinstr void rcu_user_enter(void) -{ - lockdep_assert_irqs_disabled(); - - /* - * Other than generic entry implementation, we may be past the last - * rescheduling opportunity in the entry code. Trigger a self IPI - * that will fire and reschedule once we resume in user/guest mode. - */ - rcu_irq_work_resched(); - rcu_eqs_enter(true); -} - -#endif /* CONFIG_NO_HZ_FULL */ - -/** - * rcu_nmi_exit - inform RCU of exit from NMI context - * - * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting - * to let the RCU grace-period handling know that the CPU is back to - * being RCU-idle. - * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -noinstr void rcu_nmi_exit(void) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - instrumentation_begin(); - /* - * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. - * (We are exiting an NMI handler, so RCU better be paying attention - * to us!) - */ - WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0); - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); - - /* - * If the nesting level is not 1, the CPU wasn't RCU-idle, so - * leave it in non-RCU-idle state. - */ - if (rdp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, - atomic_read(&rdp->dynticks)); - WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ - rdp->dynticks_nmi_nesting - 2); - instrumentation_end(); - return; - } - - /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); - WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - - // instrumentation for the noinstr rcu_dynticks_eqs_enter() - instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); - instrumentation_end(); - - // RCU is watching here ... - rcu_dynticks_eqs_enter(); - // ... but is no longer watching here. - - if (!in_nmi()) - rcu_dynticks_task_enter(); -} - -/** - * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle - * - * Exit from an interrupt handler, which might possibly result in entering - * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. The caller must have disabled interrupts. - * - * This code assumes that the idle loop never does anything that might - * result in unbalanced calls to irq_enter() and irq_exit(). If your - * architecture's idle loop violates this assumption, RCU will give you what - * you deserve, good and hard. But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - * - * If you add or remove a call to rcu_irq_exit(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void noinstr rcu_irq_exit(void) -{ - lockdep_assert_irqs_disabled(); - rcu_nmi_exit(); -} +#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */ #ifdef CONFIG_PROVE_RCU /** @@ -823,9 +594,9 @@ void rcu_irq_exit_check_preempt(void) { lockdep_assert_irqs_disabled(); - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, + RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0, "RCU dynticks_nesting counter underflow/zero!"); - RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != + RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE, "Bad RCU dynticks_nmi_nesting counter\n"); RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), @@ -833,95 +604,8 @@ void rcu_irq_exit_check_preempt(void) } #endif /* #ifdef CONFIG_PROVE_RCU */ -/* - * Wrapper for rcu_irq_exit() where interrupts are enabled. - * - * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_irq_exit_irqson(void) -{ - unsigned long flags; - - local_irq_save(flags); - rcu_irq_exit(); - local_irq_restore(flags); -} - -/* - * Exit an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - * - * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to - * allow for the possibility of usermode upcalls messing up our count of - * interrupt nesting level during the busy period that is just now starting. - */ -static void noinstr rcu_eqs_exit(bool user) -{ - struct rcu_data *rdp; - long oldval; - - lockdep_assert_irqs_disabled(); - rdp = this_cpu_ptr(&rcu_data); - oldval = rdp->dynticks_nesting; - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); - if (oldval) { - // RCU was already watching, so just do accounting and leave. - rdp->dynticks_nesting++; - return; - } - rcu_dynticks_task_exit(); - // RCU is not watching here ... - rcu_dynticks_eqs_exit(); - // ... but is watching here. - instrumentation_begin(); - - // instrumentation for the noinstr rcu_dynticks_eqs_exit() - instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); - - trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - WRITE_ONCE(rdp->dynticks_nesting, 1); - WARN_ON_ONCE(rdp->dynticks_nmi_nesting); - WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); - instrumentation_end(); -} - -/** - * rcu_idle_exit - inform RCU that current CPU is leaving idle - * - * Exit idle mode, in other words, -enter- the mode in which RCU - * read-side critical sections can occur. - * - * If you add or remove a call to rcu_idle_exit(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_idle_exit(void) -{ - unsigned long flags; - - local_irq_save(flags); - rcu_eqs_exit(false); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(rcu_idle_exit); - #ifdef CONFIG_NO_HZ_FULL /** - * rcu_user_exit - inform RCU that we are exiting userspace. - * - * Exit RCU idle mode while entering the kernel because it can - * run a RCU read side critical section anytime. - * - * If you add or remove a call to rcu_user_exit(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void noinstr rcu_user_exit(void) -{ - rcu_eqs_exit(true); -} - -/** * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. * * The scheduler tick is not normally enabled when CPUs enter the kernel @@ -983,109 +667,6 @@ void __rcu_irq_enter_check_tick(void) } #endif /* CONFIG_NO_HZ_FULL */ -/** - * rcu_nmi_enter - inform RCU of entry to NMI context - * - * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and - * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know - * that the CPU is active. This implementation permits nested NMIs, as - * long as the nesting level does not overflow an int. (You will probably - * run out of stack space first.) - * - * If you add or remove a call to rcu_nmi_enter(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -noinstr void rcu_nmi_enter(void) -{ - long incby = 2; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - /* Complain about underflow. */ - WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); - - /* - * If idle from RCU viewpoint, atomically increment ->dynticks - * to mark non-idle and increment ->dynticks_nmi_nesting by one. - * Otherwise, increment ->dynticks_nmi_nesting by two. This means - * if ->dynticks_nmi_nesting is equal to one, we are guaranteed - * to be in the outermost NMI handler that interrupted an RCU-idle - * period (observation due to Andy Lutomirski). - */ - if (rcu_dynticks_curr_cpu_in_eqs()) { - - if (!in_nmi()) - rcu_dynticks_task_exit(); - - // RCU is not watching here ... - rcu_dynticks_eqs_exit(); - // ... but is watching here. - - instrumentation_begin(); - // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() - instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); - // instrumentation for the noinstr rcu_dynticks_eqs_exit() - instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); - - incby = 1; - } else if (!in_nmi()) { - instrumentation_begin(); - rcu_irq_enter_check_tick(); - } else { - instrumentation_begin(); - } - - trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), - rdp->dynticks_nmi_nesting, - rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); - instrumentation_end(); - WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ - rdp->dynticks_nmi_nesting + incby); - barrier(); -} - -/** - * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle - * - * Enter an interrupt handler, which might possibly result in exiting - * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. The caller must have disabled interrupts. - * - * Note that the Linux kernel is fully capable of entering an interrupt - * handler that it never exits, for example when doing upcalls to user mode! - * This code assumes that the idle loop never does upcalls to user mode. - * If your architecture's idle loop does do upcalls to user mode (or does - * anything else that results in unbalanced calls to the irq_enter() and - * irq_exit() functions), RCU will give you what you deserve, good and hard. - * But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - * - * If you add or remove a call to rcu_irq_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -noinstr void rcu_irq_enter(void) -{ - lockdep_assert_irqs_disabled(); - rcu_nmi_enter(); -} - -/* - * Wrapper for rcu_irq_enter() where interrupts are enabled. - * - * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_irq_enter_irqson(void) -{ - unsigned long flags; - - local_irq_save(flags); - rcu_irq_enter(); - local_irq_restore(flags); -} - /* * Check to see if any future non-offloaded RCU-related work will need * to be done by the current CPU, even if none need be done immediately, @@ -1223,7 +804,7 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - rdp->dynticks_snap = rcu_dynticks_snap(rdp); + rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); @@ -1679,6 +1260,8 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); + if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); return ret; @@ -1705,11 +1288,37 @@ static void note_gp_changes(struct rcu_data *rdp) rcu_gp_kthread_wake(); } +static atomic_t *rcu_gp_slow_suppress; + +/* Register a counter to suppress debugging grace-period delays. */ +void rcu_gp_slow_register(atomic_t *rgssp) +{ + WARN_ON_ONCE(rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, rgssp); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_register); + +/* Unregister a counter, with NULL for not caring which. */ +void rcu_gp_slow_unregister(atomic_t *rgssp) +{ + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, NULL); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister); + +static bool rcu_gp_slow_is_suppressed(void) +{ + atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress); + + return rgssp && atomic_read(rgssp); +} + static void rcu_gp_slow(int delay) { - if (delay > 0 && - !(rcu_seq_ctr(rcu_state.gp_seq) % - (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + if (!rcu_gp_slow_is_suppressed() && delay > 0 && + !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_idle(delay); } @@ -1747,6 +1356,79 @@ static void rcu_strict_gp_boundary(void *unused) invoke_rcu_core(); } +// Has rcu_init() been invoked? This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ + return !!rcu_state.n_online_cpus; +} + +// Make the polled API aware of the beginning of a grace period. +static void rcu_poll_gp_seq_start(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If RCU was idle, note beginning of GP. + if (!rcu_seq_state(rcu_state.gp_seq_polled)) + rcu_seq_start(&rcu_state.gp_seq_polled); + + // Either way, record current state. + *snap = rcu_state.gp_seq_polled; +} + +// Make the polled API aware of the end of a grace period. +static void rcu_poll_gp_seq_end(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If the previously noted GP is still in effect, record the + // end of that GP. Either way, zero counter to avoid counter-wrap + // problems. + if (*snap && *snap == rcu_state.gp_seq_polled) { + rcu_seq_end(&rcu_state.gp_seq_polled); + rcu_state.gp_seq_polled_snap = 0; + rcu_state.gp_seq_polled_exp_snap = 0; + } else { + *snap = 0; + } +} + +// Make the polled API aware of the beginning of a grace period, but +// where caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_start(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + +// Make the polled API aware of the end of a grace period, but where +// caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_end(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1782,6 +1464,7 @@ static noinline_for_stack bool rcu_gp_init(void) rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); + rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -1943,19 +1626,23 @@ static void rcu_gp_fqs(bool first_time) */ static noinline_for_stack void rcu_gp_fqs_loop(void) { - bool first_gp_fqs; + bool first_gp_fqs = true; int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); - first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); if (rcu_state.cbovld) gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { - if (!ret) { + if (rcu_state.cbovld) { + j = (j + 2) / 3; + if (j <= 0) + j = 1; + } + if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) { WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j); /* * jiffies_force_qs before RCU_GP_WAIT_FQS state @@ -1973,7 +1660,15 @@ static noinline_for_stack void rcu_gp_fqs_loop(void) rcu_gp_torture_wait(); WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ - /* If grace period done, leave loop. */ + /* + * Exit the loop if the root rcu_node structure indicates that the grace period + * has ended, leave the loop. The rcu_preempt_blocked_readers_cgp(rnp) check + * is required only for single-node rcu_node trees because readers blocking + * the current grace period are queued only on leaf rcu_node structures. + * For multi-node trees, checking the root node's ->qsmask suffices, because a + * given root node's ->qsmask bit is cleared only when all CPUs and tasks from + * the corresponding leaf nodes have passed through their quiescent state. + */ if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) break; @@ -2041,6 +1736,7 @@ static noinline void rcu_gp_cleanup(void) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -2060,6 +1756,8 @@ static noinline void rcu_gp_cleanup(void) dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->gp_seq, new_gp_seq); + if (!rnp->parent) + smp_mb(); // Order against failing poll_state_synchronize_rcu_full(). rdp = this_cpu_ptr(&rcu_data); if (rnp == rdp->mynode) needgp = __note_gp_changes(rnp, rdp) || needgp; @@ -2096,14 +1794,29 @@ static noinline void rcu_gp_cleanup(void) /* Advance CBs to reduce false positives below. */ offloaded = rcu_rdp_is_offloaded(rdp); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { + + // We get here if a grace period was needed (“needgp”) + // and the above call to rcu_accelerate_cbs() did not set + // the RCU_GP_FLAG_INIT bit in ->gp_state (which records + // the need for another grace period). The purpose + // of the “offloaded” check is to avoid invoking + // rcu_accelerate_cbs() on an offloaded CPU because we do not + // hold the ->nocb_lock needed to safely access an offloaded + // ->cblist. We do not want to acquire that lock because + // it can be heavily contended during callback floods. + WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); - trace_rcu_grace_period(rcu_state.name, - rcu_state.gp_seq, - TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); } else { - WRITE_ONCE(rcu_state.gp_flags, - rcu_state.gp_flags & RCU_GP_FLAG_INIT); + + // We get here either if there is no need for an + // additional grace period or if rcu_accelerate_cbs() has + // already set the RCU_GP_FLAG_INIT bit in ->gp_flags. + // So all we need to do is to clear all of the other + // ->gp_flags bits. + + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); @@ -2487,7 +2200,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread()); + rcu_is_callbacks_kthread(rdp)); return; } @@ -2565,7 +2278,7 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock_irqsave(rdp, flags); rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), - is_idle_task(current), rcu_is_callbacks_kthread()); + is_idle_task(current), rcu_is_callbacks_kthread(rdp)); /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); @@ -2609,6 +2322,13 @@ static void rcu_do_batch(struct rcu_data *rdp) */ void rcu_sched_clock_irq(int user) { + unsigned long j; + + if (IS_ENABLED(CONFIG_PROVE_RCU)) { + j = jiffies; + WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock))); + __this_cpu_write(rcu_data.last_sched_clock, j); + } trace_rcu_utilization(TPS("Start scheduler-tick")); lockdep_assert_irqs_disabled(); raw_cpu_inc(rcu_data.ticks_this_gp); @@ -2624,6 +2344,8 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); + if (user || rcu_is_cpu_rrupt_from_idle()) + rcu_note_voluntary_context_switch(current); lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); @@ -3113,7 +2835,7 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ -#define KFREE_DRAIN_JIFFIES (HZ / 50) +#define KFREE_DRAIN_JIFFIES (5 * HZ) #define KFREE_N_BATCHES 2 #define FREE_N_CHANNELS 2 @@ -3159,7 +2881,6 @@ struct kfree_rcu_cpu_work { * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES - * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started * @bkvcache: @@ -3184,7 +2905,6 @@ struct kfree_rcu_cpu { struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; - bool monitor_todo; bool initialized; int count; @@ -3364,6 +3084,33 @@ static void kfree_rcu_work(struct work_struct *work) } } +static bool +need_offload_krc(struct kfree_rcu_cpu *krcp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (krcp->bkvhead[i]) + return true; + + return !!krcp->head; +} + +static void +schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + long delay, delay_left; + + delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; + if (delayed_work_pending(&krcp->monitor_work)) { + delay_left = krcp->monitor_work.timer.expires - jiffies; + if (delay < delay_left) + mod_delayed_work(system_wq, &krcp->monitor_work, delay); + return; + } + queue_delayed_work(system_wq, &krcp->monitor_work, delay); +} + /* * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ @@ -3420,10 +3167,8 @@ static void kfree_rcu_monitor(struct work_struct *work) // of the channels that is still busy we should rearm the // work to repeat an attempt. Because previous batches are // still in progress. - if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) - krcp->monitor_todo = false; - else - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + if (need_offload_krc(krcp)) + schedule_delayed_monitor_work(krcp); raw_spin_unlock_irqrestore(&krcp->lock, flags); } @@ -3456,15 +3201,16 @@ static void fill_page_cache_func(struct work_struct *work) bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (bnode) { - raw_spin_lock_irqsave(&krcp->lock, flags); - pushed = put_cached_bnode(krcp, bnode); - raw_spin_unlock_irqrestore(&krcp->lock, flags); + if (!bnode) + break; - if (!pushed) { - free_page((unsigned long) bnode); - break; - } + raw_spin_lock_irqsave(&krcp->lock, flags); + pushed = put_cached_bnode(krcp, bnode); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (!pushed) { + free_page((unsigned long) bnode); + break; } } @@ -3610,11 +3356,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) WRITE_ONCE(krcp->count, krcp->count + 1); // Set timer to drain after KFREE_DRAIN_JIFFIES. - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !krcp->monitor_todo) { - krcp->monitor_todo = true; - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - } + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) + schedule_delayed_monitor_work(krcp); unlock_return: krc_this_cpu_unlock(krcp, flags); @@ -3647,7 +3390,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) atomic_set(&krcp->backoff_page_cache_fill, 1); } - return count; + return count == 0 ? SHRINK_EMPTY : count; } static unsigned long @@ -3689,54 +3432,28 @@ void __init kfree_rcu_scheduler_running(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); - if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) || - krcp->monitor_todo) { - raw_spin_unlock_irqrestore(&krcp->lock, flags); - continue; - } - krcp->monitor_todo = true; - schedule_delayed_work_on(cpu, &krcp->monitor_work, - KFREE_DRAIN_JIFFIES); + if (need_offload_krc(krcp)) + schedule_delayed_monitor_work(krcp); raw_spin_unlock_irqrestore(&krcp->lock, flags); } } /* * During early boot, any blocking grace-period wait automatically - * implies a grace period. Later on, this is never the case for PREEMPTION. + * implies a grace period. * - * However, because a context switch is a grace period for !PREEMPTION, any - * blocking grace-period wait automatically implies a grace period if - * there is only one CPU online at any point time during execution of - * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds some - * overhead: RCU still operates correctly. + * Later on, this could in theory be the case for kernels built with + * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this + * is not a common case. Furthermore, this optimization would cause + * the rcu_gp_oldstate structure to expand by 50%, so this potential + * grace-period optimization is ignored once the scheduler is running. */ static int rcu_blocking_is_gp(void) { - int ret; - - if (IS_ENABLED(CONFIG_PREEMPTION)) - return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) + return false; might_sleep(); /* Check for RCU read-side critical section. */ - preempt_disable(); - /* - * If the rcu_state.n_online_cpus counter is equal to one, - * there is only one CPU, and that CPU sees all prior accesses - * made by any CPU that was online at the time of its access. - * Furthermore, if this counter is equal to one, its value cannot - * change until after the preempt_enable() below. - * - * Furthermore, if rcu_state.n_online_cpus is equal to one here, - * all later CPUs (both this one and any that come online later - * on) are guaranteed to see all accesses prior to this point - * in the code, without the need for additional memory barriers. - * Those memory barriers are provided by CPU-hotplug code. - */ - ret = READ_ONCE(rcu_state.n_online_cpus) <= 1; - preempt_enable(); - return ret; + return true; } /** @@ -3779,20 +3496,59 @@ static int rcu_blocking_is_gp(void) */ void synchronize_rcu(void) { + unsigned long flags; + struct rcu_node *rnp; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) - return; // Context allows vacuous grace periods. - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); + if (!rcu_blocking_is_gp()) { + if (rcu_gp_is_expedited()) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); + return; + } + + // Context allows vacuous grace periods. + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs at + // process level. Therefore, this normal GP overlaps with other + // normal GPs only by being fully nested within them, which allows + // reuse of ->gp_seq_polled_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); + + // Update the normal grace-period counters to record + // this grace period, but only those used by the boot CPU. + // The rcu_scheduler_starting() will take care of the rest of + // these counters. + local_irq_save(flags); + WARN_ON_ONCE(num_online_cpus() > 1); + rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT); + for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent) + rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq; + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(synchronize_rcu); /** + * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie + * @rgosp: Place to put state cookie + * + * Stores into @rgosp a value that will always be treated by functions + * like poll_state_synchronize_rcu_full() as a cookie whose grace period + * has already completed. + */ +void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = RCU_GET_STATE_COMPLETED; + rgosp->rgos_exp = RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full); + +/** * get_state_synchronize_rcu - Snapshot current RCU state * * Returns a cookie that is used by a later call to cond_synchronize_rcu() @@ -3806,26 +3562,47 @@ unsigned long get_state_synchronize_rcu(void) * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_state.gp_seq); + return rcu_seq_snap(&rcu_state.gp_seq_polled); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); /** - * start_poll_synchronize_rcu - Snapshot and start RCU grace period + * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited + * @rgosp: location to place combined normal/expedited grace-period state * - * Returns a cookie that is used by a later call to cond_synchronize_rcu() - * or poll_state_synchronize_rcu() to determine whether or not a full - * grace period has elapsed in the meantime. If the needed grace period - * is not already slated to start, notifies RCU core of the need for that - * grace period. + * Places the normal and expedited grace-period states in @rgosp. This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * The rcu_gp_oldstate structure takes up twice the memory of an unsigned + * long, but is guaranteed to see all grace periods. In contrast, the + * combined state occupies less memory, but can sometimes fail to take + * grace periods into account. * - * Interrupts must be enabled for the case where it is necessary to awaken - * the grace-period kthread. + * This does not guarantee that the needed grace period will actually + * start. */ -unsigned long start_poll_synchronize_rcu(void) +void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + struct rcu_node *rnp = rcu_get_root(); + + /* + * Any prior manipulation of RCU-protected data must happen + * before the loads from ->gp_seq and ->expedited_sequence. + */ + smp_mb(); /* ^^^ */ + rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); + +/* + * Helper function for start_poll_synchronize_rcu() and + * start_poll_synchronize_rcu_full(). + */ +static void start_poll_synchronize_rcu_common(void) { unsigned long flags; - unsigned long gp_seq = get_state_synchronize_rcu(); bool needwake; struct rcu_data *rdp; struct rcu_node *rnp; @@ -3835,21 +3612,67 @@ unsigned long start_poll_synchronize_rcu(void) rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); // irqs already disabled. - needwake = rcu_start_this_gp(rnp, rdp, gp_seq); + // Note it is possible for a grace period to have elapsed between + // the above call to get_state_synchronize_rcu() and the below call + // to rcu_seq_snap. This is OK, the worst that happens is that we + // get a grace period that no one needed. These accesses are ordered + // by smp_mb(), and we are accessing them in the opposite order + // from which they are updated at grace-period start, as required. + needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq)); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(); +} + +/** + * start_poll_synchronize_rcu - Snapshot and start RCU grace period + * + * Returns a cookie that is used by a later call to cond_synchronize_rcu() + * or poll_state_synchronize_rcu() to determine whether or not a full + * grace period has elapsed in the meantime. If the needed grace period + * is not already slated to start, notifies RCU core of the need for that + * grace period. + * + * Interrupts must be enabled for the case where it is necessary to awaken + * the grace-period kthread. + */ +unsigned long start_poll_synchronize_rcu(void) +{ + unsigned long gp_seq = get_state_synchronize_rcu(); + + start_poll_synchronize_rcu_common(); return gp_seq; } EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); /** - * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period + * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full() * + * Places the normal and expedited grace-period states in *@rgos. This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * If the needed grace period is not already slated to start, notifies + * RCU core of the need for that grace period. + * + * Interrupts must be enabled for the case where it is necessary to awaken + * the grace-period kthread. + */ +void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + get_state_synchronize_rcu_full(rgosp); + + start_poll_synchronize_rcu_common(); +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full); + +/** + * poll_state_synchronize_rcu - Has the specified RCU grace period completed? * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() * * If a full RCU grace period has elapsed since the earlier call from - * which oldstate was obtained, return @true, otherwise return @false. + * which @oldstate was obtained, return @true, otherwise return @false. * If @false is returned, it is the caller's responsibility to invoke this * function later on until it does return @true. Alternatively, the caller * can explicitly wait for a grace period, for example, by passing @oldstate @@ -3857,11 +3680,12 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. If the counter wraps, we have waited for - * more than 2 billion grace periods (and way more on a 64-bit system!). - * Those needing to keep oldstate values for very long time periods - * (many hours even on 32-bit systems) should check them occasionally - * and either refresh them or set a flag indicating that the grace period - * has completed. + * more than a billion grace periods (and way more on a 64-bit system!). + * Those needing to keep old state values for very long time periods + * (many hours even on 32-bit systems) should check them occasionally and + * either refresh them or set a flag indicating that the grace period has + * completed. Alternatively, they can use get_completed_synchronize_rcu() + * to get a guaranteed-completed grace-period state. * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call @@ -3870,7 +3694,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); */ bool poll_state_synchronize_rcu(unsigned long oldstate) { - if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) { + if (oldstate == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) { smp_mb(); /* Ensure GP ends before subsequent accesses. */ return true; } @@ -3879,22 +3704,70 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); /** - * cond_synchronize_rcu - Conditionally wait for an RCU grace period + * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed? + * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full() * - * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() + * If a full RCU grace period has elapsed since the earlier call from + * which *rgosp was obtained, return @true, otherwise return @false. + * If @false is returned, it is the caller's responsibility to invoke this + * function later on until it does return @true. Alternatively, the caller + * can explicitly wait for a grace period, for example, by passing @rgosp + * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited + * for more than a billion grace periods (and way more on a 64-bit + * system!). Those needing to keep rcu_gp_oldstate values for very + * long time periods (many hours even on 32-bit systems) should check + * them occasionally and either refresh them or set a flag indicating + * that the grace period has completed. Alternatively, they can use + * get_completed_synchronize_rcu_full() to get a guaranteed-completed + * grace-period state. + * + * This function provides the same memory-ordering guarantees that would + * be provided by a synchronize_rcu() that was invoked at the call to + * the function that provided @rgosp, and that returned at the end of this + * function. And this guarantee requires that the root rcu_node structure's + * ->gp_seq field be checked instead of that of the rcu_state structure. + * The problem is that the just-ending grace-period's callbacks can be + * invoked between the time that the root rcu_node structure's ->gp_seq + * field is updated and the time that the rcu_state structure's ->gp_seq + * field is updated. Therefore, if a single synchronize_rcu() is to + * cause a subsequent poll_state_synchronize_rcu_full() to return @true, + * then the root rcu_node structure is the one that needs to be polled. + */ +bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + struct rcu_node *rnp = rcu_get_root(); + + smp_mb(); // Order against root rcu_node structure grace-period cleanup. + if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) || + rgosp->rgos_exp == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) { + smp_mb(); /* Ensure GP ends before subsequent accesses. */ + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full); + +/** + * cond_synchronize_rcu - Conditionally wait for an RCU grace period + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() * * If a full RCU grace period has elapsed since the earlier call to * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return. * Otherwise, invoke synchronize_rcu() to wait for a full grace period. * - * Yes, this function does not take counter wrap into account. But - * counter wrap is harmless. If the counter wraps, we have waited for + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for * more than 2 billion grace periods (and way more on a 64-bit system!), - * so waiting for one additional grace period should be just fine. + * so waiting for a couple of additional grace periods should be just fine. * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call - * to the function that provided @oldstate, and that returned at the end + * to the function that provided @oldstate and that returned at the end * of this function. */ void cond_synchronize_rcu(unsigned long oldstate) @@ -3904,6 +3777,33 @@ void cond_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); +/** + * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full() + * + * If a full RCU grace period has elapsed since the call to + * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), + * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was + * obtained, just return. Otherwise, invoke synchronize_rcu() to wait + * for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @rgosp and that returned at the end of + * this function. + */ +void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + if (!poll_state_synchronize_rcu_full(rgosp)) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full); + /* * Check to see if there is any immediate RCU-related work to be done by * the current CPU, returning 1 if so and zero otherwise. The checks are @@ -4167,18 +4067,20 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu) { + struct context_tracking *ct = this_cpu_ptr(&context_tracking); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); - WARN_ON_ONCE(rdp->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); + WARN_ON_ONCE(ct->dynticks_nesting != 1); + WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; rdp->rcu_onl_gp_seq = rcu_state.gp_seq; rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; + rdp->last_sched_clock = jiffies; rdp->cpu = cpu; rcu_boot_init_nocb_percpu_data(rdp); } @@ -4196,6 +4098,7 @@ rcu_boot_init_percpu_data(int cpu) int rcutree_prepare_cpu(unsigned int cpu) { unsigned long flags; + struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rcu_get_root(); @@ -4204,7 +4107,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); rdp->blimit = blimit; - rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ + ct->dynticks_nesting = 1; /* CPU not up, no tearing. */ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* @@ -4386,6 +4289,7 @@ void rcu_report_dead(unsigned int cpu) rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ + rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } @@ -4431,6 +4335,7 @@ void rcutree_migrate_callbacks(int cpu) needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_disable(&rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); + check_cb_ovld_locked(my_rdp, my_rnp); if (rcu_rdp_is_offloaded(my_rdp)) { raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ __call_rcu_nocb_wake(my_rdp, true, flags); @@ -4471,6 +4376,51 @@ static int rcu_pm_notify(struct notifier_block *self, return NOTIFY_OK; } +#ifdef CONFIG_RCU_EXP_KTHREAD +struct kthread_worker *rcu_exp_gp_kworker; +struct kthread_worker *rcu_exp_par_gp_kworker; + +static void __init rcu_start_exp_gp_kworkers(void) +{ + const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; + const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", gp_kworker_name); + return; + } + + rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { + pr_err("Failed to create %s!\n", par_gp_kworker_name); + kthread_destroy_worker(rcu_exp_gp_kworker); + return; + } + + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, + ¶m); +} + +static inline void rcu_alloc_par_gp_wq(void) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +struct workqueue_struct *rcu_par_gp_wq; + +static void __init rcu_start_exp_gp_kworkers(void) +{ +} + +static inline void rcu_alloc_par_gp_wq(void) +{ + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Spawn the kthreads that handle RCU's grace periods. */ @@ -4480,6 +4430,7 @@ static int __init rcu_spawn_gp_kthread(void) struct rcu_node *rnp; struct sched_param sp; struct task_struct *t; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); @@ -4497,9 +4448,17 @@ static int __init rcu_spawn_gp_kthread(void) smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); - rcu_spawn_nocb_kthreads(); - rcu_spawn_boost_kthreads(); + /* This is a pre-SMP initcall, we expect a single CPU */ + WARN_ON(num_online_cpus() > 1); + /* + * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu() + * due to rcu_scheduler_fully_active. + */ + rcu_spawn_cpu_nocb_kthread(smp_processor_id()); + rcu_spawn_one_boost_kthread(rdp->mynode); rcu_spawn_core_kthreads(); + /* Create kthread worker for expedited GPs */ + rcu_start_exp_gp_kworkers(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -4516,9 +4475,20 @@ early_initcall(rcu_spawn_gp_kthread); */ void rcu_scheduler_starting(void) { + unsigned long flags; + struct rcu_node *rnp; + WARN_ON(num_online_cpus() != 1); WARN_ON(nr_context_switches() > 0); rcu_test_sync_prims(); + + // Fix up the ->gp_seq counters. + local_irq_save(flags); + rcu_for_each_node_breadth_first(rnp) + rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq; + local_irq_restore(flags); + + // Switch out of early boot mode. rcu_scheduler_active = RCU_SCHEDULER_INIT; rcu_test_sync_prims(); } @@ -4592,6 +4562,9 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); mutex_init(&rnp->boost_kthread_mutex); + raw_spin_lock_init(&rnp->exp_poll_lock); + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp); } } @@ -4745,7 +4718,6 @@ static void __init rcu_dump_rcu_node_tree(void) } struct workqueue_struct *rcu_gp_wq; -struct workqueue_struct *rcu_par_gp_wq; static void __init kfree_rcu_batch_init(void) { @@ -4776,13 +4748,13 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } - if (register_shrinker(&kfree_rcu_shrinker)) + if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree")) pr_err("Failed to register kfree_rcu() shrinker!\n"); } void __init rcu_init(void) { - int cpu; + int cpu = smp_processor_id(); rcu_early_boot_tests(); @@ -4802,17 +4774,15 @@ void __init rcu_init(void) * or the scheduler are operational. */ pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) { - rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); - rcutree_online_cpu(cpu); - } + WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. + rcutree_prepare_cpu(cpu); + rcu_cpu_starting(cpu); + rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); + rcu_alloc_par_gp_wq(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ @@ -4820,6 +4790,10 @@ void __init rcu_init(void) qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; else qovld_calc = qovld; + + // Kick-start any polled grace periods that started early. + if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1)) + (void)start_poll_synchronize_rcu_expedited(); } #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 926673ebe355..d4a97e40ea9c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -10,6 +10,7 @@ */ #include <linux/cache.h> +#include <linux/kthread.h> #include <linux/spinlock.h> #include <linux/rtmutex.h> #include <linux/threads.h> @@ -23,7 +24,11 @@ /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { unsigned long rew_s; +#ifdef CONFIG_RCU_EXP_KTHREAD + struct kthread_work rew_work; +#else struct work_struct rew_work; +#endif /* CONFIG_RCU_EXP_KTHREAD */ }; /* RCU's kthread states for tracing. */ @@ -128,6 +133,10 @@ struct rcu_node { wait_queue_head_t exp_wq[4]; struct rcu_exp_work rew; bool exp_need_flush; /* Need to flush workitem? */ + raw_spinlock_t exp_poll_lock; + /* Lock and data for polled expedited grace periods. */ + unsigned long exp_seq_poll_rq; + struct work_struct exp_poll_wq; } ____cacheline_internodealigned_in_smp; /* @@ -182,9 +191,6 @@ struct rcu_data { /* 3) dynticks interface. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ - long dynticks_nesting; /* Track process nesting level. */ - long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ - atomic_t dynticks; /* Even value for idle, else odd. */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ @@ -230,6 +236,7 @@ struct rcu_data { * if rdp_gp. */ struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */ + struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */ /* The following fields are used by CB kthread, hence new cacheline. */ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; @@ -254,6 +261,7 @@ struct rcu_data { unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ short rcu_onl_gp_flags; /* ->gp_flags at last online. */ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ + unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ int cpu; }; @@ -317,6 +325,9 @@ struct rcu_state { short gp_state; /* GP kthread sleep state. */ unsigned long gp_wake_time; /* Last GP kthread wake. */ unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ + unsigned long gp_seq_polled; /* GP seq for polled API. */ + unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */ + unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */ /* End of fields guarded by root rcu_node's lock. */ @@ -364,6 +375,7 @@ struct rcu_state { arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ + int nocb_is_setup; /* nocb is setup from boot */ }; /* Values for rcu_state structure's gp_flags field. */ @@ -418,13 +430,11 @@ static void rcu_flavor_sched_clock_irq(int user); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static bool rcu_is_callbacks_kthread(void); +static bool rcu_is_callbacks_kthread(struct rcu_data *rdp); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); -static void __init rcu_spawn_boost_kthreads(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); -static void rcu_preempt_deferred_qs(struct task_struct *t); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); @@ -439,7 +449,6 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); -static void __init rcu_spawn_nocb_kthreads(void); static void show_rcu_nocb_state(struct rcu_data *rdp); static void rcu_nocb_lock(struct rcu_data *rdp); static void rcu_nocb_unlock(struct rcu_data *rdp); @@ -465,10 +474,6 @@ do { \ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); -static void rcu_dynticks_task_enter(void); -static void rcu_dynticks_task_exit(void); -static void rcu_dynticks_task_trace_enter(void); -static void rcu_dynticks_task_trace_exit(void); /* Forward declarations for tree_stall.h */ static void record_gp_stall_check_time(void); @@ -476,3 +481,6 @@ static void rcu_iw_handler(struct irq_work *iwp); static void check_cpu_stall(struct rcu_data *rdp); static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, const unsigned long gpssdelay); + +/* Forward declarations for tree_exp.h. */ +static void sync_rcu_do_polled_gp(struct work_struct *wp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 60197ea24ceb..18e9b4cd78ef 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -18,6 +18,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_exp_gp_seq_start(void) { rcu_seq_start(&rcu_state.expedited_sequence); + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); } /* @@ -34,6 +35,7 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) */ static void rcu_exp_gp_seq_end(void) { + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); rcu_seq_end(&rcu_state.expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } @@ -334,15 +336,13 @@ fastpath: * Select the CPUs within the specified rcu_node that the upcoming * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) { int cpu; unsigned long flags; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -358,7 +358,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) !(rnp->qsmaskinitnext & mask)) { mask_ofl_test |= mask; } else { - snap = rcu_dynticks_snap(rdp); + snap = rcu_dynticks_snap(cpu); if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; else @@ -417,13 +417,119 @@ retry_ipi: rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } +static void rcu_exp_sel_wait_wake(unsigned long s); + +#ifdef CONFIG_RCU_EXP_KTHREAD +static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_par_gp_kworker); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* + * Use rcu_exp_par_gp_kworker, because flushing a work item from + * another work item on the same kthread worker can result in + * deadlock. + */ + kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + kthread_flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + kthread_init_work(&rew->rew_work, wait_rcu_exp_gp); + kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_gp_par_worker_started(void) +{ + return !!READ_ONCE(rcu_par_gp_wq); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); + + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* If all offline, queue the work on an unbound CPU. */ + if (unlikely(cpu > rnp->grphi - rnp->grplo)) + cpu = WORK_CPU_UNBOUND; + else + cpu += rnp->grplo; + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct work_struct *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp); + queue_work(rcu_gp_wq, &rew->rew_work); +} + +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) +{ + destroy_work_on_stack(&rew->rew_work); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Select the nodes that the upcoming expedited grace period needs * to wait for. */ static void sync_rcu_exp_select_cpus(void) { - int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset")); @@ -435,28 +541,21 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!READ_ONCE(rcu_par_gp_wq) || + if (!rcu_gp_par_worker_started() || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { - /* No workqueues yet or last leaf, do direct call. */ + /* No worker started yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); + sync_rcu_exp_select_cpus_queue_work(rnp); rnp->exp_need_flush = true; } - /* Wait for workqueue jobs (if any) to complete. */ + /* Wait for jobs (if any) to complete. */ rcu_for_each_leaf_node(rnp) if (rnp->exp_need_flush) - flush_work(&rnp->rew.rew_work); + sync_rcu_exp_select_cpus_flush_work(rnp); } /* @@ -496,7 +595,7 @@ static void synchronize_rcu_expedited_wait(void) struct rcu_node *rnp_root = rcu_get_root(); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); - jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_stall = rcu_exp_jiffies_till_stall_check(); jiffies_start = jiffies; if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) { if (synchronize_rcu_expedited_wait_once(1)) @@ -524,7 +623,6 @@ static void synchronize_rcu_expedited_wait(void) return; if (rcu_stall_is_suppressed()) continue; - panic_on_rcu_stall(); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); @@ -539,10 +637,11 @@ static void synchronize_rcu_expedited_wait(void) continue; ndetected++; rdp = per_cpu_ptr(&rcu_data, cpu); - pr_cont(" %d-%c%c%c", cpu, + pr_cont(" %d-%c%c%c%c", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], + "D."[!!(rdp->cpu_no_qs.b.exp)]); } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", @@ -571,7 +670,8 @@ static void synchronize_rcu_expedited_wait(void) dump_cpu_task(cpu); } } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; + panic_on_rcu_stall(); } } @@ -622,17 +722,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s) rcu_exp_wait_wake(s); } -/* - * Work-queue handler to drive an expedited grace period forward. - */ -static void wait_rcu_exp_gp(struct work_struct *wp) -{ - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_s); -} - #ifdef CONFIG_PREEMPT_RCU /* @@ -739,11 +828,13 @@ static void rcu_exp_handler(void *unused) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; - if (rcu_is_cpu_rrupt_from_idle()) { + if (rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) { rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); return; } @@ -817,6 +908,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) void synchronize_rcu_expedited(void) { bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); + unsigned long flags; struct rcu_exp_work rew; struct rcu_node *rnp; unsigned long s; @@ -827,8 +919,21 @@ void synchronize_rcu_expedited(void) "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); /* Is the state is such that the call is a grace period? */ - if (rcu_blocking_is_gp()) - return; + if (rcu_blocking_is_gp()) { + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs + // at process level. Therefore, this expedited GP overlaps + // with other expedited GPs only by being fully nested within + // them, which allows reuse of ->gp_seq_polled_exp_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); + + local_irq_save(flags); + WARN_ON_ONCE(num_online_cpus() > 1); + rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT); + local_irq_restore(flags); + return; // Context allows vacuous grace periods. + } /* If expedited grace periods are prohibited, fall back to normal. */ if (rcu_gp_is_normal()) { @@ -848,20 +953,154 @@ void synchronize_rcu_expedited(void) } else { /* Marshall arguments & schedule the expedited grace period. */ rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); + synchronize_rcu_expedited_queue_work(&rew); } /* Wait for expedited grace period to complete. */ rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ + smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); if (likely(!boottime)) - destroy_work_on_stack(&rew.rew_work); + synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Ensure that start_poll_synchronize_rcu_expedited() has the expedited + * RCU grace periods that it needs. + */ +static void sync_rcu_do_polled_gp(struct work_struct *wp) +{ + unsigned long flags; + int i = 0; + struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq); + unsigned long s; + + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + if (s == RCU_GET_STATE_COMPLETED) + return; + while (!poll_state_synchronize_rcu(s)) { + synchronize_rcu_expedited(); + if (i == 10 || i == 20) + pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled)); + i++; + } + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + if (poll_state_synchronize_rcu(s)) + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); +} + +/** + * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period + * + * Returns a cookie to pass to a call to cond_synchronize_rcu(), + * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(), + * allowing them to determine whether or not any sort of grace period has + * elapsed in the meantime. If the needed expedited grace period is not + * already slated to start, initiates that grace period. + */ +unsigned long start_poll_synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + unsigned long s; + + s = get_state_synchronize_rcu(); + rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); + rnp = rdp->mynode; + if (rcu_init_invoked()) + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + if (!poll_state_synchronize_rcu(s)) { + rnp->exp_seq_poll_rq = s; + if (rcu_init_invoked()) + queue_work(rcu_gp_wq, &rnp->exp_poll_wq); + } + if (rcu_init_invoked()) + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + + return s; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited); + +/** + * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period + * @rgosp: Place to put snapshot of grace-period state + * + * Places the normal and expedited grace-period states in rgosp. This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * If the needed expedited grace period is not already slated to start, + * initiates that grace period. + */ +void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) +{ + get_state_synchronize_rcu_full(rgosp); + (void)start_poll_synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full); + +/** + * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period + * + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() + * + * If any type of full RCU grace period has elapsed since the earlier + * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(), + * or start_poll_synchronize_rcu_expedited(), just return. Otherwise, + * invoke synchronize_rcu_expedited() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate and that returned at the end + * of this function. + */ +void cond_synchronize_rcu_expedited(unsigned long oldstate) +{ + if (!poll_state_synchronize_rcu(oldstate)) + synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited); + +/** + * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full() + * + * If a full RCU grace period has elapsed since the call to + * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), + * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was + * obtained, just return. Otherwise, invoke synchronize_rcu_expedited() + * to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @rgosp and that returned at the end of + * this function. + */ +void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) +{ + if (!poll_state_synchronize_rcu_full(rgosp)) + synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 636d0546a4e9..0a5f0ef41484 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -60,9 +60,6 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. * If the list is invalid, a warning is emitted and all CPUs are offloaded. */ - -static bool rcu_nocb_is_setup; - static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); @@ -72,7 +69,7 @@ static int __init rcu_nocb_setup(char *str) cpumask_setall(rcu_nocb_mask); } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; return 1; } __setup("rcu_nocbs", rcu_nocb_setup); @@ -215,14 +212,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -/* Is the specified CPU a no-CBs CPU? */ -bool rcu_is_nocb_cpu(int cpu) -{ - if (cpumask_available(rcu_nocb_mask)) - return cpumask_test_cpu(cpu, rcu_nocb_mask); - return false; -} - static bool __wake_nocb_gp(struct rcu_data *rdp_gp, struct rcu_data *rdp, bool force, unsigned long flags) @@ -557,52 +546,51 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, } } -/* - * Check if we ignore this rdp. - * - * We check that without holding the nocb lock but - * we make sure not to miss a freshly offloaded rdp - * with the current ordering: - * - * rdp_offload_toggle() nocb_gp_enabled_cb() - * ------------------------- ---------------------------- - * WRITE flags LOCK nocb_gp_lock - * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep - * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock - * UNLOCK nocb_gp_lock READ flags - */ -static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - -static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, - bool *needwake_state) +static int nocb_gp_toggle_rdp(struct rcu_data *rdp, + bool *wake_state) { struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - } - return false; + rcu_nocb_lock_irqsave(rdp, flags); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * Offloading. Set our flag and notify the offload worker. + * We will handle this rdp until it ever gets de-offloaded. + */ + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *wake_state = true; + ret = 1; + } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *wake_state = true; + ret = 0; + } else { + WARN_ON_ONCE(1); + ret = -1; } - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We will ignore this rdp until it ever gets re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - return true; + rcu_nocb_unlock_irqrestore(rdp, flags); + + return ret; } +static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) +{ + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); + swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); +} /* * No-CBs GP kthreads come here to wait for additional callbacks to show up @@ -620,7 +608,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bool needwait_gp = false; // This prevents actual uninitialized use. bool needwake; bool needwake_gp; - struct rcu_data *rdp; + struct rcu_data *rdp, *rdp_toggling = NULL; struct rcu_node *rnp; unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. bool wasempty = false; @@ -645,19 +633,10 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * is added to the list, so the skipped-over rcu_data structures * won't be ignored for long. */ - list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) { - bool needwake_state = false; - - if (!nocb_gp_enabled_cb(rdp)) - continue; + list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); - if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - continue; - } + lockdep_assert_held(&rdp->nocb_lock); bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); if (bypass_ncbs && (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || @@ -667,8 +646,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); continue; /* No callbacks here, try next. */ } if (bypass_ncbs) { @@ -716,8 +693,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (needwake_gp) rcu_gp_kthread_wake(); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); } my_rdp->nocb_gp_bypass = bypass; @@ -734,13 +709,19 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) /* Polling, so trace if first poll in the series. */ if (gotcbs) trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_idle(1); + if (list_empty(&my_rdp->nocb_head_rdp)) { + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + if (!my_rdp->nocb_toggling_rdp) + WRITE_ONCE(my_rdp->nocb_gp_sleep, true); + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + /* Wait for any offloading rdp */ + nocb_gp_sleep(my_rdp, cpu); + } else { + schedule_timeout_idle(1); + } } else if (!needwait_gp) { /* Wait for callbacks to appear. */ - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); - swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, - !READ_ONCE(my_rdp->nocb_gp_sleep)); - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); + nocb_gp_sleep(my_rdp, cpu); } else { rnp = my_rdp->mynode; trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); @@ -750,15 +731,49 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) !READ_ONCE(my_rdp->nocb_gp_sleep)); trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); } + if (!rcu_nocb_poll) { raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + // (De-)queue an rdp to/from the group if its nocb state is changing + rdp_toggling = my_rdp->nocb_toggling_rdp; + if (rdp_toggling) + my_rdp->nocb_toggling_rdp = NULL; + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); del_timer(&my_rdp->nocb_timer); } WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } else { + rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp); + if (rdp_toggling) { + /* + * Paranoid locking to make sure nocb_toggling_rdp is well + * reset *before* we (re)set SEGCBLIST_KTHREAD_GP or we could + * race with another round of nocb toggling for this rdp. + * Nocb locking should prevent from that already but we stick + * to paranoia, especially in rare path. + */ + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + my_rdp->nocb_toggling_rdp = NULL; + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } + } + + if (rdp_toggling) { + bool wake_state = false; + int ret; + + ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state); + if (ret == 1) + list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); + else if (ret == 0) + list_del(&rdp_toggling->nocb_entry_rdp); + if (wake_state) + swake_up_one(&rdp_toggling->nocb_state_wq); } + my_rdp->nocb_gp_seq = -1; WARN_ON(signal_pending(current)); } @@ -977,16 +992,15 @@ static int rdp_offload_toggle(struct rcu_data *rdp, swake_up_one(&rdp->nocb_cb_wq); raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + // Queue this rdp for add/del to/from the list to iterate on rcuog + WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp); if (rdp_gp->nocb_gp_sleep) { rdp_gp->nocb_gp_sleep = false; wake_gp = true; } raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); - - return 0; + return wake_gp; } static long rcu_nocb_rdp_deoffload(void *arg) @@ -994,9 +1008,15 @@ static long rcu_nocb_rdp_deoffload(void *arg) struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; + int wake_gp; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + /* + * rcu_nocb_rdp_deoffload() may be called directly if + * rcuog/o[p] spawn failed, because at this time the rdp->cpu + * is not online yet. + */ + WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu)); pr_info("De-offloading %d\n", rdp->cpu); @@ -1020,12 +1040,41 @@ static long rcu_nocb_rdp_deoffload(void *arg) */ rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); invoke_rcu_core(); - ret = rdp_offload_toggle(rdp, false, flags); - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | - SEGCBLIST_KTHREAD_GP)); - /* Stop nocb_gp_wait() from iterating over this structure. */ - list_del_rcu(&rdp->nocb_entry_rdp); + wake_gp = rdp_offload_toggle(rdp, false, flags); + + mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); + if (rdp_gp->nocb_gp_kthread) { + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); + + /* + * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB. + * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog. + */ + if (!rdp->nocb_cb_kthread) { + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); + rcu_nocb_unlock_irqrestore(rdp, flags); + } + + swait_event_exclusive(rdp->nocb_state_wq, + !rcu_segcblist_test_flags(cblist, + SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + } else { + /* + * No kthread to clear the flags for us or remove the rdp from the nocb list + * to iterate. Do it here instead. Locking doesn't look stricly necessary + * but we stick to paranoia in this rare path. + */ + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_clear_flags(&rdp->cblist, + SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_nocb_unlock_irqrestore(rdp, flags); + + list_del(&rdp->nocb_entry_rdp); + } + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); + /* * Lock one last time to acquire latest callback updates from kthreads * so we can later handle callbacks locally without locking. @@ -1046,7 +1095,7 @@ static long rcu_nocb_rdp_deoffload(void *arg) WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - return ret; + return 0; } int rcu_nocb_cpu_deoffload(int cpu) @@ -1054,20 +1103,20 @@ int rcu_nocb_cpu_deoffload(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int ret = 0; - mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); + mutex_lock(&rcu_state.barrier_mutex); if (rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); if (!ret) cpumask_clear_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); + pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu); ret = -EINVAL; } } - cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); + cpus_read_unlock(); return ret; } @@ -1078,7 +1127,8 @@ static long rcu_nocb_rdp_offload(void *arg) struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; + int wake_gp; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); /* @@ -1088,17 +1138,10 @@ static long rcu_nocb_rdp_offload(void *arg) if (!rdp->nocb_gp_rdp) return -EINVAL; - pr_info("Offloading %d\n", rdp->cpu); + if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread)) + return -EINVAL; - /* - * Cause future nocb_gp_wait() invocations to iterate over - * structure, resetting ->nocb_gp_sleep and waking up the related - * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock - * before setting ->nocb_gp_sleep again, we are guaranteed to - * iterate this newly added structure before "rcuog" goes to - * sleep again. - */ - list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp); + pr_info("Offloading %d\n", rdp->cpu); /* * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING @@ -1122,7 +1165,9 @@ static long rcu_nocb_rdp_offload(void *arg) * WRITE flags READ callbacks * rcu_nocb_unlock() rcu_nocb_unlock() */ - ret = rdp_offload_toggle(rdp, true, flags); + wake_gp = rdp_offload_toggle(rdp, true, flags); + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); @@ -1135,7 +1180,7 @@ static long rcu_nocb_rdp_offload(void *arg) rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); rcu_nocb_unlock_irqrestore(rdp, flags); - return ret; + return 0; } int rcu_nocb_cpu_offload(int cpu) @@ -1143,20 +1188,20 @@ int rcu_nocb_cpu_offload(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int ret = 0; - mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); + mutex_lock(&rcu_state.barrier_mutex); if (!rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); if (!ret) cpumask_set_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Can't CB-offload an offline CPU\n"); + pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu); ret = -EINVAL; } } - cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); + cpus_read_unlock(); return ret; } @@ -1166,11 +1211,21 @@ void __init rcu_init_nohz(void) { int cpu; bool need_rcu_nocb_mask = false; + bool offload_all = false; struct rcu_data *rdp; +#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) + if (!rcu_state.nocb_is_setup) { + need_rcu_nocb_mask = true; + offload_all = true; + } +#endif /* #if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) */ + #if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) + if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) { need_rcu_nocb_mask = true; + offload_all = false; /* NO_HZ_FULL has its own mask. */ + } #endif /* #if defined(CONFIG_NO_HZ_FULL) */ if (need_rcu_nocb_mask) { @@ -1180,10 +1235,10 @@ void __init rcu_init_nohz(void) return; } } - rcu_nocb_is_setup = true; + rcu_state.nocb_is_setup = true; } - if (!rcu_nocb_is_setup) + if (!rcu_state.nocb_is_setup) return; #if defined(CONFIG_NO_HZ_FULL) @@ -1191,6 +1246,9 @@ void __init rcu_init_nohz(void) cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); #endif /* #if defined(CONFIG_NO_HZ_FULL) */ + if (offload_all) + cpumask_setall(rcu_nocb_mask); + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); cpumask_and(rcu_nocb_mask, cpu_possible_mask, @@ -1241,7 +1299,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) struct task_struct *t; struct sched_param sp; - if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup) + if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup) return; /* If there already is an rcuo kthread, then nothing to do. */ @@ -1257,7 +1315,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) "rcuog/%d", rdp_gp->cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - return; + goto end; } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); if (kthread_prio) @@ -1269,28 +1327,21 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) t = kthread_run(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - return; + goto end; - if (kthread_prio) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); -} - -/* - * Once the scheduler is running, spawn rcuo kthreads for all online - * no-CBs CPUs. This assumes that the early_initcall()s happen before - * non-boot CPUs come online -- if this changes, we will need to add - * some mutual exclusion. - */ -static void __init rcu_spawn_nocb_kthreads(void) -{ - int cpu; - - if (rcu_nocb_is_setup) { - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); + return; +end: + mutex_lock(&rcu_state.barrier_mutex); + if (rcu_rdp_is_offloaded(rdp)) { + rcu_nocb_rdp_deoffload(rdp); + cpumask_clear_cpu(cpu, rcu_nocb_mask); } + mutex_unlock(&rcu_state.barrier_mutex); } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ @@ -1401,8 +1452,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) (long)rdp->nocb_gp_seq, rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, - show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); + rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread)); } /* Dump out nocb kthread state for the specified rcu_data structure. */ @@ -1446,7 +1497,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); /* It is OK for GP kthreads to have GP state. */ @@ -1549,10 +1600,6 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) { } -static void __init rcu_spawn_nocb_kthreads(void) -{ -} - static void show_rcu_nocb_state(struct rcu_data *rdp) { } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8360d86db1c0..e3142ee35fc6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -460,7 +460,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * be quite short, for example, in the case of the call from * rcu_read_unlock_special(). */ -static void +static notrace void rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) { bool empty_exp; @@ -486,6 +486,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } else { @@ -580,7 +581,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * is disabled. This function cannot be expected to understand these * nuances, so the caller must handle them. */ -static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) || READ_ONCE(t->rcu_read_unlock_special.s)) && @@ -594,7 +595,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) * evaluate safety in terms of interrupt, softirq, and preemption * disabling. */ -static void rcu_preempt_deferred_qs(struct task_struct *t) +notrace void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; @@ -640,7 +641,8 @@ static void rcu_read_unlock_special(struct task_struct *t) expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || (rdp->grpmask & READ_ONCE(rnp->expmask)) || - IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || + (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) || (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node); // Need to defer quiescent state until everything is enabled. @@ -660,7 +662,13 @@ static void rcu_read_unlock_special(struct task_struct *t) expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + IS_ENABLED(CONFIG_PREEMPT_RT)) + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( + rcu_preempt_deferred_qs_handler); + else + init_irq_work(&rdp->defer_qs_iw, + rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = true; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } @@ -711,9 +719,6 @@ static void rcu_flavor_sched_clock_irq(int user) struct task_struct *t = current; lockdep_assert_irqs_disabled(); - if (user || rcu_is_cpu_rrupt_from_idle()) { - rcu_note_voluntary_context_switch(current); - } if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ @@ -817,6 +822,7 @@ void rcu_read_unlock_strict(void) if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) return; rdp = this_cpu_ptr(&rcu_data); + rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } @@ -862,7 +868,7 @@ void rcu_all_qs(void) if (!raw_cpu_read(rcu_data.rcu_urgent_qs)) return; - preempt_disable(); + preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels /* Load rcu_urgent_qs before other flags. */ if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { preempt_enable(); @@ -892,8 +898,8 @@ void rcu_note_context_switch(bool preempt) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); - rcu_tasks_qs(current, preempt); out: + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -919,16 +925,19 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * Because there is no preemptible RCU, there can be no deferred quiescent * states. */ -static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return false; } -// Except that we do need to respond to a request by an expedited grace -// period for a quiescent state from this CPU. Note that requests from -// tasks are handled when removing the task from the blocked-tasks list -// below. -static void rcu_preempt_deferred_qs(struct task_struct *t) +// Except that we do need to respond to a request by an expedited +// grace period for a quiescent state from this CPU. Note that in +// non-preemptible kernels, there can be no context switches within RCU +// read-side critical sections, which in turn means that the leaf rcu_node +// structure's blocked-tasks list is always empty. is therefore no need to +// actually check it. Instead, a quiescent state from this CPU suffices, +// and this function is only called from such a quiescent state. +notrace void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -965,7 +974,6 @@ static void rcu_flavor_sched_clock_irq(int user) * neither access nor modify, at least not while the * corresponding CPU is online. */ - rcu_qs(); } } @@ -1005,6 +1013,25 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) WRITE_ONCE(rdp->rcuc_activity, jiffies); } +static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp) +{ +#ifdef CONFIG_RCU_NOCB_CPU + return rdp->nocb_cb_kthread == current; +#else + return false; +#endif +} + +/* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. + */ +static bool rcu_is_callbacks_kthread(struct rcu_data *rdp) +{ + return rdp->rcu_cpu_kthread_task == current || + rcu_is_callbacks_nocb_kthread(rdp); +} + #ifdef CONFIG_RCU_BOOST /* @@ -1124,7 +1151,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { raw_lockdep_assert_held_rcu_node(rnp); - if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + if (!rnp->boost_kthread_task || + (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -1132,7 +1160,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld || + IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) { if (rnp->exp_tasks == NULL) WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1143,15 +1172,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } } -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current; -} - #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* @@ -1219,25 +1239,16 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (cpumask_empty(cm)) + if (cpumask_empty(cm)) { cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); + if (outgoingcpu >= 0) + cpumask_clear_cpu(outgoingcpu, cm); + } set_cpus_allowed_ptr(t, cm); mutex_unlock(&rnp->boost_kthread_mutex); free_cpumask_var(cm); } -/* - * Spawn boost kthreads -- called as soon as the scheduler is running. - */ -static void __init rcu_spawn_boost_kthreads(void) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rnp) - if (rcu_rnp_online_cpus(rnp)) - rcu_spawn_one_boost_kthread(rnp); -} - #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1246,11 +1257,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } @@ -1263,10 +1269,6 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } -static void __init rcu_spawn_boost_kthreads(void) -{ -} - #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* @@ -1298,37 +1300,3 @@ static void rcu_bind_gp_kthread(void) return; housekeeping_affine(current, HK_TYPE_RCU); } - -/* Record the current task on dyntick-idle entry. */ -static __always_inline void rcu_dynticks_task_enter(void) -{ -#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) - WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); -#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ -} - -/* Record no current task on dyntick-idle exit. */ -static __always_inline void rcu_dynticks_task_exit(void) -{ -#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) - WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); -#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ -} - -/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ -static __always_inline void rcu_dynticks_task_trace_enter(void) -{ -#ifdef CONFIG_TASKS_TRACE_RCU - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - current->trc_reader_special.b.need_mb = true; -#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ -} - -/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ -static __always_inline void rcu_dynticks_task_trace_exit(void) -{ -#ifdef CONFIG_TASKS_TRACE_RCU - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - current->trc_reader_special.b.need_mb = false; -#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ -} diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0c5d8516516a..5653560573e2 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -25,6 +25,34 @@ int sysctl_max_rcu_stall_to_panic __read_mostly; #define RCU_STALL_MIGHT_DIV 8 #define RCU_STALL_MIGHT_MIN (2 * HZ) +int rcu_exp_jiffies_till_stall_check(void) +{ + int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout); + int exp_stall_delay_delta = 0; + int till_stall_check; + + // Zero says to use rcu_cpu_stall_timeout, but in milliseconds. + if (!cpu_stall_timeout) + cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check()); + + // Limit check must be consistent with the Kconfig limits for + // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range. + // The minimum clamped value is "2UL", because at least one full + // tick has to be guaranteed. + till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ); + + if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout) + WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check)); + +#ifdef CONFIG_PROVE_RCU + /* Add extra ~25% out of till_stall_check. */ + exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1; +#endif + + return till_stall_check + exp_stall_delay_delta; +} +EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check); + /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) { @@ -340,7 +368,7 @@ static void rcu_dump_cpu_stacks(void) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { if (cpu_is_offline(cpu)) pr_err("Offline CPU %d blocking current GP.\n", cpu); - else if (!trigger_single_cpu_backtrace(cpu)) + else dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -381,7 +409,19 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp) static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp) { - unsigned long j = jiffies - READ_ONCE(rdp->rcuc_activity); + int cpu; + struct task_struct *rcuc; + unsigned long j; + + rcuc = rdp->rcu_cpu_kthread_task; + if (!rcuc) + return false; + + cpu = task_cpu(rcuc); + if (cpu_is_offline(cpu) || idle_cpu(cpu)) + return false; + + j = jiffies - READ_ONCE(rdp->rcuc_activity); if (jp) *jp = j; @@ -406,6 +446,9 @@ static void print_cpu_stall_info(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; unsigned long ticks_value; + bool rcuc_starved; + unsigned long j; + char buf[32]; /* * We could be printing a lot while holding a spinlock. Avoid @@ -422,8 +465,11 @@ static void print_cpu_stall_info(int cpu) } delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && - rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)); + rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); + if (rcuc_starved) + sprintf(buf, " rcuc=%ld jiffies(starved)", j); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -432,36 +478,14 @@ static void print_cpu_stall_info(int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - rcu_dynticks_snap(rdp) & 0xfff, - rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, + rcu_dynticks_snap(cpu) & 0xffff, + ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + rcuc_starved ? buf : "", falsepositive ? " (false positive?)" : ""); } -static void rcuc_kthread_dump(struct rcu_data *rdp) -{ - int cpu; - unsigned long j; - struct task_struct *rcuc; - - rcuc = rdp->rcu_cpu_kthread_task; - if (!rcuc) - return; - - cpu = task_cpu(rcuc); - if (cpu_is_offline(cpu) || idle_cpu(cpu)) - return; - - if (!rcu_is_rcuc_kthread_starving(rdp, &j)) - return; - - pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j); - sched_show_task(rcuc); - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); -} - /* Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(void) { @@ -487,8 +511,7 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); } else { pr_err("Stack dump where RCU GP kthread last ran:\n"); - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); + dump_cpu_task(cpu); } } wake_up_process(gpk); @@ -565,9 +588,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", smp_processor_id(), (long)(jiffies - gps), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); if (ndetected) { rcu_dump_cpu_stacks(); @@ -626,16 +649,13 @@ static void print_cpu_stall(unsigned long gps) raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", + pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", jiffies - gps, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); - if (!use_softirq) - rcuc_kthread_dump(rdp); - rcu_dump_cpu_stacks(); raw_spin_lock_irqsave_rcu_node(rnp, flags); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 180ff9c41fa8..738842c4886b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -85,7 +85,7 @@ module_param(rcu_normal_after_boot, int, 0444); * and while lockdep is disabled. * * Note that if the CPU is in the idle loop from an RCU point of view (ie: - * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) + * that we are in the section between ct_idle_enter() and ct_idle_exit()) * then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are * in such a section, considering these as in extended quiescent state, @@ -506,6 +506,8 @@ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); +int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT; +module_param(rcu_exp_cpu_stall_timeout, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall @@ -514,6 +516,19 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); +/** + * get_completed_synchronize_rcu - Return a pre-completed polled state cookie + * + * Returns a value that will always be treated by functions like + * poll_state_synchronize_rcu() as a cookie whose grace period has already + * completed. + */ +unsigned long get_completed_synchronize_rcu(void) +{ + return RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu); + #ifdef CONFIG_PROVE_RCU /* diff --git a/kernel/reboot.c b/kernel/reboot.c index 6bcc5d6a6572..3c35445bf5ad 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -23,7 +23,7 @@ * this indicates whether you can reboot with ctrl-alt-del: the default is yes */ -int C_A_D = 1; +static int C_A_D = 1; struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); @@ -48,12 +48,20 @@ int reboot_cpu; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; +struct sys_off_handler { + struct notifier_block nb; + int (*sys_off_cb)(struct sys_off_data *data); + void *cb_data; + enum sys_off_mode mode; + bool blocking; + void *list; +}; + /* - * If set, this is used for preparing the system to power off. + * Temporary stub that prevents linkage failure while we're in process + * of removing all uses of legacy pm_power_off() around the kernel. */ - -void (*pm_power_off_prepare)(void); -EXPORT_SYMBOL_GPL(pm_power_off_prepare); +void __weak (*pm_power_off)(void); /** * emergency_restart - reboot the system @@ -281,6 +289,370 @@ void kernel_halt(void) } EXPORT_SYMBOL_GPL(kernel_halt); +/* + * Notifier list for kernel code which wants to be called + * to prepare system for power off. + */ +static BLOCKING_NOTIFIER_HEAD(power_off_prep_handler_list); + +/* + * Notifier list for kernel code which wants to be called + * to power off system. + */ +static ATOMIC_NOTIFIER_HEAD(power_off_handler_list); + +static int sys_off_notify(struct notifier_block *nb, + unsigned long mode, void *cmd) +{ + struct sys_off_handler *handler; + struct sys_off_data data = {}; + + handler = container_of(nb, struct sys_off_handler, nb); + data.cb_data = handler->cb_data; + data.mode = mode; + data.cmd = cmd; + + return handler->sys_off_cb(&data); +} + +static struct sys_off_handler platform_sys_off_handler; + +static struct sys_off_handler *alloc_sys_off_handler(int priority) +{ + struct sys_off_handler *handler; + gfp_t flags; + + /* + * Platforms like m68k can't allocate sys_off handler dynamically + * at the early boot time because memory allocator isn't available yet. + */ + if (priority == SYS_OFF_PRIO_PLATFORM) { + handler = &platform_sys_off_handler; + if (handler->cb_data) + return ERR_PTR(-EBUSY); + } else { + if (system_state > SYSTEM_RUNNING) + flags = GFP_ATOMIC; + else + flags = GFP_KERNEL; + + handler = kzalloc(sizeof(*handler), flags); + if (!handler) + return ERR_PTR(-ENOMEM); + } + + return handler; +} + +static void free_sys_off_handler(struct sys_off_handler *handler) +{ + if (handler == &platform_sys_off_handler) + memset(handler, 0, sizeof(*handler)); + else + kfree(handler); +} + +/** + * register_sys_off_handler - Register sys-off handler + * @mode: Sys-off mode + * @priority: Handler priority + * @callback: Callback function + * @cb_data: Callback argument + * + * Registers system power-off or restart handler that will be invoked + * at the step corresponding to the given sys-off mode. Handler's callback + * should return NOTIFY_DONE to permit execution of the next handler in + * the call chain or NOTIFY_STOP to break the chain (in error case for + * example). + * + * Multiple handlers can be registered at the default priority level. + * + * Only one handler can be registered at the non-default priority level, + * otherwise ERR_PTR(-EBUSY) is returned. + * + * Returns a new instance of struct sys_off_handler on success, or + * an ERR_PTR()-encoded error code otherwise. + */ +struct sys_off_handler * +register_sys_off_handler(enum sys_off_mode mode, + int priority, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + struct sys_off_handler *handler; + int err; + + handler = alloc_sys_off_handler(priority); + if (IS_ERR(handler)) + return handler; + + switch (mode) { + case SYS_OFF_MODE_POWER_OFF_PREPARE: + handler->list = &power_off_prep_handler_list; + handler->blocking = true; + break; + + case SYS_OFF_MODE_POWER_OFF: + handler->list = &power_off_handler_list; + break; + + case SYS_OFF_MODE_RESTART: + handler->list = &restart_handler_list; + break; + + default: + free_sys_off_handler(handler); + return ERR_PTR(-EINVAL); + } + + handler->nb.notifier_call = sys_off_notify; + handler->nb.priority = priority; + handler->sys_off_cb = callback; + handler->cb_data = cb_data; + handler->mode = mode; + + if (handler->blocking) { + if (priority == SYS_OFF_PRIO_DEFAULT) + err = blocking_notifier_chain_register(handler->list, + &handler->nb); + else + err = blocking_notifier_chain_register_unique_prio(handler->list, + &handler->nb); + } else { + if (priority == SYS_OFF_PRIO_DEFAULT) + err = atomic_notifier_chain_register(handler->list, + &handler->nb); + else + err = atomic_notifier_chain_register_unique_prio(handler->list, + &handler->nb); + } + + if (err) { + free_sys_off_handler(handler); + return ERR_PTR(err); + } + + return handler; +} +EXPORT_SYMBOL_GPL(register_sys_off_handler); + +/** + * unregister_sys_off_handler - Unregister sys-off handler + * @handler: Sys-off handler + * + * Unregisters given sys-off handler. + */ +void unregister_sys_off_handler(struct sys_off_handler *handler) +{ + int err; + + if (IS_ERR_OR_NULL(handler)) + return; + + if (handler->blocking) + err = blocking_notifier_chain_unregister(handler->list, + &handler->nb); + else + err = atomic_notifier_chain_unregister(handler->list, + &handler->nb); + + /* sanity check, shall never happen */ + WARN_ON(err); + + free_sys_off_handler(handler); +} +EXPORT_SYMBOL_GPL(unregister_sys_off_handler); + +static void devm_unregister_sys_off_handler(void *data) +{ + struct sys_off_handler *handler = data; + + unregister_sys_off_handler(handler); +} + +/** + * devm_register_sys_off_handler - Register sys-off handler + * @dev: Device that registers handler + * @mode: Sys-off mode + * @priority: Handler priority + * @callback: Callback function + * @cb_data: Callback argument + * + * Registers resource-managed sys-off handler. + * + * Returns zero on success, or error code on failure. + */ +int devm_register_sys_off_handler(struct device *dev, + enum sys_off_mode mode, + int priority, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + struct sys_off_handler *handler; + + handler = register_sys_off_handler(mode, priority, callback, cb_data); + if (IS_ERR(handler)) + return PTR_ERR(handler); + + return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler, + handler); +} +EXPORT_SYMBOL_GPL(devm_register_sys_off_handler); + +/** + * devm_register_power_off_handler - Register power-off handler + * @dev: Device that registers callback + * @callback: Callback function + * @cb_data: Callback's argument + * + * Registers resource-managed sys-off handler with a default priority + * and using power-off mode. + * + * Returns zero on success, or error code on failure. + */ +int devm_register_power_off_handler(struct device *dev, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + return devm_register_sys_off_handler(dev, + SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + callback, cb_data); +} +EXPORT_SYMBOL_GPL(devm_register_power_off_handler); + +/** + * devm_register_restart_handler - Register restart handler + * @dev: Device that registers callback + * @callback: Callback function + * @cb_data: Callback's argument + * + * Registers resource-managed sys-off handler with a default priority + * and using restart mode. + * + * Returns zero on success, or error code on failure. + */ +int devm_register_restart_handler(struct device *dev, + int (*callback)(struct sys_off_data *data), + void *cb_data) +{ + return devm_register_sys_off_handler(dev, + SYS_OFF_MODE_RESTART, + SYS_OFF_PRIO_DEFAULT, + callback, cb_data); +} +EXPORT_SYMBOL_GPL(devm_register_restart_handler); + +static struct sys_off_handler *platform_power_off_handler; + +static int platform_power_off_notify(struct sys_off_data *data) +{ + void (*platform_power_power_off_cb)(void) = data->cb_data; + + platform_power_power_off_cb(); + + return NOTIFY_DONE; +} + +/** + * register_platform_power_off - Register platform-level power-off callback + * @power_off: Power-off callback + * + * Registers power-off callback that will be called as last step + * of the power-off sequence. This callback is expected to be invoked + * for the last resort. Only one platform power-off callback is allowed + * to be registered at a time. + * + * Returns zero on success, or error code on failure. + */ +int register_platform_power_off(void (*power_off)(void)) +{ + struct sys_off_handler *handler; + + handler = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_PLATFORM, + platform_power_off_notify, + power_off); + if (IS_ERR(handler)) + return PTR_ERR(handler); + + platform_power_off_handler = handler; + + return 0; +} +EXPORT_SYMBOL_GPL(register_platform_power_off); + +/** + * unregister_platform_power_off - Unregister platform-level power-off callback + * @power_off: Power-off callback + * + * Unregisters previously registered platform power-off callback. + */ +void unregister_platform_power_off(void (*power_off)(void)) +{ + if (platform_power_off_handler && + platform_power_off_handler->cb_data == power_off) { + unregister_sys_off_handler(platform_power_off_handler); + platform_power_off_handler = NULL; + } +} +EXPORT_SYMBOL_GPL(unregister_platform_power_off); + +static int legacy_pm_power_off(struct sys_off_data *data) +{ + if (pm_power_off) + pm_power_off(); + + return NOTIFY_DONE; +} + +static void do_kernel_power_off_prepare(void) +{ + blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL); +} + +/** + * do_kernel_power_off - Execute kernel power-off handler call chain + * + * Expected to be called as last step of the power-off sequence. + * + * Powers off the system immediately if a power-off handler function has + * been registered. Otherwise does nothing. + */ +void do_kernel_power_off(void) +{ + struct sys_off_handler *sys_off = NULL; + + /* + * Register sys-off handlers for legacy PM callback. This allows + * legacy PM callbacks temporary co-exist with the new sys-off API. + * + * TODO: Remove legacy handlers once all legacy PM users will be + * switched to the sys-off based APIs. + */ + if (pm_power_off) + sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + legacy_pm_power_off, NULL); + + atomic_notifier_call_chain(&power_off_handler_list, 0, NULL); + + unregister_sys_off_handler(sys_off); +} + +/** + * kernel_can_power_off - check whether system can be powered off + * + * Returns true if power-off handler is registered and system can be + * powered off, false otherwise. + */ +bool kernel_can_power_off(void) +{ + return !atomic_notifier_call_chain_is_empty(&power_off_handler_list) || + pm_power_off; +} +EXPORT_SYMBOL_GPL(kernel_can_power_off); + /** * kernel_power_off - power_off the system * @@ -289,8 +661,7 @@ EXPORT_SYMBOL_GPL(kernel_halt); void kernel_power_off(void) { kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); + do_kernel_power_off_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); @@ -340,7 +711,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) cmd = LINUX_REBOOT_CMD_HALT; mutex_lock(&system_transition_mutex); @@ -417,7 +788,8 @@ void ctrl_alt_del(void) kill_cad_pid(SIGINT, 1); } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; +#define POWEROFF_CMD_PATH_LEN 256 +static char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; static const char reboot_cmd[] = "/sbin/reboot"; static int run_cmd(const char *cmd) @@ -867,6 +1239,33 @@ static struct attribute *reboot_attrs[] = { NULL, }; +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_reboot_table[] = { + { + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static void __init kernel_reboot_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_reboot_table); +} +#else +#define kernel_reboot_sysctls_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + static const struct attribute_group reboot_attr_group = { .attrs = reboot_attrs, }; @@ -886,6 +1285,8 @@ static int __init reboot_ksysfs_init(void) return ret; } + kernel_reboot_sysctls_init(); + return 0; } late_initcall(reboot_ksysfs_init); diff --git a/kernel/relay.c b/kernel/relay.c index d1a67fbb819d..6a611e779e95 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -440,7 +440,7 @@ int relay_prepare_cpu(unsigned int cpu) mutex_lock(&relay_channels_mutex); list_for_each_entry(chan, &relay_channels, list) { - if ((buf = *per_cpu_ptr(chan->buf, cpu))) + if (*per_cpu_ptr(chan->buf, cpu)) continue; buf = relay_open_buf(chan, cpu); if (!buf) { diff --git a/kernel/resource.c b/kernel/resource.c index 34eaee179689..4c5e80b92f2f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -489,8 +489,9 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); -static int __region_intersects(resource_size_t start, size_t size, - unsigned long flags, unsigned long desc) +static int __region_intersects(struct resource *parent, resource_size_t start, + size_t size, unsigned long flags, + unsigned long desc) { struct resource res; int type = 0; int other = 0; @@ -499,7 +500,7 @@ static int __region_intersects(resource_size_t start, size_t size, res.start = start; res.end = start + size - 1; - for (p = iomem_resource.child; p ; p = p->sibling) { + for (p = parent->child; p ; p = p->sibling) { bool is_type = (((p->flags & flags) == flags) && ((desc == IORES_DESC_NONE) || (desc == p->desc))); @@ -543,7 +544,7 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags, int ret; read_lock(&resource_lock); - ret = __region_intersects(start, size, flags, desc); + ret = __region_intersects(&iomem_resource, start, size, flags, desc); read_unlock(&resource_lock); return ret; @@ -891,6 +892,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) } write_unlock(&resource_lock); } +/* + * Not for general consumption, only early boot memory map parsing, PCI + * resource discovery, and late discovery of CXL resources are expected + * to use this interface. The former are built-in and only the latter, + * CXL, is a module. + */ +EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL); /** * remove_resource - Remove a resource in the resource tree @@ -1773,62 +1781,139 @@ void resource_list_free(struct list_head *head) } EXPORT_SYMBOL(resource_list_free); -#ifdef CONFIG_DEVICE_PRIVATE -static struct resource *__request_free_mem_region(struct device *dev, - struct resource *base, unsigned long size, const char *name) +#ifdef CONFIG_GET_FREE_REGION +#define GFR_DESCENDING (1UL << 0) +#define GFR_REQUEST_REGION (1UL << 1) +#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) + +static resource_size_t gfr_start(struct resource *base, resource_size_t size, + resource_size_t align, unsigned long flags) +{ + if (flags & GFR_DESCENDING) { + resource_size_t end; + + end = min_t(resource_size_t, base->end, + (1ULL << MAX_PHYSMEM_BITS) - 1); + return end - size + 1; + } + + return ALIGN(base->start, align); +} + +static bool gfr_continue(struct resource *base, resource_size_t addr, + resource_size_t size, unsigned long flags) +{ + if (flags & GFR_DESCENDING) + return addr > size && addr >= base->start; + /* + * In the ascend case be careful that the last increment by + * @size did not wrap 0. + */ + return addr > addr - size && + addr <= min_t(resource_size_t, base->end, + (1ULL << MAX_PHYSMEM_BITS) - 1); +} + +static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, + unsigned long flags) { - resource_size_t end, addr; + if (flags & GFR_DESCENDING) + return addr - size; + return addr + size; +} + +static void remove_free_mem_region(void *_res) +{ + struct resource *res = _res; + + if (res->parent) + remove_resource(res); + free_resource(res); +} + +static struct resource * +get_free_mem_region(struct device *dev, struct resource *base, + resource_size_t size, const unsigned long align, + const char *name, const unsigned long desc, + const unsigned long flags) +{ + resource_size_t addr; struct resource *res; struct region_devres *dr = NULL; - size = ALIGN(size, 1UL << PA_SECTION_SHIFT); - end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); - addr = end - size + 1UL; + size = ALIGN(size, align); res = alloc_resource(GFP_KERNEL); if (!res) return ERR_PTR(-ENOMEM); - if (dev) { + if (dev && (flags & GFR_REQUEST_REGION)) { dr = devres_alloc(devm_region_release, sizeof(struct region_devres), GFP_KERNEL); if (!dr) { free_resource(res); return ERR_PTR(-ENOMEM); } + } else if (dev) { + if (devm_add_action_or_reset(dev, remove_free_mem_region, res)) + return ERR_PTR(-ENOMEM); } write_lock(&resource_lock); - for (; addr > size && addr >= base->start; addr -= size) { - if (__region_intersects(addr, size, 0, IORES_DESC_NONE) != - REGION_DISJOINT) + for (addr = gfr_start(base, size, align, flags); + gfr_continue(base, addr, size, flags); + addr = gfr_next(addr, size, flags)) { + if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) != + REGION_DISJOINT) continue; - if (__request_region_locked(res, &iomem_resource, addr, size, - name, 0)) - break; + if (flags & GFR_REQUEST_REGION) { + if (__request_region_locked(res, &iomem_resource, addr, + size, name, 0)) + break; - if (dev) { - dr->parent = &iomem_resource; - dr->start = addr; - dr->n = size; - devres_add(dev, dr); - } + if (dev) { + dr->parent = &iomem_resource; + dr->start = addr; + dr->n = size; + devres_add(dev, dr); + } - res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; - write_unlock(&resource_lock); + res->desc = desc; + write_unlock(&resource_lock); + + + /* + * A driver is claiming this region so revoke any + * mappings. + */ + revoke_iomem(res); + } else { + res->start = addr; + res->end = addr + size - 1; + res->name = name; + res->desc = desc; + res->flags = IORESOURCE_MEM; + + /* + * Only succeed if the resource hosts an exclusive + * range after the insert + */ + if (__insert_resource(base, res) || res->child) + break; + + write_unlock(&resource_lock); + } - /* - * A driver is claiming this region so revoke any mappings. - */ - revoke_iomem(res); return res; } write_unlock(&resource_lock); - free_resource(res); - if (dr) + if (flags & GFR_REQUEST_REGION) { + free_resource(res); devres_free(dr); + } else if (dev) + devm_release_action(dev, remove_free_mem_region, res); return ERR_PTR(-ERANGE); } @@ -1847,18 +1932,48 @@ static struct resource *__request_free_mem_region(struct device *dev, struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size) { - return __request_free_mem_region(dev, base, size, dev_name(dev)); + unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; + + return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN, + dev_name(dev), + IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(devm_request_free_mem_region); struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name) { - return __request_free_mem_region(NULL, base, size, name); + unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; + + return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name, + IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(request_free_mem_region); -#endif /* CONFIG_DEVICE_PRIVATE */ +/** + * alloc_free_mem_region - find a free region relative to @base + * @base: resource that will parent the new resource + * @size: size in bytes of memory to allocate from @base + * @align: alignment requirements for the allocation + * @name: resource name + * + * Buses like CXL, that can dynamically instantiate new memory regions, + * need a method to allocate physical address space for those regions. + * Allocate and insert a new resource to cover a free, unclaimed by a + * descendant of @base, range in the span of @base. + */ +struct resource *alloc_free_mem_region(struct resource *base, + unsigned long size, unsigned long align, + const char *name) +{ + /* Default of ascending direction and insert resource */ + unsigned long flags = 0; + + return get_free_mem_region(NULL, base, size, align, name, + IORES_DESC_NONE, flags); +} +EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL); +#endif /* CONFIG_GET_FREE_REGION */ static int __init strict_iomem(char *str) { diff --git a/kernel/rseq.c b/kernel/rseq.c index 97ac20b4f738..bda8175f8f99 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -18,8 +18,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/rseq.h> -#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \ - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) +#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ + RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ + RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) /* * @@ -175,23 +176,15 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) u32 flags, event_mask; int ret; + if (WARN_ON_ONCE(cs_flags & RSEQ_CS_NO_RESTART_FLAGS) || cs_flags) + return -EINVAL; + /* Get thread flags. */ ret = get_user(flags, &t->rseq->flags); if (ret) return ret; - /* Take critical section flags into account. */ - flags |= cs_flags; - - /* - * Restart on signal can only be inhibited when restart on - * preempt and restart on migrate are inhibited too. Otherwise, - * a preempted signal handler could fail to restart the prior - * execution context on sigreturn. - */ - if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) && - (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) != - RSEQ_CS_PREEMPT_MIGRATE_FLAGS)) + if (WARN_ON_ONCE(flags & RSEQ_CS_NO_RESTART_FLAGS) || flags) return -EINVAL; /* @@ -203,7 +196,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) t->rseq_event_mask = 0; preempt_enable(); - return !!(event_mask & ~flags); + return !!event_mask; } static int clear_rseq_cs(struct task_struct *t) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index dcb0410950e4..5d113aa59e77 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -267,9 +267,10 @@ static void scf_handler(void *scfc_in) } this_cpu_inc(scf_invoked_count); if (longwait <= 0) { - if (!(r & 0xffc0)) + if (!(r & 0xffc0)) { udelay(r & 0x3f); - goto out; + goto out; + } } if (r & 0xfff) goto out; diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 16092b49ff6a..4ebaf97f7bd8 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -36,6 +36,7 @@ void __init autogroup_init(struct task_struct *init_task) kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; + sched_autogroup_sysctl_init(); } void autogroup_free(struct task_group *tg) @@ -219,7 +220,6 @@ void sched_autogroup_exit(struct signal_struct *sig) static int __init setup_autogroup(char *str) { sysctl_sched_autogroup_enabled = 0; - sched_autogroup_sysctl_init(); return 1; } diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index e0104b45029a..d9dc9ab3773f 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -15,6 +15,7 @@ /* Headers: */ #include <linux/sched/clock.h> #include <linux/sched/cputime.h> +#include <linux/sched/hotplug.h> #include <linux/sched/posix-timers.h> #include <linux/sched/rt.h> @@ -31,6 +32,7 @@ #include <uapi/linux/sched/types.h> #include "sched.h" +#include "smp.h" #include "autogroup.h" #include "stats.h" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index eec0849b2aae..99bdd96f454f 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -14,6 +14,7 @@ #include <linux/sched/debug.h> #include <linux/sched/isolation.h> #include <linux/sched/loadavg.h> +#include <linux/sched/nohz.h> #include <linux/sched/mm.h> #include <linux/sched/rseq_api.h> #include <linux/sched/task_stack.h> diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index d9272d9061a3..e374c0c923da 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -287,7 +287,7 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) + if (!try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; @@ -349,7 +349,7 @@ again: val = remote_clock; } - if (cmpxchg64(ptr, old_val, val) != old_val) + if (!try_cmpxchg64(ptr, &old_val, val)) goto again; return val; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d575b4914925..60fdc0faf1c9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,7 +26,10 @@ #include <linux/topology.h> #include <linux/sched/clock.h> #include <linux/sched/cond_resched.h> +#include <linux/sched/cputime.h> #include <linux/sched/debug.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/init.h> #include <linux/sched/isolation.h> #include <linux/sched/loadavg.h> #include <linux/sched/mm.h> @@ -70,6 +73,7 @@ #include <uapi/linux/sched/types.h> +#include <asm/irq_regs.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -88,7 +92,7 @@ #include "stats.h" #include "../workqueue_internal.h" -#include "../../fs/io-wq.h" +#include "../../io_uring/io-wq.h" #include "../smpboot.h" /* @@ -145,12 +149,6 @@ const_debug unsigned int sysctl_sched_nr_migrate = 8; const_debug unsigned int sysctl_sched_nr_migrate = 32; #endif -/* - * period over which we measure -rt task CPU usage in us. - * default: 1s - */ -unsigned int sysctl_sched_rt_period = 1000000; - __read_mostly int scheduler_running; #ifdef CONFIG_SCHED_CORE @@ -445,13 +443,6 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ /* - * part of the period that we allow rt tasks to run in us. - * default: 0.95s - */ -int sysctl_sched_rt_runtime = 950000; - - -/* * Serialization rules: * * Lock order: @@ -610,10 +601,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) swap(rq1, rq2); raw_spin_rq_lock(rq1); - if (__rq_lockp(rq1) == __rq_lockp(rq2)) - return; + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); - raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); + double_rq_clock_clear_update(rq1, rq2); } #endif @@ -883,15 +874,11 @@ static inline void hrtick_rq_init(struct rq *rq) ({ \ typeof(ptr) _ptr = (ptr); \ typeof(mask) _mask = (mask); \ - typeof(*_ptr) _old, _val = *_ptr; \ + typeof(*_ptr) _val = *_ptr; \ \ - for (;;) { \ - _old = cmpxchg(_ptr, _val, _val | _mask); \ - if (_old == _val) \ - break; \ - _val = _old; \ - } \ - _old; \ + do { \ + } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \ + _val; \ }) #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) @@ -900,7 +887,7 @@ static inline void hrtick_rq_init(struct rq *rq) * this avoids any races wrt polling state changes and thereby avoids * spurious IPIs. */ -static bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct task_struct *p) { struct thread_info *ti = task_thread_info(p); return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); @@ -915,30 +902,28 @@ static bool set_nr_and_not_polling(struct task_struct *p) static bool set_nr_if_polling(struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - typeof(ti->flags) old, val = READ_ONCE(ti->flags); + typeof(ti->flags) val = READ_ONCE(ti->flags); for (;;) { if (!(val & _TIF_POLLING_NRFLAG)) return false; if (val & _TIF_NEED_RESCHED) return true; - old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); - if (old == val) + if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)) break; - val = old; } return true; } #else -static bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct task_struct *p) { set_tsk_need_resched(p); return true; } #ifdef CONFIG_SMP -static bool set_nr_if_polling(struct task_struct *p) +static inline bool set_nr_if_polling(struct task_struct *p) { return false; } @@ -1319,10 +1304,10 @@ static void set_load_weight(struct task_struct *p, bool update_load) static DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ -unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; +static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ -unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; /* * By default RT tasks run at the maximum performance point/capacity of the @@ -1339,7 +1324,7 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; * This knob will not override the system default sched_util_clamp_min defined * above. */ -unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; +static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -1469,33 +1454,6 @@ static void uclamp_update_util_min_rt_default(struct task_struct *p) task_rq_unlock(rq, p, &rf); } -static void uclamp_sync_util_min_rt_default(void) -{ - struct task_struct *g, *p; - - /* - * copy_process() sysctl_uclamp - * uclamp_min_rt = X; - * write_lock(&tasklist_lock) read_lock(&tasklist_lock) - * // link thread smp_mb__after_spinlock() - * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); - * sched_post_fork() for_each_process_thread() - * __uclamp_sync_rt() __uclamp_sync_rt() - * - * Ensures that either sched_post_fork() will observe the new - * uclamp_min_rt or for_each_process_thread() will observe the new - * task. - */ - read_lock(&tasklist_lock); - smp_mb__after_spinlock(); - read_unlock(&tasklist_lock); - - rcu_read_lock(); - for_each_process_thread(g, p) - uclamp_update_util_min_rt_default(p); - rcu_read_unlock(); -} - static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { @@ -1775,6 +1733,11 @@ uclamp_update_active_tasks(struct cgroup_subsys_state *css) } static void cpu_util_update_eff(struct cgroup_subsys_state *css); +#endif + +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_UCLAMP_TASK +#ifdef CONFIG_UCLAMP_TASK_GROUP static void uclamp_update_root_tg(void) { struct task_group *tg = &root_task_group; @@ -1792,7 +1755,34 @@ static void uclamp_update_root_tg(void) static void uclamp_update_root_tg(void) { } #endif -int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + rcu_read_lock(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); + rcu_read_unlock(); +} + +static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; @@ -1856,6 +1846,8 @@ done: return result; } +#endif +#endif static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) @@ -2190,7 +2182,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, p, flags); - else if (p->sched_class > rq->curr->sched_class) + else if (sched_class_above(p->sched_class, rq->curr->sched_class)) resched_curr(rq); /* @@ -2408,7 +2400,7 @@ static int migration_cpu_stop(void *data) * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - flush_smp_call_function_from_idle(); + flush_smp_call_function_queue(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); @@ -3811,7 +3803,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } -static inline bool ttwu_queue_cond(int cpu, int wake_flags) +static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* * Do not complicate things with the async wake_list while the CPU is @@ -3820,6 +3812,10 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) if (!cpu_active(cpu)) return false; + /* Ensure the task will still be allowed to run on the CPU. */ + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + /* * If the CPU does not share cache, then queue the task on the * remote rqs wakelist to avoid accessing remote data. @@ -3827,13 +3823,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) if (!cpus_share_cache(smp_processor_id(), cpu)) return true; + if (cpu == smp_processor_id()) + return false; + /* - * If the task is descheduling and the only running task on the - * CPU then use the wakelist to offload the task activation to - * the soon-to-be-idle CPU as the current CPU is likely busy. - * nr_running is checked to avoid unnecessary task stacking. + * If the wakee cpu is idle, or the task is descheduling and the + * only running task on the CPU, then use the wakelist to offload + * the task activation to the idle (or soon-to-be-idle) CPU as + * the current CPU is likely busy. nr_running is checked to + * avoid unnecessary task stacking. + * + * Note that we can only get here with (wakee) p->on_rq=0, + * p->on_cpu can be whatever, we've done the dequeue, so + * the wakee has been accounted out of ->nr_running. */ - if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) + if (!cpu_rq(cpu)->nr_running) return true; return false; @@ -3841,10 +3845,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { - if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { - if (WARN_ON_ONCE(cpu == smp_processor_id())) - return false; - + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) { sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; @@ -4166,7 +4167,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * scheduling. */ if (smp_load_acquire(&p->on_cpu) && - ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) + ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) goto unlock; /* @@ -4267,6 +4268,38 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) } /** + * cpu_curr_snapshot - Return a snapshot of the currently running task + * @cpu: The CPU on which to snapshot the task. + * + * Returns the task_struct pointer of the task "currently" running on + * the specified CPU. If the same task is running on that CPU throughout, + * the return value will be a pointer to that task's task_struct structure. + * If the CPU did any context switches even vaguely concurrently with the + * execution of this function, the return value will be a pointer to the + * task_struct structure of a randomly chosen task that was running on + * that CPU somewhere around the time that this function was executing. + * + * If the specified CPU was offline, the return value is whatever it + * is, perhaps a pointer to the task_struct structure of that CPU's idle + * task, but there is no guarantee. Callers wishing a useful return + * value must take some action to ensure that the specified CPU remains + * online throughout. + * + * This function executes full memory barriers before and after fetching + * the pointer, which permits the caller to confine this function's fetch + * with respect to the caller's accesses to other shared variables. + */ +struct task_struct *cpu_curr_snapshot(int cpu) +{ + struct task_struct *t; + + smp_mb(); /* Pairing determined by caller's synchronization design. */ + t = rcu_dereference(cpu_curr(cpu)); + smp_mb(); /* Pairing determined by caller's synchronization design. */ + return t; +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -4430,7 +4463,7 @@ out: __setup("schedstats=", setup_schedstats); #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, +static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -4452,6 +4485,52 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, #endif /* CONFIG_PROC_SYSCTL */ #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_core_sysctls[] = { +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_min_rt_default", + .data = &sysctl_sched_uclamp_util_min_rt_default, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif /* CONFIG_UCLAMP_TASK */ + {} +}; +static int __init sched_core_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_core_sysctls); + return 0; +} +late_initcall(sched_core_sysctl_init); +#endif /* CONFIG_SYSCTL */ + /* * fork()/clone()-time setup: */ @@ -4710,7 +4789,8 @@ static inline void prepare_task(struct task_struct *next) * Claim the task as running, we do this before switching to it * such that any running task will have this set. * - * See the ttwu() WF_ON_CPU case and its ordering comment. + * See the smp_load_acquire(&p->on_cpu) case in ttwu() and + * its ordering comment. */ WRITE_ONCE(next->on_cpu, 1); #endif @@ -4755,25 +4835,55 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head) static void balance_push(struct rq *rq); +/* + * balance_push_callback is a right abuse of the callback interface and plays + * by significantly different rules. + * + * Where the normal balance_callback's purpose is to be ran in the same context + * that queued it (only later, when it's safe to drop rq->lock again), + * balance_push_callback is specifically targeted at __schedule(). + * + * This abuse is tolerated because it places all the unlikely/odd cases behind + * a single test, namely: rq->balance_callback == NULL. + */ struct callback_head balance_push_callback = { .next = NULL, .func = (void (*)(struct callback_head *))balance_push, }; -static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +static inline struct callback_head * +__splice_balance_callbacks(struct rq *rq, bool split) { struct callback_head *head = rq->balance_callback; + if (likely(!head)) + return NULL; + lockdep_assert_rq_held(rq); - if (head) + /* + * Must not take balance_push_callback off the list when + * splice_balance_callbacks() and balance_callbacks() are not + * in the same rq->lock section. + * + * In that case it would be possible for __schedule() to interleave + * and observe the list empty. + */ + if (split && head == &balance_push_callback) + head = NULL; + else rq->balance_callback = NULL; return head; } +static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +{ + return __splice_balance_callbacks(rq, true); +} + static void __balance_callbacks(struct rq *rq) { - do_balance_callbacks(rq, splice_balance_callbacks(rq)); + do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); } static inline void balance_callbacks(struct rq *rq, struct callback_head *head) @@ -5689,7 +5799,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * higher scheduling class, because otherwise those lose the * opportunity to pull in more work from other CPUs. */ - if (likely(prev->sched_class <= &fair_sched_class && + if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); @@ -5752,6 +5862,8 @@ static inline struct task_struct *pick_task(struct rq *rq) extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); +static void queue_core_balance(struct rq *rq); + static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -5801,7 +5913,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } rq->core_pick = NULL; - return next; + goto out; } put_prev_task_balance(rq, prev, rf); @@ -5851,7 +5963,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) */ WARN_ON_ONCE(fi_before); task_vruntime_update(rq, next, false); - goto done; + goto out_set_next; } } @@ -5970,8 +6082,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) resched_curr(rq_i); } -done: +out_set_next: set_next_task(rq, next); +out: + if (rq->core->core_forceidle_count && next == rq->idle) + queue_core_balance(rq); + return next; } @@ -6000,7 +6116,7 @@ static bool try_steal_cookie(int this, int that) if (p == src->core_pick || p == src->curr) goto next; - if (!cpumask_test_cpu(this, &p->cpus_mask)) + if (!is_cpu_allowed(p, this)) goto next; if (p->core_occupation > dst->idle->core_occupation) @@ -6066,7 +6182,7 @@ static void sched_core_balance(struct rq *rq) static DEFINE_PER_CPU(struct callback_head, core_balance_head); -void queue_core_balance(struct rq *rq) +static void queue_core_balance(struct rq *rq) { if (!sched_core_enabled(rq)) return; @@ -6304,10 +6420,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* * We must load prev->state once (task_struct::state is volatile), such - * that: - * - * - we form a control dependency vs deactivate_task() below. - * - ptrace_{,un}freeze_traced() can change ->state underneath us. + * that we form a control dependency vs deactivate_task() below. */ prev_state = READ_ONCE(prev->__state); if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { @@ -6376,7 +6489,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -6424,8 +6537,12 @@ static inline void sched_submit_work(struct task_struct *tsk) io_wq_worker_sleeping(tsk); } - if (tsk_is_pi_blocked(tsk)) - return; + /* + * spinlock and rwlock must not flush block requests. This will + * deadlock if the callback attempts to acquire a lock which is + * already acquired. + */ + SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); /* * If we are going to sleep and we have plugged IO queued, @@ -6483,7 +6600,7 @@ void __sched schedule_idle(void) } while (need_resched()); } -#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) +#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) asmlinkage __visible void __sched schedule_user(void) { /* @@ -6922,17 +7039,29 @@ out_unlock: EXPORT_SYMBOL(set_user_nice); /* - * can_nice - check if a task can reduce its nice value + * is_nice_reduction - check if nice value is an actual reduction + * + * Similar to can_nice() but does not perform a capability check. + * * @p: task * @nice: nice value */ -int can_nice(const struct task_struct *p, const int nice) +static bool is_nice_reduction(const struct task_struct *p, const int nice) { /* Convert nice value [19,-20] to rlimit style value [1,40]: */ int nice_rlim = nice_to_rlimit(nice); - return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || - capable(CAP_SYS_NICE)); + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); +} + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ + return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE); } #ifdef __ARCH_WANT_SYS_NICE @@ -7061,12 +7190,14 @@ struct task_struct *idle_task(int cpu) * required to meet deadlines. */ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum cpu_util_type type, + enum cpu_util_type type, struct task_struct *p) { - unsigned long dl_util, util, irq; + unsigned long dl_util, util, irq, max; struct rq *rq = cpu_rq(cpu); + max = arch_scale_cpu_capacity(cpu); + if (!uclamp_is_used() && type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { return max; @@ -7146,10 +7277,9 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, return min(max, util); } -unsigned long sched_cpu_util(int cpu, unsigned long max) +unsigned long sched_cpu_util(int cpu) { - return effective_cpu_util(cpu, cpu_util_cfs(cpu), max, - ENERGY_UTIL, NULL); + return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL); } #endif /* CONFIG_SMP */ @@ -7211,6 +7341,69 @@ static bool check_same_owner(struct task_struct *p) return match; } +/* + * Allow unprivileged RT tasks to decrease priority. + * Only issue a capable test if needed and only once to avoid an audit + * event on permitted non-privileged operations: + */ +static int user_check_sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + int policy, int reset_on_fork) +{ + if (fair_policy(policy)) { + if (attr->sched_nice < task_nice(p) && + !is_nice_reduction(p, attr->sched_nice)) + goto req_priv; + } + + if (rt_policy(policy)) { + unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); + + /* Can't set/change the rt policy: */ + if (policy != p->policy && !rlim_rtprio) + goto req_priv; + + /* Can't increase priority: */ + if (attr->sched_priority > p->rt_priority && + attr->sched_priority > rlim_rtprio) + goto req_priv; + } + + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + goto req_priv; + + /* + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. + */ + if (task_has_idle_policy(p) && !idle_policy(policy)) { + if (!is_nice_reduction(p, task_nice(p))) + goto req_priv; + } + + /* Can't change other user's priorities: */ + if (!check_same_owner(p)) + goto req_priv; + + /* Normal users shall not reset the sched_reset_on_fork flag: */ + if (p->sched_reset_on_fork && !reset_on_fork) + goto req_priv; + + return 0; + +req_priv: + if (!capable(CAP_SYS_NICE)) + return -EPERM; + + return 0; +} + static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi) @@ -7252,58 +7445,11 @@ recheck: (rt_policy(policy) != (attr->sched_priority != 0))) return -EINVAL; - /* - * Allow unprivileged RT tasks to decrease priority: - */ - if (user && !capable(CAP_SYS_NICE)) { - if (fair_policy(policy)) { - if (attr->sched_nice < task_nice(p) && - !can_nice(p, attr->sched_nice)) - return -EPERM; - } - - if (rt_policy(policy)) { - unsigned long rlim_rtprio = - task_rlimit(p, RLIMIT_RTPRIO); - - /* Can't set/change the rt policy: */ - if (policy != p->policy && !rlim_rtprio) - return -EPERM; - - /* Can't increase priority: */ - if (attr->sched_priority > p->rt_priority && - attr->sched_priority > rlim_rtprio) - return -EPERM; - } - - /* - * Can't set/change SCHED_DEADLINE policy at all for now - * (safest behavior); in the future we would like to allow - * unprivileged DL tasks to increase their relative deadline - * or reduce their runtime (both ways reducing utilization) - */ - if (dl_policy(policy)) - return -EPERM; - - /* - * Treat SCHED_IDLE as nice 20. Only allow a switch to - * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. - */ - if (task_has_idle_policy(p) && !idle_policy(policy)) { - if (!can_nice(p, task_nice(p))) - return -EPERM; - } - - /* Can't change other user's priorities: */ - if (!check_same_owner(p)) - return -EPERM; - - /* Normal users shall not reset the sched_reset_on_fork flag: */ - if (p->sched_reset_on_fork && !reset_on_fork) - return -EPERM; - } - if (user) { + retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork); + if (retval) + return retval; + if (attr->sched_flags & SCHED_FLAG_SUGOV) return -EINVAL; @@ -8409,6 +8555,18 @@ static void __init preempt_dynamic_init(void) } } +#define PREEMPT_MODEL_ACCESSOR(mode) \ + bool preempt_model_##mode(void) \ + { \ + WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ + return preempt_dynamic_mode == preempt_dynamic_##mode; \ + } \ + EXPORT_SYMBOL_GPL(preempt_model_##mode) + +PREEMPT_MODEL_ACCESSOR(none); +PREEMPT_MODEL_ACCESSOR(voluntary); +PREEMPT_MODEL_ACCESSOR(full); + #else /* !CONFIG_PREEMPT_DYNAMIC */ static inline void preempt_dynamic_init(void) { } @@ -8859,7 +9017,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, } int task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed) + const struct cpumask *cs_effective_cpus) { int ret = 0; @@ -8878,9 +9036,11 @@ int task_can_attach(struct task_struct *p, } if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) { - int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); + cs_effective_cpus)) { + int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus); + if (unlikely(cpu >= nr_cpu_ids)) + return -EINVAL; ret = dl_cpu_busy(cpu, p); } @@ -9443,7 +9603,7 @@ static struct kmem_cache *task_group_cache __read_mostly; #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); -DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +DECLARE_PER_CPU(cpumask_var_t, select_rq_mask); void __init sched_init(void) { @@ -9451,11 +9611,11 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ - BUG_ON(&idle_sched_class + 1 != &fair_sched_class || - &fair_sched_class + 1 != &rt_sched_class || - &rt_sched_class + 1 != &dl_sched_class); + BUG_ON(&idle_sched_class != &fair_sched_class + 1 || + &fair_sched_class != &rt_sched_class + 1 || + &rt_sched_class != &dl_sched_class + 1); #ifdef CONFIG_SMP - BUG_ON(&dl_sched_class + 1 != &stop_sched_class); + BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif wait_bit_init(); @@ -9492,7 +9652,7 @@ void __init sched_init(void) for_each_possible_cpu(i) { per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( + per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node( cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_CPUMASK_OFFSTACK */ @@ -11024,6 +11184,19 @@ struct cgroup_subsys cpu_cgrp_subsys = { void dump_cpu_task(int cpu) { + if (cpu == smp_processor_id() && in_hardirq()) { + struct pt_regs *regs; + + regs = get_irq_regs(); + if (regs) { + show_regs(regs); + return; + } + } + + if (trigger_single_cpu_backtrace(cpu)) + return; + pr_info("Task dump for CPU %d:\n", cpu); sched_show_task(cpu_curr(cpu)); } diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 38a2cec21014..93878cb2a46d 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -56,7 +56,6 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long old_cookie; struct rq_flags rf; struct rq *rq; - bool enqueued; rq = task_rq_lock(p, &rf); @@ -68,14 +67,16 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, */ SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); - enqueued = sched_core_enqueued(p); - if (enqueued) + if (sched_core_enqueued(p)) sched_core_dequeue(rq, p, DEQUEUE_SAVE); old_cookie = p->core_cookie; p->core_cookie = cookie; - if (enqueued) + /* + * Consider the cases: !prev_cookie and !cookie. + */ + if (cookie && task_on_rq_queued(p)) sched_core_enqueue(rq, p); /* @@ -277,7 +278,11 @@ void __sched_core_account_forceidle(struct rq *rq) if (p == rq_i->idle) continue; - __schedstat_add(p->stats.core_forceidle_sum, delta); + /* + * Note: this will account forceidle to the current cpu, even + * if it comes from our SMT sibling. + */ + __account_forceidle_time(p, delta); } } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3dbf351d12d5..9161d1136d01 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -25,6 +25,9 @@ struct sugov_policy { unsigned int next_freq; unsigned int cached_raw_freq; + /* max CPU capacity, which is equal for all CPUs in freq. domain */ + unsigned long max; + /* The next fields are only needed if fast switch cannot be used: */ struct irq_work irq_work; struct kthread_work work; @@ -48,7 +51,6 @@ struct sugov_cpu { unsigned long util; unsigned long bw_dl; - unsigned long max; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -157,11 +159,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); - sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max, + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), FREQUENCY_UTIL, NULL); } @@ -254,6 +254,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, */ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long boost; /* No boost currently required */ @@ -281,7 +282,8 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; + boost = sg_cpu->iowait_boost * sg_policy->max; + boost >>= SCHED_CAPACITY_SHIFT; boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); if (sg_cpu->util < boost) sg_cpu->util = boost; @@ -338,7 +340,7 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, if (!sugov_update_single_common(sg_cpu, time, flags)) return; - next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); + next_f = get_next_freq(sg_policy, sg_cpu->util, sg_policy->max); /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. @@ -374,6 +376,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long prev_util = sg_cpu->util; /* @@ -400,7 +403,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, sg_cpu->util = prev_util; cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), - map_util_perf(sg_cpu->util), sg_cpu->max); + map_util_perf(sg_cpu->util), + sg_policy->max); sg_cpu->sg_policy->last_freq_update_time = time; } @@ -409,25 +413,19 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned long util = 0, max = 1; + unsigned long util = 0; unsigned int j; for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long j_util, j_max; sugov_get_util(j_sg_cpu); sugov_iowait_apply(j_sg_cpu, time); - j_util = j_sg_cpu->util; - j_max = j_sg_cpu->max; - if (j_util * max > j_max * util) { - util = j_util; - max = j_max; - } + util = max(j_sg_cpu->util, util); } - return get_next_freq(sg_policy, util, max); + return get_next_freq(sg_policy, util, sg_policy->max); } static void @@ -753,7 +751,7 @@ static int sugov_start(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; void (*uu)(struct update_util_data *data, u64 time, unsigned int flags); - unsigned int cpu; + unsigned int cpu = cpumask_first(policy->cpus); sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; sg_policy->last_freq_update_time = 0; @@ -761,6 +759,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->work_in_progress = false; sg_policy->limits_changed = false; sg_policy->cached_raw_freq = 0; + sg_policy->max = arch_scale_cpu_capacity(cpu); sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 78a233d43757..95fc77853743 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -226,6 +226,21 @@ void account_idle_time(u64 cputime) cpustat[CPUTIME_IDLE] += cputime; } + +#ifdef CONFIG_SCHED_CORE +/* + * Account for forceidle time due to core scheduling. + * + * REQUIRES: schedstat is enabled. + */ +void __account_forceidle_time(struct task_struct *p, u64 delta) +{ + __schedstat_add(p->stats.core_forceidle_sum, delta); + + task_group_account_field(p, CPUTIME_FORCEIDLE, delta); +} +#endif + /* * When a guest is interrupted for a longer amount of time, missed clock * ticks are not redelivered later. Due to that, this function may on diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fb4255ae0b2c..0ab79d819a0d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -16,6 +16,42 @@ * Fabio Checconi <fchecconi@gmail.com> */ +/* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting ridiculously long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_dl_sysctls[] = { + { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)&sysctl_sched_dl_period_min, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra2 = (void *)&sysctl_sched_dl_period_max, + }, + {} +}; + +static int __init sched_dl_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_dl_sysctls); + return 0; +} +late_initcall(sched_dl_sysctl_init); +#endif + static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { return container_of(dl_se, struct task_struct, dl); @@ -1220,8 +1256,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) return (dl_se->runtime <= 0); } -extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); - /* * This function implements the GRUB accounting rule: * according to the GRUB reclaiming algorithm, the runtime is @@ -1669,7 +1703,10 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * the throttle. */ p->dl.dl_throttled = 0; - BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH); + if (!(flags & ENQUEUE_REPLENISH)) + printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n", + task_pid_nr(p)); + return; } @@ -1832,6 +1869,7 @@ out: static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { + struct rq_flags rf; struct rq *rq; if (READ_ONCE(p->__state) != TASK_WAKING) @@ -1843,7 +1881,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * from try_to_wake_up(). Hence, p->pi_lock is locked, but * rq->lock is not... So, lock it */ - raw_spin_rq_lock(rq); + rq_lock(rq, &rf); if (p->dl.dl_non_contending) { update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); @@ -1859,7 +1897,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused put_task_struct(p); } sub_rq_bw(&p->dl, &rq->dl); - raw_spin_rq_unlock(rq); + rq_unlock(rq, &rf); } static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) @@ -2319,13 +2357,7 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); - - /* - * Update the later_rq clock here, because the clock is used - * by the cpufreq_update_util() inside __add_running_bw(). - */ - update_rq_clock(later_rq); - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); + activate_task(later_rq, next_task, 0); ret = 1; resched_curr(later_rq); @@ -2880,14 +2912,6 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) } /* - * Default limits for DL period; on the top end we guard against small util - * tasks still getting ridiculously long effective runtimes, on the bottom end we - * guard against timer DoS. - */ -unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ -unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ - -/* * This function validates the new parameters of a -deadline task. * We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index bb3d63bdf4ae..667876da8382 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -416,7 +416,7 @@ void update_sched_domain_debugfs(void) char buf[32]; snprintf(buf, sizeof(buf), "cpu%d", cpu); - debugfs_remove(debugfs_lookup(buf, sd_dentry)); + debugfs_lookup_and_remove(buf, sd_dentry); d_cpu = debugfs_create_dir(buf, sd_dentry); i = 0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bd299d67ab..914096c5b1ae 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -36,6 +36,7 @@ #include <linux/sched/cond_resched.h> #include <linux/sched/cputime.h> #include <linux/sched/isolation.h> +#include <linux/sched/nohz.h> #include <linux/cpuidle.h> #include <linux/interrupt.h> @@ -173,7 +174,37 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_fair_sysctls[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif + {} +}; + +static int __init sched_fair_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_fair_sysctls); + return 0; +} +late_initcall(sched_fair_sysctl_init); #endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) @@ -313,19 +344,6 @@ const struct sched_class fair_sched_class; #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (!path) - return; - - if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) - autogroup_path(cfs_rq->tg, path, len); - else if (cfs_rq && cfs_rq->tg->css.cgroup) - cgroup_path(cfs_rq->tg->css.cgroup, path, len); - else - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -493,12 +511,6 @@ static int se_is_idle(struct sched_entity *se) #define for_each_sched_entity(se) \ for (; se; se = NULL) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (path) - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { return true; @@ -600,11 +612,8 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif + u64_u32_store(cfs_rq->min_vruntime, + max_vruntime(cfs_rq->min_vruntime, vruntime)); } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) @@ -1043,6 +1052,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#ifdef CONFIG_NUMA +#define NUMA_IMBALANCE_MIN 2 + +static inline long +adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr) +{ + /* + * Allow a NUMA imbalance if busy CPUs is less than the maximum + * threshold. Above this threshold, individual tasks may be contending + * for both memory bandwidth and any shared HT resources. This is an + * approximation as the number of running tasks may not be related to + * the number of busy CPUs due to sched_setaffinity. + */ + if (dst_running > imb_numa_nr) + return imbalance; + + /* + * Allow a small imbalance based on a simple pair of communicating + * tasks that remain local when the destination is lightly loaded. + */ + if (imbalance <= NUMA_IMBALANCE_MIN) + return 0; + + return imbalance; +} +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_NUMA_BALANCING /* * Approximate time to scan a full NUMA task in ms. The task scan period is @@ -1536,8 +1572,6 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); -static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int imb_numa_nr); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1778,6 +1812,15 @@ static bool task_numa_compare(struct task_numa_env *env, */ cur_ng = rcu_dereference(cur->numa_group); if (cur_ng == p_ng) { + /* + * Do not swap within a group or between tasks that have + * no group if there is spare capacity. Swapping does + * not address the load imbalance and helps one task at + * the cost of punishing another. + */ + if (env->dst_stats.node_type == node_has_spare) + goto unlock; + imp = taskimp + task_weight(cur, env->src_nid, dist) - task_weight(cur, env->dst_nid, dist); /* @@ -2873,6 +2916,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) p->node_stamp = 0; p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_migrate_retry = 0; /* Protect against double add, see task_tick_numa and task_numa_work */ p->numa_work.next = &p->numa_work; p->numa_faults = NULL; @@ -2915,7 +2959,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) /* * We don't care about NUMA placement if we don't have memory. */ - if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* @@ -3132,6 +3176,8 @@ void reweight_task(struct task_struct *p, int prio) load->inv_weight = sched_prio_to_wmult[prio]; } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP /* @@ -3242,8 +3288,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); - /* * Recomputes the group entity based on the current state of its group * runqueue. @@ -3301,6 +3345,34 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } #ifdef CONFIG_SMP +static inline bool load_avg_is_decayed(struct sched_avg *sa) +{ + if (sa->load_sum) + return false; + + if (sa->util_sum) + return false; + + if (sa->runnable_sum) + return false; + + /* + * _avg must be null when _sum are null because _avg = _sum / divider + * Make sure that rounding and/or propagation of PELT values never + * break this. + */ + SCHED_WARN_ON(sa->load_avg || + sa->util_avg || + sa->runnable_avg); + + return true; +} + +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return u64_u32_load_copy(cfs_rq->avg.last_update_time, + cfs_rq->last_update_time_copy); +} #ifdef CONFIG_FAIR_GROUP_SCHED /* * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list @@ -3333,27 +3405,12 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->load.weight) return false; - if (cfs_rq->avg.load_sum) - return false; - - if (cfs_rq->avg.util_sum) - return false; - - if (cfs_rq->avg.runnable_sum) + if (!load_avg_is_decayed(&cfs_rq->avg)) return false; if (child_cfs_rq_on_list(cfs_rq)) return false; - /* - * _avg must be null when _sum are null because _avg = _sum / divider - * Make sure that rounding and/or propagation of PELT values never - * break this. - */ - SCHED_WARN_ON(cfs_rq->avg.load_avg || - cfs_rq->avg.util_avg || - cfs_rq->avg.runnable_avg); - return true; } @@ -3411,27 +3468,9 @@ void set_task_rq_fair(struct sched_entity *se, if (!(se->avg.last_update_time && prev)) return; -#ifndef CONFIG_64BIT - { - u64 p_last_update_time_copy; - u64 n_last_update_time_copy; - - do { - p_last_update_time_copy = prev->load_last_update_time_copy; - n_last_update_time_copy = next->load_last_update_time_copy; - - smp_rmb(); + p_last_update_time = cfs_rq_last_update_time(prev); + n_last_update_time = cfs_rq_last_update_time(next); - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; - - } while (p_last_update_time != p_last_update_time_copy || - n_last_update_time != n_last_update_time_copy); - } -#else - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; -#endif __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time = n_last_update_time; } @@ -3710,6 +3749,89 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_NO_HZ_COMMON +static inline void migrate_se_pelt_lag(struct sched_entity *se) +{ + u64 throttled = 0, now, lut; + struct cfs_rq *cfs_rq; + struct rq *rq; + bool is_idle; + + if (load_avg_is_decayed(&se->avg)) + return; + + cfs_rq = cfs_rq_of(se); + rq = rq_of(cfs_rq); + + rcu_read_lock(); + is_idle = is_idle_task(rcu_dereference(rq->curr)); + rcu_read_unlock(); + + /* + * The lag estimation comes with a cost we don't want to pay all the + * time. Hence, limiting to the case where the source CPU is idle and + * we know we are at the greatest risk to have an outdated clock. + */ + if (!is_idle) + return; + + /* + * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where: + * + * last_update_time (the cfs_rq's last_update_time) + * = cfs_rq_clock_pelt()@cfs_rq_idle + * = rq_clock_pelt()@cfs_rq_idle + * - cfs->throttled_clock_pelt_time@cfs_rq_idle + * + * cfs_idle_lag (delta between rq's update and cfs_rq's update) + * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle + * + * rq_idle_lag (delta between now and rq's update) + * = sched_clock_cpu() - rq_clock()@rq_idle + * + * We can then write: + * + * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time + + * sched_clock_cpu() - rq_clock()@rq_idle + * Where: + * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle + * rq_clock()@rq_idle is rq->clock_idle + * cfs->throttled_clock_pelt_time@cfs_rq_idle + * is cfs_rq->throttled_pelt_idle + */ + +#ifdef CONFIG_CFS_BANDWIDTH + throttled = u64_u32_load(cfs_rq->throttled_pelt_idle); + /* The clock has been stopped for throttling */ + if (throttled == U64_MAX) + return; +#endif + now = u64_u32_load(rq->clock_pelt_idle); + /* + * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case + * is observed the old clock_pelt_idle value and the new clock_idle, + * which lead to an underestimation. The opposite would lead to an + * overestimation. + */ + smp_rmb(); + lut = cfs_rq_last_update_time(cfs_rq); + + now -= throttled; + if (now < lut) + /* + * cfs_rq->avg.last_update_time is more recent than our + * estimation, let's use it. + */ + now = lut; + else + now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle); + + __update_load_avg_blocked_se(now, se); +} +#else +static void migrate_se_pelt_lag(struct sched_entity *se) {} +#endif + /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_pelt() @@ -3784,12 +3906,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) } decayed |= __update_load_avg_cfs_rq(now, cfs_rq); - -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->load_last_update_time_copy = sa->last_update_time; -#endif - + u64_u32_store_copy(sa->last_update_time, + cfs_rq->last_update_time_copy, + sa->last_update_time); return decayed; } @@ -3829,11 +3948,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; @@ -3921,27 +4040,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s } } -#ifndef CONFIG_64BIT -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - u64 last_update_time_copy; - u64 last_update_time; - - do { - last_update_time_copy = cfs_rq->load_last_update_time_copy; - smp_rmb(); - last_update_time = cfs_rq->avg.last_update_time; - } while (last_update_time != last_update_time_copy); - - return last_update_time; -} -#else -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.last_update_time; -} -#endif - /* * Synchronize entity load avg of dequeued entity without locking * the previous rq. @@ -4356,16 +4454,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - /* - * When bandwidth control is enabled, cfs might have been removed - * because of a parent been throttled but cfs->nr_running > 1. Try to - * add it unconditionally. - */ - if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) - list_add_leaf_cfs_rq(cfs_rq); - - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_running == 1) { check_enqueue_throttle(cfs_rq); + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + } } static void __clear_buddies_last(struct sched_entity *se) @@ -4465,6 +4558,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); + + if (cfs_rq->nr_running == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); } /* @@ -4846,11 +4942,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running) + if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } @@ -4864,7 +4960,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); } cfs_rq->throttle_count++; @@ -4980,11 +5076,18 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); - /* Nothing to run but something to decay (on_list)? Complete the branch */ if (!cfs_rq->load.weight) { - if (cfs_rq->on_list) - goto unthrottle_throttle; - return; + if (!cfs_rq->on_list) + return; + /* + * Nothing to run but something to decay (on_list)? + * Complete the branch. + */ + for_each_sched_entity(se) { + if (list_add_leaf_cfs_rq(cfs_rq_of(se))) + break; + } + goto unthrottle_throttle; } task_delta = cfs_rq->h_nr_running; @@ -5022,31 +5125,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; - - /* - * One parent has been throttled and cfs_rq removed from the - * list. Add it back to not break the leaf list. - */ - if (throttled_hierarchy(qcfs_rq)) - list_add_leaf_cfs_rq(qcfs_rq); } /* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); unthrottle_throttle: - /* - * The cfs_rq_throttled() breaks in the above iteration can result in - * incomplete leaf list maintenance, resulting in triggering the - * assertion below. - */ - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - if (list_add_leaf_cfs_rq(qcfs_rq)) - break; - } - assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ @@ -5308,7 +5392,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -5701,13 +5785,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; - - /* - * One parent has been throttled and cfs_rq removed from the - * list. Add it back to not break the leaf list. - */ - if (throttled_hierarchy(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); } /* At this point se is NULL and we are at root level*/ @@ -5731,21 +5808,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_overutilized_status(rq); enqueue_throttle: - if (cfs_bandwidth_used()) { - /* - * When bandwidth control is enabled; the cfs_rq_throttled() - * breaks in the above iteration can result in incomplete - * leaf list maintenance, resulting in triggering the assertion - * below. - */ - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - - if (list_add_leaf_cfs_rq(cfs_rq)) - break; - } - } - assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -5832,7 +5894,7 @@ dequeue_throttle: /* Working cpumask for: load_balance, load_balance_newidle. */ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); +DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); #ifdef CONFIG_NO_HZ_COMMON @@ -6322,8 +6384,9 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct sched_domain_shared *sd_share; struct rq *this_rq = this_rq(); int this = smp_processor_id(); struct sched_domain *this_sd; @@ -6363,6 +6426,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); } + if (sched_feat(SIS_UTIL)) { + sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + if (sd_share) { + /* because !--nr is the condition to stop scan */ + nr = READ_ONCE(sd_share->nr_idle_scan) + 1; + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; + } + } + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -6408,7 +6482,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) int cpu, best_cpu = -1; struct cpumask *cpus; - cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + cpus = this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); task_util = uclamp_task_util(p); @@ -6458,7 +6532,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } /* - * per-cpu select_idle_mask usage + * per-cpu select_rq_mask usage */ lockdep_assert_irqs_disabled(); @@ -6544,6 +6618,68 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } /* + * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu + * (@dst_cpu = -1) or migrated to @dst_cpu. + */ +static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +{ + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); + + /* + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its + * contribution. If @p migrates from another CPU to @cpu add its + * contribution. In all the other cases @cpu is not impacted by the + * migration so its util_avg is already correct. + */ + if (task_cpu(p) == cpu && dst_cpu != cpu) + lsub_positive(&util, task_util(p)); + else if (task_cpu(p) != cpu && dst_cpu == cpu) + util += task_util(p); + + if (sched_feat(UTIL_EST)) { + unsigned long util_est; + + util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + + /* + * During wake-up @p isn't enqueued yet and doesn't contribute + * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p + * has been enqueued. + * + * During exec (@dst_cpu = -1) @p is enqueued and does + * contribute to cpu_rq(cpu)->cfs.util_est.enqueued. + * Remove it to "simulate" cpu_util without @p's contribution. + * + * Despite the task_on_rq_queued(@p) check there is still a + * small window for a possible race when an exec + * select_task_rq_fair() races with LB's detach_task(). + * + * detach_task() + * deactivate_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * -------------------------------- A + * dequeue_task() \ + * dequeue_task_fair() + Race Time + * util_est_dequeue() / + * -------------------------------- B + * + * The additional check "current == p" is required to further + * reduce the race window. + */ + if (dst_cpu == cpu) + util_est += _task_util_est(p); + else if (unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&util_est, _task_util_est(p)); + + util = max(util, util_est); + } + + return min(util, capacity_orig_of(cpu)); +} + +/* * cpu_util_without: compute cpu utilization without any contributions from *p * @cpu: the CPU which utilization is requested * @p: the task which utilization should be discounted @@ -6558,175 +6694,104 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ static unsigned long cpu_util_without(int cpu, struct task_struct *p) { - struct cfs_rq *cfs_rq; - unsigned int util; - /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) return cpu_util_cfs(cpu); - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - /* Discount task's util from CPU's util */ - lsub_positive(&util, task_util(p)); + return cpu_util_next(cpu, p, -1); +} - /* - * Covered cases: - * - * a) if *p is the only task sleeping on this CPU, then: - * cpu_util (== task_util) > util_est (== 0) - * and thus we return: - * cpu_util_without = (cpu_util - task_util) = 0 - * - * b) if other tasks are SLEEPING on this CPU, which is now exiting - * IDLE, then: - * cpu_util >= task_util - * cpu_util > util_est (== 0) - * and thus we discount *p's blocked utilization to return: - * cpu_util_without = (cpu_util - task_util) >= 0 - * - * c) if other tasks are RUNNABLE on that CPU and - * util_est > cpu_util - * then we use util_est since it returns a more restrictive - * estimation of the spare capacity on that CPU, by just - * considering the expected utilization of tasks already - * runnable on that CPU. - * - * Cases a) and b) are covered by the above code, while case c) is - * covered by the following code when estimated utilization is - * enabled. - */ - if (sched_feat(UTIL_EST)) { - unsigned int estimated = - READ_ONCE(cfs_rq->avg.util_est.enqueued); +/* + * energy_env - Utilization landscape for energy estimation. + * @task_busy_time: Utilization contribution by the task for which we test the + * placement. Given by eenv_task_busy_time(). + * @pd_busy_time: Utilization of the whole perf domain without the task + * contribution. Given by eenv_pd_busy_time(). + * @cpu_cap: Maximum CPU capacity for the perf domain. + * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap). + */ +struct energy_env { + unsigned long task_busy_time; + unsigned long pd_busy_time; + unsigned long cpu_cap; + unsigned long pd_cap; +}; - /* - * Despite the following checks we still have a small window - * for a possible race, when an execl's select_task_rq_fair() - * races with LB's detach_task(): - * - * detach_task() - * p->on_rq = TASK_ON_RQ_MIGRATING; - * ---------------------------------- A - * deactivate_task() \ - * dequeue_task() + RaceTime - * util_est_dequeue() / - * ---------------------------------- B - * - * The additional check on "current == p" it's required to - * properly fix the execl regression and it helps in further - * reducing the chances for the above race. - */ - if (unlikely(task_on_rq_queued(p) || current == p)) - lsub_positive(&estimated, _task_util_est(p)); +/* + * Compute the task busy time for compute_energy(). This time cannot be + * injected directly into effective_cpu_util() because of the IRQ scaling. + * The latter only makes sense with the most recent CPUs where the task has + * run. + */ +static inline void eenv_task_busy_time(struct energy_env *eenv, + struct task_struct *p, int prev_cpu) +{ + unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu); + unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu)); - util = max(util, estimated); - } + if (unlikely(irq >= max_cap)) + busy_time = max_cap; + else + busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap); - /* - * Utilization (estimated) can exceed the CPU capacity, thus let's - * clamp to the maximum CPU capacity to ensure consistency with - * cpu_util. - */ - return min_t(unsigned long, util, capacity_orig_of(cpu)); + eenv->task_busy_time = busy_time; } /* - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) - * to @dst_cpu. + * Compute the perf_domain (PD) busy time for compute_energy(). Based on the + * utilization for each @pd_cpus, it however doesn't take into account + * clamping since the ratio (utilization / cpu_capacity) is already enough to + * scale the EM reported power consumption at the (eventually clamped) + * cpu_capacity. + * + * The contribution of the task @p for which we want to estimate the + * energy cost is removed (by cpu_util_next()) and must be calculated + * separately (see eenv_task_busy_time). This ensures: + * + * - A stable PD utilization, no matter which CPU of that PD we want to place + * the task on. + * + * - A fair comparison between CPUs as the task contribution (task_util()) + * will always be the same no matter which CPU utilization we rely on + * (util_avg or util_est). + * + * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't + * exceed @eenv->pd_cap. */ -static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +static inline void eenv_pd_busy_time(struct energy_env *eenv, + struct cpumask *pd_cpus, + struct task_struct *p) { - struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; - unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); - - /* - * If @p migrates from @cpu to another, remove its contribution. Or, - * if @p migrates from another CPU to @cpu, add its contribution. In - * the other cases, @cpu is not impacted by the migration, so the - * util_avg should already be correct. - */ - if (task_cpu(p) == cpu && dst_cpu != cpu) - lsub_positive(&util, task_util(p)); - else if (task_cpu(p) != cpu && dst_cpu == cpu) - util += task_util(p); + unsigned long busy_time = 0; + int cpu; - if (sched_feat(UTIL_EST)) { - util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + for_each_cpu(cpu, pd_cpus) { + unsigned long util = cpu_util_next(cpu, p, -1); - /* - * During wake-up, the task isn't enqueued yet and doesn't - * appear in the cfs_rq->avg.util_est.enqueued of any rq, - * so just add it (if needed) to "simulate" what will be - * cpu_util after the task has been enqueued. - */ - if (dst_cpu == cpu) - util_est += _task_util_est(p); - - util = max(util, util_est); + busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); } - return min(util, capacity_orig_of(cpu)); + eenv->pd_busy_time = min(eenv->pd_cap, busy_time); } /* - * compute_energy(): Estimates the energy that @pd would consume if @p was - * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of @pd's CPUs after the task migration, and uses the Energy Model - * to compute what would be the energy if we decided to actually migrate that - * task. + * Compute the maximum utilization for compute_energy() when the task @p + * is placed on the cpu @dst_cpu. + * + * Returns the maximum utilization among @eenv->cpus. This utilization can't + * exceed @eenv->cpu_cap. */ -static long -compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +static inline unsigned long +eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, + struct task_struct *p, int dst_cpu) { - struct cpumask *pd_mask = perf_domain_span(pd); - unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); - unsigned long max_util = 0, sum_util = 0; - unsigned long _cpu_cap = cpu_cap; + unsigned long max_util = 0; int cpu; - _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask)); - - /* - * The capacity state of CPUs of the current rd can be driven by CPUs - * of another rd if they belong to the same pd. So, account for the - * utilization of these CPUs too by masking pd with cpu_online_mask - * instead of the rd span. - * - * If an entire pd is outside of the current rd, it will not appear in - * its pd list and will not be accounted by compute_energy(). - */ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu); - unsigned long cpu_util, util_running = util_freq; - struct task_struct *tsk = NULL; - - /* - * When @p is placed on @cpu: - * - * util_running = max(cpu_util, cpu_util_est) + - * max(task_util, _task_util_est) - * - * while cpu_util_next is: max(cpu_util + task_util, - * cpu_util_est + _task_util_est) - */ - if (cpu == dst_cpu) { - tsk = p; - util_running = - cpu_util_next(cpu, p, -1) + task_util_est(p); - } - - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - cpu_util = effective_cpu_util(cpu, util_running, cpu_cap, - ENERGY_UTIL, NULL); - - sum_util += min(cpu_util, _cpu_cap); + for_each_cpu(cpu, pd_cpus) { + struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; + unsigned long util = cpu_util_next(cpu, p, dst_cpu); + unsigned long cpu_util; /* * Performance domain frequency: utilization clamping @@ -6735,12 +6800,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap, - FREQUENCY_UTIL, tsk); - max_util = max(max_util, min(cpu_util, _cpu_cap)); + cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); + max_util = max(max_util, cpu_util); } - return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap); + return min(max_util, eenv->cpu_cap); +} + +/* + * compute_energy(): Use the Energy Model to estimate the energy that @pd would + * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task + * contribution is ignored. + */ +static inline unsigned long +compute_energy(struct energy_env *eenv, struct perf_domain *pd, + struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu) +{ + unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); + unsigned long busy_time = eenv->pd_busy_time; + + if (dst_cpu >= 0) + busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); + + return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); } /* @@ -6784,12 +6866,13 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) */ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; - struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - int cpu, best_energy_cpu = prev_cpu, target = -1; - unsigned long cpu_cap, util, base_energy = 0; + struct root_domain *rd = this_rq()->rd; + int cpu, best_energy_cpu, target = -1; struct sched_domain *sd; struct perf_domain *pd; + struct energy_env eenv; rcu_read_lock(); pd = rcu_dereference(rd->pd); @@ -6812,20 +6895,39 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!task_util_est(p)) goto unlock; + eenv_task_busy_time(&eenv, p, prev_cpu); + for (; pd; pd = pd->next) { - unsigned long cur_delta, spare_cap, max_spare_cap = 0; + unsigned long cpu_cap, cpu_thermal_cap, util; + unsigned long cur_delta, max_spare_cap = 0; bool compute_prev_delta = false; - unsigned long base_energy_pd; int max_spare_cap_cpu = -1; + unsigned long base_energy; + + cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); + + if (cpumask_empty(cpus)) + continue; + + /* Account thermal pressure for the energy estimation */ + cpu = cpumask_first(cpus); + cpu_thermal_cap = arch_scale_cpu_capacity(cpu); + cpu_thermal_cap -= arch_scale_thermal_pressure(cpu); + + eenv.cpu_cap = cpu_thermal_cap; + eenv.pd_cap = 0; + + for_each_cpu(cpu, cpus) { + eenv.pd_cap += cpu_thermal_cap; + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) + continue; - for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); - spare_cap = cpu_cap; - lsub_positive(&spare_cap, util); /* * Skip CPUs that cannot satisfy the capacity request. @@ -6838,15 +6940,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!fits_capacity(util, cpu_cap)) continue; + lsub_positive(&cpu_cap, util); + if (cpu == prev_cpu) { /* Always use prev_cpu as a candidate. */ compute_prev_delta = true; - } else if (spare_cap > max_spare_cap) { + } else if (cpu_cap > max_spare_cap) { /* * Find the CPU with the maximum spare capacity * in the performance domain. */ - max_spare_cap = spare_cap; + max_spare_cap = cpu_cap; max_spare_cap_cpu = cpu; } } @@ -6854,25 +6958,29 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (max_spare_cap_cpu < 0 && !compute_prev_delta) continue; + eenv_pd_busy_time(&eenv, cpus, p); /* Compute the 'base' energy of the pd, without @p */ - base_energy_pd = compute_energy(p, -1, pd); - base_energy += base_energy_pd; + base_energy = compute_energy(&eenv, pd, cpus, p, -1); /* Evaluate the energy impact of using prev_cpu. */ if (compute_prev_delta) { - prev_delta = compute_energy(p, prev_cpu, pd); - if (prev_delta < base_energy_pd) + prev_delta = compute_energy(&eenv, pd, cpus, p, + prev_cpu); + /* CPU utilization has changed */ + if (prev_delta < base_energy) goto unlock; - prev_delta -= base_energy_pd; + prev_delta -= base_energy; best_delta = min(best_delta, prev_delta); } /* Evaluate the energy impact of using max_spare_cap_cpu. */ if (max_spare_cap_cpu >= 0) { - cur_delta = compute_energy(p, max_spare_cap_cpu, pd); - if (cur_delta < base_energy_pd) + cur_delta = compute_energy(&eenv, pd, cpus, p, + max_spare_cap_cpu); + /* CPU utilization has changed */ + if (cur_delta < base_energy) goto unlock; - cur_delta -= base_energy_pd; + cur_delta -= base_energy; if (cur_delta < best_delta) { best_delta = cur_delta; best_energy_cpu = max_spare_cap_cpu; @@ -6881,12 +6989,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } rcu_read_unlock(); - /* - * Pick the best CPU if prev_cpu cannot be used, or if it saves at - * least 6% of the energy used by prev_cpu. - */ - if ((prev_delta == ULONG_MAX) || - (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) + if (best_delta < prev_delta) target = best_energy_cpu; return target; @@ -6982,6 +7085,8 @@ static void detach_entity_cfs_rq(struct sched_entity *se); */ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { + struct sched_entity *se = &p->se; + /* * As blocked tasks retain absolute vruntime the migration needs to * deal with this by subtracting the old and adding the new @@ -6989,23 +7094,9 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) * the task on the new runqueue. */ if (READ_ONCE(p->__state) == TASK_WAKING) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - do { - min_vruntime_copy = cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime = cfs_rq->min_vruntime; - } while (min_vruntime != min_vruntime_copy); -#else - min_vruntime = cfs_rq->min_vruntime; -#endif - - se->vruntime -= min_vruntime; + se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); } if (p->on_rq == TASK_ON_RQ_MIGRATING) { @@ -7014,25 +7105,29 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) * rq->lock and can modify state directly. */ lockdep_assert_rq_held(task_rq(p)); - detach_entity_cfs_rq(&p->se); + detach_entity_cfs_rq(se); } else { + remove_entity_load_avg(se); + /* - * We are supposed to update the task to "current" time, then - * its up to date and ready to go to new CPU/cfs_rq. But we - * have difficulty in getting what current time is, so simply - * throw away the out-of-date time. This will result in the - * wakee task is less decayed, but giving the wakee more load - * sounds not bad. + * Here, the task's PELT values have been updated according to + * the current rq's clock. But if that clock hasn't been + * updated in a while, a substantial idle time will be missed, + * leading to an inflation after wake-up on the new rq. + * + * Estimate the missing time from the cfs_rq last_update_time + * and update sched_avg to improve the PELT continuity after + * migration. */ - remove_entity_load_avg(&p->se); + migrate_se_pelt_lag(se); } /* Tell new CPU we are migrated */ - p->se.avg.last_update_time = 0; + se->avg.last_update_time = 0; /* We have migrated, no longer consider this task hot */ - p->se.exec_start = 0; + se->exec_start = 0; update_scan_period(p, new_cpu); } @@ -7616,8 +7711,8 @@ enum group_type { */ group_fully_busy, /* - * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity - * and must be migrated to a more powerful CPU. + * One task doesn't fit with CPU's capacity and must be migrated to a + * more powerful CPU. */ group_misfit_task, /* @@ -8198,6 +8293,9 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq); + if (cfs_rq->nr_running == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); + if (cfs_rq == &rq->cfs) decayed = true; } @@ -8531,7 +8629,7 @@ static inline int sg_imbalanced(struct sched_group *group) /* * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. - * We consider that a group has spare capacity if the * number of task is + * We consider that a group has spare capacity if the number of task is * smaller than the number of CPUs or if the utilization is lower than the * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into @@ -8700,6 +8798,19 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); } +static inline bool +sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) +{ + /* + * When there is more than 1 task, the group_overloaded case already + * takes care of cpu with reduced capacity + */ + if (rq->cfs.h_nr_running != 1) + return false; + + return check_cpu_capacity(rq, sd); +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -8722,8 +8833,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); + unsigned long load = cpu_load(rq); - sgs->group_load += cpu_load(rq); + sgs->group_load += load; sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); sgs->sum_h_nr_running += rq->cfs.h_nr_running; @@ -8753,11 +8865,17 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (local_group) continue; - /* Check for a misfit task on the cpu */ - if (env->sd->flags & SD_ASYM_CPUCAPACITY && - sgs->group_misfit_task_load < rq->misfit_task_load) { - sgs->group_misfit_task_load = rq->misfit_task_load; - *sg_status |= SG_OVERLOAD; + if (env->sd->flags & SD_ASYM_CPUCAPACITY) { + /* Check for a misfit task on the cpu */ + if (sgs->group_misfit_task_load < rq->misfit_task_load) { + sgs->group_misfit_task_load = rq->misfit_task_load; + *sg_status |= SG_OVERLOAD; + } + } else if ((env->idle != CPU_NOT_IDLE) && + sched_reduced_capacity(rq, env->sd)) { + /* Check for a task running on a CPU with reduced capacity */ + if (sgs->group_misfit_task_load < load) + sgs->group_misfit_task_load = load; } } @@ -8810,7 +8928,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, * CPUs in the group should either be possible to resolve * internally or be covered by avg_load imbalance (eventually). */ - if (sgs->group_type == group_misfit_task && + if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && + (sgs->group_type == group_misfit_task) && (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || sds->local_stat.group_type != group_has_spare)) return false; @@ -9089,16 +9208,6 @@ static bool update_pick_idlest(struct sched_group *idlest, } /* - * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. - * This is an approximation as the number of running tasks may not be - * related to the number of busy CPUs due to sched_setaffinity. - */ -static inline bool allow_numa_imbalance(int running, int imb_numa_nr) -{ - return running <= imb_numa_nr; -} - -/* * find_idlest_group() finds and returns the least busy CPU group within the * domain. * @@ -9214,7 +9323,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) break; case group_has_spare: +#ifdef CONFIG_NUMA if (sd->flags & SD_NUMA) { + int imb_numa_nr = sd->imb_numa_nr; #ifdef CONFIG_NUMA_BALANCING int idlest_cpu; /* @@ -9227,17 +9338,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) idlest_cpu = cpumask_first(sched_group_span(idlest)); if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) return idlest; -#endif +#endif /* CONFIG_NUMA_BALANCING */ /* * Otherwise, keep the task close to the wakeup source * and improve locality if the number of running tasks * would remain below threshold where an imbalance is - * allowed. If there is a real need of migration, - * periodic load balance will take care of it. + * allowed while accounting for the possibility the + * task is pinned to a subset of CPUs. If there is a + * real need of migration, periodic load balance will + * take care of it. */ - if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr)) + if (p->nr_cpus_allowed != NR_CPUS) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); + + cpumask_and(cpus, sched_group_span(local), p->cpus_ptr); + imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr); + } + + imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus); + if (!adjust_numa_imbalance(imbalance, + local_sgs.sum_nr_running + 1, + imb_numa_nr)) { return NULL; + } } +#endif /* CONFIG_NUMA */ /* * Select group with highest number of idle CPUs. We could also @@ -9253,6 +9378,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; } +static void update_idle_cpu_scan(struct lb_env *env, + unsigned long sum_util) +{ + struct sched_domain_shared *sd_share; + int llc_weight, pct; + u64 x, y, tmp; + /* + * Update the number of CPUs to scan in LLC domain, which could + * be used as a hint in select_idle_cpu(). The update of sd_share + * could be expensive because it is within a shared cache line. + * So the write of this hint only occurs during periodic load + * balancing, rather than CPU_NEWLY_IDLE, because the latter + * can fire way more frequently than the former. + */ + if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) + return; + + llc_weight = per_cpu(sd_llc_size, env->dst_cpu); + if (env->sd->span_weight != llc_weight) + return; + + sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); + if (!sd_share) + return; + + /* + * The number of CPUs to search drops as sum_util increases, when + * sum_util hits 85% or above, the scan stops. + * The reason to choose 85% as the threshold is because this is the + * imbalance_pct(117) when a LLC sched group is overloaded. + * + * let y = SCHED_CAPACITY_SCALE - p * x^2 [1] + * and y'= y / SCHED_CAPACITY_SCALE + * + * x is the ratio of sum_util compared to the CPU capacity: + * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE) + * y' is the ratio of CPUs to be scanned in the LLC domain, + * and the number of CPUs to scan is calculated by: + * + * nr_scan = llc_weight * y' [2] + * + * When x hits the threshold of overloaded, AKA, when + * x = 100 / pct, y drops to 0. According to [1], + * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000 + * + * Scale x by SCHED_CAPACITY_SCALE: + * x' = sum_util / llc_weight; [3] + * + * and finally [1] becomes: + * y = SCHED_CAPACITY_SCALE - + * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4] + * + */ + /* equation [3] */ + x = sum_util; + do_div(x, llc_weight); + + /* equation [4] */ + pct = env->sd->imbalance_pct; + tmp = x * x * pct * pct; + do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); + tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); + y = SCHED_CAPACITY_SCALE - tmp; + + /* equation [2] */ + y *= llc_weight; + do_div(y, SCHED_CAPACITY_SCALE); + if ((int)y != sd_share->nr_idle_scan) + WRITE_ONCE(sd_share->nr_idle_scan, (int)y); +} + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -9265,6 +9461,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; + unsigned long sum_util = 0; int sg_status = 0; do { @@ -9297,6 +9494,7 @@ next_group: sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; + sum_util += sgs->group_util; sg = sg->next; } while (sg != env->sd->groups); @@ -9322,24 +9520,8 @@ next_group: WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); } -} -#define NUMA_IMBALANCE_MIN 2 - -static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int imb_numa_nr) -{ - if (!allow_numa_imbalance(dst_running, imb_numa_nr)) - return imbalance; - - /* - * Allow a small imbalance based on a simple pair of communicating - * tasks that remain local when the destination is lightly loaded. - */ - if (imbalance <= NUMA_IMBALANCE_MIN) - return 0; - - return imbalance; + update_idle_cpu_scan(env, sum_util); } /** @@ -9356,9 +9538,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s busiest = &sds->busiest_stat; if (busiest->group_type == group_misfit_task) { - /* Set imbalance to allow misfit tasks to be balanced. */ - env->migration_type = migrate_misfit; - env->imbalance = 1; + if (env->sd->flags & SD_ASYM_CPUCAPACITY) { + /* Set imbalance to allow misfit tasks to be balanced. */ + env->migration_type = migrate_misfit; + env->imbalance = 1; + } else { + /* + * Set load imbalance to allow moving task from cpu + * with reduced capacity. + */ + env->migration_type = migrate_load; + env->imbalance = busiest->group_misfit_task_load; + } return; } @@ -9426,7 +9617,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ env->migration_type = migrate_task; lsub_positive(&nr_diff, local->sum_nr_running); - env->imbalance = nr_diff >> 1; + env->imbalance = nr_diff; } else { /* @@ -9434,15 +9625,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * idle cpus. */ env->migration_type = migrate_task; - env->imbalance = max_t(long, 0, (local->idle_cpus - - busiest->idle_cpus) >> 1); + env->imbalance = max_t(long, 0, + (local->idle_cpus - busiest->idle_cpus)); } +#ifdef CONFIG_NUMA /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - local->sum_nr_running + 1, env->sd->imb_numa_nr); + local->sum_nr_running + 1, + env->sd->imb_numa_nr); } +#endif + + /* Number of tasks to move to restore balance */ + env->imbalance >>= 1; return; } @@ -9460,8 +9657,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / local->group_capacity; - sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / - sds->total_capacity; /* * If the local group is more loaded than the selected * busiest group don't try to pull any tasks. @@ -9470,6 +9665,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->imbalance = 0; return; } + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; } /* @@ -9495,7 +9693,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced - * misfit_task force N/A N/A N/A force force + * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load @@ -9864,9 +10062,15 @@ static int should_we_balance(struct lb_env *env) /* * In the newly idle case, we will allow all the CPUs * to do the newly idle load balance. + * + * However, we bail out if we already have tasks or a wakeup pending, + * to optimize wakeup latency. */ - if (env->idle == CPU_NEWLY_IDLE) + if (env->idle == CPU_NEWLY_IDLE) { + if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending) + return 0; return 1; + } /* Try to find first idle CPU */ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { @@ -11317,9 +11521,13 @@ static inline bool vruntime_normalized(struct task_struct *p) */ static void propagate_entity_cfs_rq(struct sched_entity *se) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + return; - list_add_leaf_cfs_rq(cfs_rq_of(se)); + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ se = se->parent; @@ -11327,14 +11535,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - if (!cfs_rq_throttled(cfs_rq)){ - update_load_avg(cfs_rq, se, UPDATE_TG); - list_add_leaf_cfs_rq(cfs_rq); - continue; - } + update_load_avg(cfs_rq, se, UPDATE_TG); - if (list_add_leaf_cfs_rq(cfs_rq)) + if (cfs_rq_throttled(cfs_rq)) break; + + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); } } #else @@ -11452,10 +11659,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifndef CONFIG_64BIT - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif + u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif @@ -11881,101 +12085,3 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } - -/* - * Helper functions to facilitate extracting info from tracepoints. - */ - -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) -{ -#ifdef CONFIG_SMP - return cfs_rq ? &cfs_rq->avg : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); - -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) -{ - if (!cfs_rq) { - if (str) - strlcpy(str, "(null)", len); - else - return NULL; - } - - cfs_rq_tg_path(cfs_rq, str, len); - return str; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); - -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) -{ - return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); - -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_rt : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); - -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_dl : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); - -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) - return rq ? &rq->avg_irq : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); - -int sched_trace_rq_cpu(struct rq *rq) -{ - return rq ? cpu_of(rq) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); - -int sched_trace_rq_cpu_capacity(struct rq *rq) -{ - return rq ? -#ifdef CONFIG_SMP - rq->cpu_capacity -#else - SCHED_CAPACITY_SCALE -#endif - : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); - -const struct cpumask *sched_trace_rd_span(struct root_domain *rd) -{ -#ifdef CONFIG_SMP - return rd ? rd->span : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rd_span); - -int sched_trace_rq_nr_running(struct rq *rq) -{ - return rq ? rq->nr_running : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1cf435bbcd9c..ee7f23c76bd3 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -60,7 +60,8 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_PROP, true) +SCHED_FEAT(SIS_PROP, false) +SCHED_FEAT(SIS_UTIL, true) /* * Issue a WARN when we do multiple update_rq_clock() calls diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f8b5020e76a..f26ab2675f7d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -53,14 +53,14 @@ static noinline int __cpuidle cpu_idle_poll(void) { trace_cpu_idle(0, smp_processor_id()); stop_critical_timings(); - rcu_idle_enter(); + ct_idle_enter(); local_irq_enable(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); - rcu_idle_exit(); + ct_idle_exit(); start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); @@ -98,12 +98,12 @@ void __cpuidle default_idle_call(void) * * Trace IRQs enable here, then switch off RCU, and have * arch_cpu_idle() use raw_local_irq_enable(). Note that - * rcu_idle_enter() relies on lockdep IRQ state, so switch that + * ct_idle_enter() relies on lockdep IRQ state, so switch that * last -- this is very similar to the entry code. */ trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(_THIS_IP_); - rcu_idle_enter(); + lockdep_hardirqs_on_prepare(); + ct_idle_enter(); lockdep_hardirqs_on(_THIS_IP_); arch_cpu_idle(); @@ -116,7 +116,7 @@ void __cpuidle default_idle_call(void) */ raw_local_irq_disable(); lockdep_hardirqs_off(_THIS_IP_); - rcu_idle_exit(); + ct_idle_exit(); lockdep_hardirqs_on(_THIS_IP_); raw_local_irq_enable(); @@ -327,7 +327,7 @@ static void do_idle(void) * RCU relies on this call to be done outside of an RCU read-side * critical section. */ - flush_smp_call_function_from_idle(); + flush_smp_call_function_queue(); schedule_idle(); if (unlikely(klp_patch_pending(current))) @@ -434,7 +434,6 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir { update_idle_core(rq); schedstat_inc(rq->sched_goidle); - queue_core_balance(rq); } #ifdef CONFIG_SMP diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index c336f5f481bc..3a0e0dc28721 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -61,6 +61,25 @@ static inline void cfs_se_util_change(struct sched_avg *avg) WRITE_ONCE(avg->util_est.enqueued, enqueued); } +static inline u64 rq_clock_pelt(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock_pelt - rq->lost_idle_time; +} + +/* The rq is idle, we can sync to clock_task */ +static inline void _update_idle_rq_clock_pelt(struct rq *rq) +{ + rq->clock_pelt = rq_clock_task(rq); + + u64_u32_store(rq->clock_idle, rq_clock(rq)); + /* Paired with smp_rmb in migrate_se_pelt_lag() */ + smp_wmb(); + u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq)); +} + /* * The clock_pelt scales the time to reflect the effective amount of * computation done during the running delta time but then sync back to @@ -76,8 +95,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg) static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) { if (unlikely(is_idle_task(rq->curr))) { - /* The rq is idle, we can sync to clock_task */ - rq->clock_pelt = rq_clock_task(rq); + _update_idle_rq_clock_pelt(rq); return; } @@ -130,26 +148,33 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq) */ if (util_sum >= divider) rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; + + _update_idle_rq_clock_pelt(rq); } -static inline u64 rq_clock_pelt(struct rq *rq) +#ifdef CONFIG_CFS_BANDWIDTH +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { - lockdep_assert_rq_held(rq); - assert_clock_updated(rq); + u64 throttled; - return rq->clock_pelt - rq->lost_idle_time; + if (unlikely(cfs_rq->throttle_count)) + throttled = U64_MAX; + else + throttled = cfs_rq->throttled_clock_pelt_time; + + u64_u32_store(cfs_rq->throttled_pelt_idle, throttled); } -#ifdef CONFIG_CFS_BANDWIDTH /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; - return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } #else +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { return rq_clock_pelt(rq_of(cfs_rq)); @@ -204,6 +229,7 @@ update_rq_clock_pelt(struct rq *rq, s64 delta) { } static inline void update_idle_rq_clock_pelt(struct rq *rq) { } +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } #endif diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a4fa3aadfcba..7f6030091aee 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -190,12 +190,8 @@ static void group_init(struct psi_group *group) /* Init trigger-related members */ mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); - memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); - group->poll_states = 0; group->poll_min_period = U32_MAX; - memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; - group->polling_until = 0; init_waitqueue_head(&group->poll_wait); timer_setup(&group->poll_timer, poll_timer_fn, 0); rcu_assign_pointer(group->poll_task, NULL); @@ -921,6 +917,7 @@ void psi_memstall_enter(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_enter); /** * psi_memstall_leave - mark the end of an memory stall section @@ -950,6 +947,7 @@ void psi_memstall_leave(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) @@ -957,10 +955,16 @@ int psi_cgroup_alloc(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return 0; - cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); - if (!cgroup->psi.pcpu) + cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); + if (!cgroup->psi) return -ENOMEM; - group_init(&cgroup->psi); + + cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi->pcpu) { + kfree(cgroup->psi); + return -ENOMEM; + } + group_init(cgroup->psi); return 0; } @@ -969,10 +973,11 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return; - cancel_delayed_work_sync(&cgroup->psi.avgs_work); - free_percpu(cgroup->psi.pcpu); + cancel_delayed_work_sync(&cgroup->psi->avgs_work); + free_percpu(cgroup->psi->pcpu); /* All triggers must be removed by now */ - WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); + WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n"); + kfree(cgroup->psi); } /** @@ -1060,14 +1065,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) mutex_unlock(&group->avgs_lock); for (full = 0; full < 2; full++) { - unsigned long avg[3]; - u64 total; + unsigned long avg[3] = { 0, }; + u64 total = 0; int w; - for (w = 0; w < 3; w++) - avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[PSI_AVGS][res * 2 + full], - NSEC_PER_USEC); + /* CPU FULL is undefined at the system level */ + if (!(group == &psi_system && res == PSI_CPU && full)) { + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); + } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some", @@ -1081,7 +1089,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res) + char *buf, enum psi_res res) { struct psi_trigger *t; enum psi_states state; @@ -1117,7 +1125,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->state = state; t->threshold = threshold_us * NSEC_PER_USEC; t->win.size = window_us * NSEC_PER_USEC; - window_reset(&t->win, 0, 0, 0); + window_reset(&t->win, sched_clock(), + group->total[PSI_POLL][t->state], 0); t->event = 0; t->last_event_time = 0; @@ -1309,7 +1318,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, return -EBUSY; } - new = psi_trigger_create(&psi_system, buf, nbytes, res); + new = psi_trigger_create(&psi_system, buf, res); if (IS_ERR(new)) { mutex_unlock(&seq->lock); return PTR_ERR(new); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a32c46889af8..55f39c8f4203 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,7 +5,6 @@ */ int sched_rr_timeslice = RR_TIMESLICE; -int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; @@ -13,6 +12,57 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); struct rt_bandwidth def_rt_bandwidth; +/* + * period over which we measure -rt task CPU usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + +#ifdef CONFIG_SYSCTL +static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +static struct ctl_table sched_rt_sysctls[] = { + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sysctl_sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, + {} +}; + +static int __init sched_rt_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_rt_sysctls); + return 0; +} +late_initcall(sched_rt_sysctl_init); +#endif + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = @@ -430,7 +480,7 @@ static inline void rt_queue_push_tasks(struct rq *rq) #endif /* CONFIG_SMP */ static void enqueue_top_rt_rq(struct rt_rq *rt_rq); -static void dequeue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { @@ -551,7 +601,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; if (!rt_se) { - dequeue_top_rt_rq(rt_rq); + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); } @@ -637,7 +687,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { - dequeue_top_rt_rq(rt_rq); + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -871,6 +921,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + struct rq_flags rf; int skip; /* @@ -885,7 +936,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (skip) continue; - raw_spin_rq_lock(rq); + rq_lock(rq, &rf); update_rq_clock(rq); if (rt_rq->rt_time) { @@ -923,7 +974,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (enqueue) sched_rt_rq_enqueue(rt_rq); - raw_spin_rq_unlock(rq); + rq_unlock(rq, &rf); } if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) @@ -1038,7 +1089,7 @@ static void update_curr_rt(struct rq *rq) } static void -dequeue_top_rt_rq(struct rt_rq *rt_rq) +dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count) { struct rq *rq = rq_of_rt_rq(rt_rq); @@ -1049,7 +1100,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) BUG_ON(!rq->nr_running); - sub_nr_running(rq, rt_rq->rt_nr_running); + sub_nr_running(rq, count); rt_rq->rt_queued = 0; } @@ -1435,18 +1486,21 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; + unsigned int rt_nr_running; for_each_sched_rt_entity(rt_se) { rt_se->back = back; back = rt_se; } - dequeue_top_rt_rq(rt_rq_of_se(back)); + rt_nr_running = rt_rq_of_se(back)->rt_nr_running; for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) __dequeue_rt_entity(rt_se, flags); } + + dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running); } static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) @@ -2861,6 +2915,7 @@ long sched_group_rt_period(struct task_group *tg) return rt_period_us; } +#ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { int ret = 0; @@ -2871,6 +2926,7 @@ static int sched_rt_global_constraints(void) return ret; } +#endif /* CONFIG_SYSCTL */ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { @@ -2882,6 +2938,8 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) } #else /* !CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { unsigned long flags; @@ -2899,8 +2957,10 @@ static int sched_rt_global_constraints(void) return 0; } +#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) { if (sysctl_sched_rt_period <= 0) @@ -2925,7 +2985,7 @@ static void sched_rt_do_global(void) raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } -int sched_rt_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_period, old_runtime; @@ -2964,7 +3024,7 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2985,6 +3045,7 @@ int sched_rr_handler(struct ctl_table *table, int write, void *buffer, return ret; } +#endif /* CONFIG_SYSCTL */ #ifdef CONFIG_SCHED_DEBUG void print_rt_stats(struct seq_file *m, int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 58263f90c559..e26688d387ae 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -27,6 +27,7 @@ #include <linux/capability.h> #include <linux/cgroup_api.h> #include <linux/cgroup.h> +#include <linux/context_tracking.h> #include <linux/cpufreq.h> #include <linux/cpumask_api.h> #include <linux/ctype.h> @@ -108,10 +109,17 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; +extern unsigned int sysctl_sched_child_runs_first; + extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); extern void call_trace_sched_update_nr_running(struct rq *rq, int count); + +extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_runtime; +extern int sched_rr_timeslice; + /* * Helpers for converting nanosecond timing to jiffy resolution */ @@ -473,9 +481,6 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); -extern void unregister_rt_sched_group(struct task_group *tg); -extern void free_rt_sched_group(struct task_group *tg); -extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); @@ -513,6 +518,49 @@ struct cfs_bandwidth { }; #endif /* CONFIG_CGROUP_SCHED */ +extern void unregister_rt_sched_group(struct task_group *tg); +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); + +/* + * u64_u32_load/u64_u32_store + * + * Use a copy of a u64 value to protect against data race. This is only + * applicable for 32-bits architectures. + */ +#ifdef CONFIG_64BIT +# define u64_u32_load_copy(var, copy) var +# define u64_u32_store_copy(var, copy, val) (var = val) +#else +# define u64_u32_load_copy(var, copy) \ +({ \ + u64 __val, __val_copy; \ + do { \ + __val_copy = copy; \ + /* \ + * paired with u64_u32_store_copy(), ordering access \ + * to var and copy. \ + */ \ + smp_rmb(); \ + __val = var; \ + } while (__val != __val_copy); \ + __val; \ +}) +# define u64_u32_store_copy(var, copy, val) \ +do { \ + typeof(val) __val = (val); \ + var = __val; \ + /* \ + * paired with u64_u32_load_copy(), ordering access to var and \ + * copy. \ + */ \ + smp_wmb(); \ + copy = __val; \ +} while (0) +#endif +# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) +# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; @@ -553,7 +601,7 @@ struct cfs_rq { */ struct sched_avg avg; #ifndef CONFIG_64BIT - u64 load_last_update_time_copy; + u64 last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; @@ -602,9 +650,13 @@ struct cfs_rq { int runtime_enabled; s64 runtime_remaining; + u64 throttled_pelt_idle; +#ifndef CONFIG_64BIT + u64 throttled_pelt_idle_copy; +#endif u64 throttled_clock; - u64 throttled_clock_task; - u64 throttled_clock_task_time; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; int throttled; int throttle_count; struct list_head throttled_list; @@ -974,6 +1026,12 @@ struct rq { u64 clock_task ____cacheline_aligned; u64 clock_pelt; unsigned long lost_idle_time; + u64 clock_pelt_idle; + u64 clock_idle; +#ifndef CONFIG_64BIT + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; +#endif atomic_t nr_iowait; @@ -1232,8 +1290,6 @@ static inline bool sched_group_cookie_match(struct rq *rq, return false; } -extern void queue_core_balance(struct rq *rq); - static inline bool sched_core_enqueued(struct task_struct *p) { return !RB_EMPTY_NODE(&p->core_node); @@ -1267,10 +1323,6 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; } -static inline void queue_core_balance(struct rq *rq) -{ -} - static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) { return true; @@ -1692,6 +1744,11 @@ queue_balance_callback(struct rq *rq, { lockdep_assert_rq_held(rq); + /* + * Don't (re)queue an already queued item; nor queue anything when + * balance_push() is active, see the comment with + * balance_push_callback. + */ if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; @@ -1809,15 +1866,6 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) return to_cpumask(sg->sgc->cpumask); } -/** - * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. - * @group: The group whose first CPU is to be returned. - */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ - return cpumask_first(sched_group_span(group)); -} - extern int group_balance_cpu(struct sched_group *sg); #ifdef CONFIG_SCHED_DEBUG @@ -1833,12 +1881,7 @@ static inline void dirty_sched_domain_sysctl(int cpu) #endif extern int sched_update_scaling(void); - -extern void flush_smp_call_function_from_idle(void); - -#else /* !CONFIG_SMP: */ -static inline void flush_smp_call_function_from_idle(void) { } -#endif +#endif /* CONFIG_SMP */ #include "stats.h" @@ -2043,7 +2086,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ -#define WF_ON_CPU 0x40 /* Wakee is on_cpu */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2188,6 +2230,8 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) * * include/asm-generic/vmlinux.lds.h * + * *CAREFUL* they are laid out in *REVERSE* order!!! + * * Also enforce alignment on the instance, not the type, to guarantee layout. */ #define DEFINE_SCHED_CLASS(name) \ @@ -2196,17 +2240,16 @@ const struct sched_class name##_sched_class \ __section("__" #name "_sched_class") /* Defined in include/asm-generic/vmlinux.lds.h */ -extern struct sched_class __begin_sched_classes[]; -extern struct sched_class __end_sched_classes[]; - -#define sched_class_highest (__end_sched_classes - 1) -#define sched_class_lowest (__begin_sched_classes - 1) +extern struct sched_class __sched_class_highest[]; +extern struct sched_class __sched_class_lowest[]; #define for_class_range(class, _from, _to) \ - for (class = (_from); class != (_to); class--) + for (class = (_from); class < (_to); class++) #define for_each_class(class) \ - for_class_range(class, sched_class_highest, sched_class_lowest) + for_class_range(class, __sched_class_highest, __sched_class_lowest) + +#define sched_class_above(_a, _b) ((_a) < (_b)) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ -2315,6 +2358,7 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); @@ -2484,6 +2528,24 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +#ifdef CONFIG_SCHED_DEBUG +/* + * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to + * acquire rq lock instead of rq_lock(). So at the end of these two functions + * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of + * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning. + */ +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) +{ + rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ +#ifdef CONFIG_SMP + rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); +#endif +} +#else +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} +#endif #ifdef CONFIG_SMP @@ -2549,14 +2611,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - if (__rq_lockp(this_rq) == __rq_lockp(busiest)) - return 0; - - if (likely(raw_spin_rq_trylock(busiest))) + if (__rq_lockp(this_rq) == __rq_lockp(busiest) || + likely(raw_spin_rq_trylock(busiest))) { + double_rq_clock_clear_update(this_rq, busiest); return 0; + } if (rq_order_less(this_rq, busiest)) { raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); + double_rq_clock_clear_update(this_rq, busiest); return 0; } @@ -2650,6 +2713,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) BUG_ON(rq1 != rq2); raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ + double_rq_clock_clear_update(rq1, rq2); } /* @@ -2829,7 +2893,7 @@ enum cpu_util_type { }; unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum cpu_util_type type, + enum cpu_util_type type, struct task_struct *p); static inline unsigned long cpu_bw_dl(struct rq *rq) diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 9620e323162c..2eb23dd0f285 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -7,3 +7,9 @@ extern void sched_ttwu_pending(void *arg); extern void send_call_function_single_ipi(int cpu); + +#ifdef CONFIG_SMP +extern void flush_smp_call_function_queue(void); +#else +static inline void flush_smp_call_function_queue(void) { } +#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 810750e62118..8739c2a5a54e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -206,7 +206,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_STATIC_KEY_FALSE(sched_energy_present); -unsigned int sysctl_sched_energy_aware = 1; +static unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; @@ -220,7 +220,7 @@ void rebuild_sched_domains_energy(void) } #ifdef CONFIG_PROC_SYSCTL -int sched_energy_aware_handler(struct ctl_table *table, int write, +static int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; @@ -237,6 +237,27 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, return ret; } + +static struct ctl_table sched_energy_aware_sysctls[] = { + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init sched_energy_aware_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_energy_aware_sysctls); + return 0; +} + +late_initcall(sched_energy_aware_sysctl_init); #endif static void free_pd(struct perf_domain *pd) @@ -2295,23 +2316,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* * For a single LLC per node, allow an - * imbalance up to 25% of the node. This is an - * arbitrary cutoff based on SMT-2 to balance - * between memory bandwidth and avoiding - * premature sharing of HT resources and SMT-4 - * or SMT-8 *may* benefit from a different - * cutoff. + * imbalance up to 12.5% of the node. This is + * arbitrary cutoff based two factors -- SMT and + * memory channels. For SMT-2, the intent is to + * avoid premature sharing of HT resources but + * SMT-4 or SMT-8 *may* benefit from a different + * cutoff. For memory channels, this is a very + * rough estimate of how many channels may be + * active and is based on recent CPUs with + * many cores. * * For multiple LLCs, allow an imbalance * until multiple tasks would share an LLC * on one node while LLCs on another node - * remain idle. + * remain idle. This assumes that there are + * enough logical CPUs per LLC to avoid SMT + * factors and that there is a correlation + * between LLCs and memory channels. */ nr_llcs = sd->span_weight / child->span_weight; if (nr_llcs == 1) - imb = sd->span_weight >> 2; + imb = sd->span_weight >> 3; else imb = nr_llcs; + imb = max(1U, imb); sd->imb_numa_nr = imb; /* Set span based on the first NUMA domain. */ diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index d4788f810b55..0b1cd985dc27 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) ret = (*action)(&wbq_entry->key, mode); - } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); finish_wait(wq_head, &wbq_entry->wq_entry); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b5ac87f6dbd4..e9852d1b4a5e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -200,6 +200,8 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter) * the filter can be freed. * @cache: cache of arch/syscall mappings to actions * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged + * @wait_killable_recv: Put notifying process in killable state once the + * notification is received by the userspace listener. * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @notif: the struct that holds all notification related information @@ -220,6 +222,7 @@ struct seccomp_filter { refcount_t refs; refcount_t users; bool log; + bool wait_killable_recv; struct action_cache cache; struct seccomp_filter *prev; struct bpf_prog *prog; @@ -893,6 +896,10 @@ static long seccomp_attach_filter(unsigned int flags, if (flags & SECCOMP_FILTER_FLAG_LOG) filter->log = true; + /* Set wait killable flag, if present. */ + if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) + filter->wait_killable_recv = true; + /* * If there is an existing filter, make it the prev and don't drop its * task reference. @@ -1080,6 +1087,12 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn complete(&addfd->completion); } +static bool should_sleep_killable(struct seccomp_filter *match, + struct seccomp_knotif *n) +{ + return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT; +} + static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) @@ -1100,7 +1113,7 @@ static int seccomp_do_user_notification(int this_syscall, n.data = sd; n.id = seccomp_next_notify_id(match); init_completion(&n.ready); - list_add(&n.list, &match->notif->notifications); + list_add_tail(&n.list, &match->notif->notifications); INIT_LIST_HEAD(&n.addfd); up(&match->notif->request); @@ -1110,11 +1123,25 @@ static int seccomp_do_user_notification(int this_syscall, * This is where we wait for a reply from userspace. */ do { + bool wait_killable = should_sleep_killable(match, &n); + mutex_unlock(&match->notify_lock); - err = wait_for_completion_interruptible(&n.ready); + if (wait_killable) + err = wait_for_completion_killable(&n.ready); + else + err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); - if (err != 0) + + if (err != 0) { + /* + * Check to see if the notifcation got picked up and + * whether we should switch to wait killable. + */ + if (!wait_killable && should_sleep_killable(match, &n)) + continue; + goto interrupted; + } addfd = list_first_entry_or_null(&n.addfd, struct seccomp_kaddfd, list); @@ -1484,6 +1511,9 @@ out: mutex_lock(&filter->notify_lock); knotif = find_notification(filter, unotif.id); if (knotif) { + /* Reset the process to make sure it's not stuck */ + if (should_sleep_killable(filter, knotif)) + complete(&knotif->ready); knotif->state = SECCOMP_NOTIFY_INIT; up(&filter->notif->request); } @@ -1829,6 +1859,14 @@ static long seccomp_set_mode_filter(unsigned int flags, ((flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) == 0)) return -EINVAL; + /* + * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense + * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag. + */ + if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) && + ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0)) + return -EINVAL; + /* Prepare the new filter before holding any locks. */ prepared = seccomp_prepare_user_filter(filter); if (IS_ERR(prepared)) diff --git a/kernel/signal.c b/kernel/signal.c index 30cd1ca43bcd..8a0f114d00e0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -762,7 +762,10 @@ still_pending: */ void signal_wake_up_state(struct task_struct *t, unsigned int state) { + lockdep_assert_held(&t->sighand->siglock); + set_tsk_thread_flag(t, TIF_SIGPENDING); + /* * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it @@ -884,7 +887,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, static void ptrace_trap_notify(struct task_struct *t) { WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); - assert_spin_locked(&t->sighand->siglock); + lockdep_assert_held(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); @@ -910,8 +913,9 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) if (signal->core_state) return sig == SIGKILL; /* - * The process is in the middle of dying, nothing to do. + * The process is in the middle of dying, drop the signal. */ + return false; } else if (sig_kernel_stop(sig)) { /* * This is a stop signal. Remove SIGCONT from all queues. @@ -930,9 +934,10 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) for_each_thread(p, t) { flush_sigqueue_mask(&flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); - if (likely(!(t->ptrace & PT_SEIZED))) + if (likely(!(t->ptrace & PT_SEIZED))) { + t->jobctl &= ~JOBCTL_STOPPED; wake_up_state(t, __TASK_STOPPED); - else + } else ptrace_trap_notify(t); } @@ -1071,15 +1076,15 @@ static inline bool legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } -static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, - enum pid_type type, bool force) +static int __send_signal_locked(int sig, struct kernel_siginfo *info, + struct task_struct *t, enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; int override_rlimit; int ret = 0, result; - assert_spin_locked(&t->sighand->siglock); + lockdep_assert_held(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, force)) @@ -1212,8 +1217,8 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) return ret; } -static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, - enum pid_type type) +int send_signal_locked(int sig, struct kernel_siginfo *info, + struct task_struct *t, enum pid_type type) { /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ bool force = false; @@ -1245,7 +1250,7 @@ static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct force = true; } } - return __send_signal(sig, info, t, type, force); + return __send_signal_locked(sig, info, t, type, force); } static void print_fatal_signal(int signr) @@ -1281,12 +1286,6 @@ static int __init setup_print_fatal_signals(char *str) __setup("print-fatal-signals=", setup_print_fatal_signals); -int -__group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) -{ - return send_signal(sig, info, p, PIDTYPE_TGID); -} - int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { @@ -1294,7 +1293,7 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, type); + ret = send_signal_locked(sig, info, p, type); unlock_task_sighand(p, &flags); } @@ -1347,7 +1346,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, if (action->sa.sa_handler == SIG_DFL && (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; - ret = send_signal(sig, info, t, PIDTYPE_PID); + ret = send_signal_locked(sig, info, t, PIDTYPE_PID); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; @@ -1567,7 +1566,7 @@ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false); + ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false); unlock_task_sighand(p, &flags); } else ret = -ESRCH; @@ -1805,7 +1804,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) } #endif -int force_sig_perf(void __user *addr, u32 type, u64 sig_data) +int send_sig_perf(void __user *addr, u32 type, u64 sig_data) { struct kernel_siginfo info; @@ -1817,7 +1816,18 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) info.si_perf_data = sig_data; info.si_perf_type = type; - return force_sig_info(&info); + /* + * Signals generated by perf events should not terminate the whole + * process if SIGTRAP is blocked, however, delivering the signal + * asynchronously is better than not delivering at all. But tell user + * space if the signal was asynchronous, so it can clearly be + * distinguished from normal synchronous ones. + */ + info.si_perf_flags = sigismember(¤t->blocked, info.si_signo) ? + TRAP_PERF_FLAG_ASYNC : + 0; + + return send_sig_info(info.si_signo, &info, current); } /** @@ -2020,12 +2030,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig) bool autoreap = false; u64 utime, stime; - BUG_ON(sig == -1); + WARN_ON_ONCE(sig == -1); - /* do_notify_parent_cldstop should have been called instead. */ - BUG_ON(task_is_stopped_or_traced(tsk)); + /* do_notify_parent_cldstop should have been called instead. */ + WARN_ON_ONCE(task_is_stopped_or_traced(tsk)); - BUG_ON(!tsk->ptrace && + WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); /* Wake up all pidfd waiters */ @@ -2103,7 +2113,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) * parent's namespaces. */ if (valid_signal(sig) && sig) - __send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false); + __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); @@ -2173,7 +2183,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, spin_lock_irqsave(&sighand->siglock, flags); if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) - __group_send_sig_info(SIGCHLD, &info, parent); + send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID); /* * Even if SIGCHLD is not generated, we must wake up wait4 calls. */ @@ -2193,13 +2203,12 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * with. If the code did not stop because the tracer is gone, * the stop signal remains unchanged unless clear_code. */ -static int ptrace_stop(int exit_code, int why, int clear_code, - unsigned long message, kernel_siginfo_t *info) +static int ptrace_stop(int exit_code, int why, unsigned long message, + kernel_siginfo_t *info) __releases(¤t->sighand->siglock) __acquires(¤t->sighand->siglock) { bool gstop_done = false; - bool read_code = true; if (arch_ptrace_stop_needed()) { /* @@ -2216,10 +2225,16 @@ static int ptrace_stop(int exit_code, int why, int clear_code, } /* - * schedule() will not sleep if there is a pending signal that - * can awaken the task. + * After this point ptrace_signal_wake_up or signal_wake_up + * will clear TASK_TRACED if ptrace_unlink happens or a fatal + * signal comes in. Handle previous ptrace_unlinks and fatal + * signals here to prevent ptrace_stop sleeping in schedule. */ + if (!current->ptrace || __fatal_signal_pending(current)) + return exit_code; + set_special_state(TASK_TRACED); + current->jobctl |= JOBCTL_TRACED; /* * We're committing to trapping. TRACED should be visible before @@ -2265,54 +2280,33 @@ static int ptrace_stop(int exit_code, int why, int clear_code, spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); - if (likely(current->ptrace)) { - /* - * Notify parents of the stop. - * - * While ptraced, there are two parents - the ptracer and - * the real_parent of the group_leader. The ptracer should - * know about every stop while the real parent is only - * interested in the completion of group stop. The states - * for the two don't interact with each other. Notify - * separately unless they're gonna be duplicates. - */ + /* + * Notify parents of the stop. + * + * While ptraced, there are two parents - the ptracer and + * the real_parent of the group_leader. The ptracer should + * know about every stop while the real parent is only + * interested in the completion of group stop. The states + * for the two don't interact with each other. Notify + * separately unless they're gonna be duplicates. + */ + if (current->ptrace) do_notify_parent_cldstop(current, true, why); - if (gstop_done && ptrace_reparented(current)) - do_notify_parent_cldstop(current, false, why); + if (gstop_done && (!current->ptrace || ptrace_reparented(current))) + do_notify_parent_cldstop(current, false, why); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); - read_unlock(&tasklist_lock); - cgroup_enter_frozen(); - preempt_enable_no_resched(); - freezable_schedule(); - cgroup_leave_frozen(true); - } else { - /* - * By the time we got the lock, our tracer went away. - * Don't drop the lock yet, another tracer may come. - * - * If @gstop_done, the ptracer went away between group stop - * completion and here. During detach, it would have set - * JOBCTL_STOP_PENDING on us and we'll re-enter - * TASK_STOPPED in do_signal_stop() on return, so notifying - * the real parent of the group stop completion is enough. - */ - if (gstop_done) - do_notify_parent_cldstop(current, false, why); - - /* tasklist protects us from ptrace_freeze_traced() */ - __set_current_state(TASK_RUNNING); - read_code = false; - if (clear_code) - exit_code = 0; - read_unlock(&tasklist_lock); - } + /* + * Don't want to allow preemption here, because + * sys_ptrace() needs this task to be inactive. + * + * XXX: implement read_unlock_no_resched(). + */ + preempt_disable(); + read_unlock(&tasklist_lock); + cgroup_enter_frozen(); + preempt_enable_no_resched(); + freezable_schedule(); + cgroup_leave_frozen(true); /* * We are back. Now reacquire the siglock before touching @@ -2320,14 +2314,13 @@ static int ptrace_stop(int exit_code, int why, int clear_code, * any signal-sending on another CPU that wants to examine it. */ spin_lock_irq(¤t->sighand->siglock); - if (read_code) - exit_code = current->exit_code; + exit_code = current->exit_code; current->last_siginfo = NULL; current->ptrace_message = 0; current->exit_code = 0; /* LISTENING can be set only during STOP traps, clear it */ - current->jobctl &= ~JOBCTL_LISTENING; + current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN); /* * Queued signals ignored us while we were stopped for tracing. @@ -2349,7 +2342,7 @@ static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long mes info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ - return ptrace_stop(exit_code, why, 1, message, &info); + return ptrace_stop(exit_code, why, message, &info); } int ptrace_notify(int exit_code, unsigned long message) @@ -2460,6 +2453,7 @@ static bool do_signal_stop(int signr) if (task_participate_group_stop(current)) notify = CLD_STOPPED; + current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); @@ -2521,7 +2515,7 @@ static void do_jobctl_trap(void) CLD_STOPPED, 0); } else { WARN_ON_ONCE(!signr); - ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL); + ptrace_stop(signr, CLD_STOPPED, 0, NULL); } } @@ -2574,7 +2568,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) * comment in dequeue_signal(). */ current->jobctl |= JOBCTL_STOP_DEQUEUED; - signr = ptrace_stop(signr, CLD_TRAPPED, 0, 0, info); + signr = ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ if (signr == 0) @@ -2601,7 +2595,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) /* If the (new) signal is now blocked, requeue it. */ if (sigismember(¤t->blocked, signr) || fatal_signal_pending(current)) { - send_signal(signr, info, current, type); + send_signal_locked(signr, info, current, type); signr = 0; } @@ -3432,6 +3426,7 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, to->si_addr = ptr_to_compat(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; + to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -3509,6 +3504,7 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, to->si_addr = compat_ptr(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; + to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -4722,6 +4718,7 @@ static inline void siginfo_buildtime_checks(void) CHECK_OFFSET(si_pkey); CHECK_OFFSET(si_perf_data); CHECK_OFFSET(si_perf_type); + CHECK_OFFSET(si_perf_flags); /* sigpoll */ CHECK_OFFSET(si_band); @@ -4793,7 +4790,7 @@ void kdb_send_sig(struct task_struct *t, int sig) "the deadlock.\n"); return; } - ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); + ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", diff --git a/kernel/smp.c b/kernel/smp.c index 01a7c1706a58..e8cdc025a046 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -96,7 +96,7 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); -static void flush_smp_call_function_queue(bool warn_cpu_offline); +static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) { @@ -141,7 +141,7 @@ int smpcfd_dying_cpu(unsigned int cpu) * ensure that the outgoing CPU doesn't go offline with work * still pending. */ - flush_smp_call_function_queue(false); + __flush_smp_call_function_queue(false); irq_work_run(); return 0; } @@ -174,16 +174,18 @@ static int __init csdlock_debug(char *str) if (val) static_branch_enable(&csdlock_debug_enabled); - return 0; + return 1; } -early_param("csdlock_debug", csdlock_debug); +__setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local); -#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ +module_param(csd_lock_timeout, ulong, 0444); + static atomic_t csd_bug_count = ATOMIC_INIT(0); static u64 cfd_seq; @@ -329,6 +331,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); + unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) @@ -341,7 +344,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * ts2 = sched_clock(); ts_delta = ts2 - *ts1; - if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) return false; firsttime = !*bug_id; @@ -367,8 +370,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * if (cpu >= 0) { if (static_branch_unlikely(&csdlock_debug_extended)) csd_lock_print_extended(csd, cpu); - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); + dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); @@ -541,11 +543,11 @@ void generic_smp_call_function_single_interrupt(void) { cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_GOTIPI); - flush_smp_call_function_queue(true); + __flush_smp_call_function_queue(true); } /** - * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks * * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * offline CPU. Skip this check if set to 'false'. @@ -558,7 +560,7 @@ void generic_smp_call_function_single_interrupt(void) * Loop through the call_single_queue and run all the queued callbacks. * Must be called with interrupts disabled. */ -static void flush_smp_call_function_queue(bool warn_cpu_offline) +static void __flush_smp_call_function_queue(bool warn_cpu_offline) { call_single_data_t *csd, *csd_next; struct llist_node *entry, *prev; @@ -579,7 +581,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) /* There shouldn't be any pending callbacks on an offline CPU. */ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && - !warned && !llist_empty(head))) { + !warned && entry != NULL)) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); @@ -681,8 +683,22 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) smp_processor_id(), CFD_SEQ_HDLEND); } -void flush_smp_call_function_from_idle(void) + +/** + * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * from task context (idle, migration thread) + * + * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it + * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by + * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to + * handle queued SMP function calls before scheduling. + * + * The migration thread has to ensure that an eventually pending wakeup has + * been handled before it migrates a task. + */ +void flush_smp_call_function_queue(void) { + unsigned int was_pending; unsigned long flags; if (llist_empty(this_cpu_ptr(&call_single_queue))) @@ -691,9 +707,11 @@ void flush_smp_call_function_from_idle(void) cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_IDLE); local_irq_save(flags); - flush_smp_call_function_queue(true); + /* Get the already pending soft interrupts for RT enabled kernels */ + was_pending = local_softirq_pending(); + __flush_smp_call_function_queue(true); if (local_softirq_pending()) - do_softirq(); + do_softirq_post_smp_call_flush(was_pending); local_irq_restore(flags); } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index f6bc0bc8a2aa..b9f54544e749 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -392,6 +392,13 @@ int cpu_check_up_prepare(int cpu) */ return -EAGAIN; + case CPU_UP_PREPARE: + /* + * Timeout while waiting for the CPU to show up. Allow to try + * again later. + */ + return 0; + default: /* Should not happen. Famous last words. */ diff --git a/kernel/softirq.c b/kernel/softirq.c index fac801815554..c8a6913c067d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -294,6 +294,19 @@ static inline void invoke_softirq(void) wakeup_softirqd(); } +/* + * flush_smp_call_function_queue() can raise a soft interrupt in a function + * call. On RT kernels this is undesired and the only known functionality + * in the block layer which does this is disabled on RT. If soft interrupts + * get raised which haven't been raised before the flush, warn so it can be + * investigated. + */ +void do_softirq_post_smp_call_flush(unsigned int was_pending) +{ + if (WARN_ON_ONCE(was_pending != local_softirq_pending())) + invoke_softirq(); +} + #else /* CONFIG_PREEMPT_RT */ /* @@ -607,7 +620,7 @@ void irq_enter_rcu(void) */ void irq_enter(void) { - rcu_irq_enter(); + ct_irq_enter(); irq_enter_rcu(); } @@ -659,7 +672,7 @@ void irq_exit_rcu(void) void irq_exit(void) { __irq_exit_rcu(); - rcu_irq_exit(); + ct_irq_exit(); /* must be last! */ lockdep_hardirq_exit(); } diff --git a/kernel/stackleak.c b/kernel/stackleak.c index ddb5a7f48d69..c2c33d2202e9 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -70,59 +70,81 @@ late_initcall(stackleak_sysctls_init); #define skip_erasing() false #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ -asmlinkage void noinstr stackleak_erase(void) +static __always_inline void __stackleak_erase(bool on_task_stack) { - /* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */ - unsigned long kstack_ptr = current->lowest_stack; - unsigned long boundary = (unsigned long)end_of_stack(current); - unsigned int poison_count = 0; - const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long); - - if (skip_erasing()) - return; - - /* Check that 'lowest_stack' value is sane */ - if (unlikely(kstack_ptr - boundary >= THREAD_SIZE)) - kstack_ptr = boundary; + const unsigned long task_stack_low = stackleak_task_low_bound(current); + const unsigned long task_stack_high = stackleak_task_high_bound(current); + unsigned long erase_low, erase_high; - /* Search for the poison value in the kernel stack */ - while (kstack_ptr > boundary && poison_count <= depth) { - if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON) - poison_count++; - else - poison_count = 0; - - kstack_ptr -= sizeof(unsigned long); - } - - /* - * One 'long int' at the bottom of the thread stack is reserved and - * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y). - */ - if (kstack_ptr == boundary) - kstack_ptr += sizeof(unsigned long); + erase_low = stackleak_find_top_of_poison(task_stack_low, + current->lowest_stack); #ifdef CONFIG_STACKLEAK_METRICS - current->prev_lowest_stack = kstack_ptr; + current->prev_lowest_stack = erase_low; #endif /* - * Now write the poison value to the kernel stack. Start from - * 'kstack_ptr' and move up till the new 'boundary'. We assume that - * the stack pointer doesn't change when we write poison. + * Write poison to the task's stack between 'erase_low' and + * 'erase_high'. + * + * If we're running on a different stack (e.g. an entry trampoline + * stack) we can erase everything below the pt_regs at the top of the + * task stack. + * + * If we're running on the task stack itself, we must not clobber any + * stack used by this function and its caller. We assume that this + * function has a fixed-size stack frame, and the current stack pointer + * doesn't change while we write poison. */ - if (on_thread_stack()) - boundary = current_stack_pointer; + if (on_task_stack) + erase_high = current_stack_pointer; else - boundary = current_top_of_stack(); + erase_high = task_stack_high; - while (kstack_ptr < boundary) { - *(unsigned long *)kstack_ptr = STACKLEAK_POISON; - kstack_ptr += sizeof(unsigned long); + while (erase_low < erase_high) { + *(unsigned long *)erase_low = STACKLEAK_POISON; + erase_low += sizeof(unsigned long); } /* Reset the 'lowest_stack' value for the next syscall */ - current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64; + current->lowest_stack = task_stack_high; +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can be called from the task stack or an entry stack when the task stack is + * no longer in use. + */ +asmlinkage void noinstr stackleak_erase(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(on_thread_stack()); +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can only be called from the task stack. + */ +asmlinkage void noinstr stackleak_erase_on_task_stack(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(true); +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can only be called from a stack other than the task stack. + */ +asmlinkage void noinstr stackleak_erase_off_task_stack(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(false); } void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) @@ -139,8 +161,7 @@ void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) /* 'lowest_stack' should be aligned on the register width boundary */ sp = ALIGN(sp, sizeof(unsigned long)); if (sp < current->lowest_stack && - sp >= (unsigned long)task_stack_page(current) + - sizeof(unsigned long)) { + sp >= stackleak_task_low_bound(current)) { current->lowest_stack = sp; } } diff --git a/kernel/static_call.c b/kernel/static_call.c index f2b8baea35d2..e9c3e69f3837 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -1,549 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/init.h> #include <linux/static_call.h> -#include <linux/bug.h> -#include <linux/smp.h> -#include <linux/sort.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/cpu.h> -#include <linux/processor.h> -#include <asm/sections.h> - -extern struct static_call_site __start_static_call_sites[], - __stop_static_call_sites[]; -extern struct static_call_tramp_key __start_static_call_tramp_key[], - __stop_static_call_tramp_key[]; - -static bool static_call_initialized; - -/* mutex to protect key modules/sites */ -static DEFINE_MUTEX(static_call_mutex); - -static void static_call_lock(void) -{ - mutex_lock(&static_call_mutex); -} - -static void static_call_unlock(void) -{ - mutex_unlock(&static_call_mutex); -} - -static inline void *static_call_addr(struct static_call_site *site) -{ - return (void *)((long)site->addr + (long)&site->addr); -} - -static inline unsigned long __static_call_key(const struct static_call_site *site) -{ - return (long)site->key + (long)&site->key; -} - -static inline struct static_call_key *static_call_key(const struct static_call_site *site) -{ - return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS); -} - -/* These assume the key is word-aligned. */ -static inline bool static_call_is_init(struct static_call_site *site) -{ - return __static_call_key(site) & STATIC_CALL_SITE_INIT; -} - -static inline bool static_call_is_tail(struct static_call_site *site) -{ - return __static_call_key(site) & STATIC_CALL_SITE_TAIL; -} - -static inline void static_call_set_init(struct static_call_site *site) -{ - site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) - - (long)&site->key; -} - -static int static_call_site_cmp(const void *_a, const void *_b) -{ - const struct static_call_site *a = _a; - const struct static_call_site *b = _b; - const struct static_call_key *key_a = static_call_key(a); - const struct static_call_key *key_b = static_call_key(b); - - if (key_a < key_b) - return -1; - - if (key_a > key_b) - return 1; - - return 0; -} - -static void static_call_site_swap(void *_a, void *_b, int size) -{ - long delta = (unsigned long)_a - (unsigned long)_b; - struct static_call_site *a = _a; - struct static_call_site *b = _b; - struct static_call_site tmp = *a; - - a->addr = b->addr - delta; - a->key = b->key - delta; - - b->addr = tmp.addr + delta; - b->key = tmp.key + delta; -} - -static inline void static_call_sort_entries(struct static_call_site *start, - struct static_call_site *stop) -{ - sort(start, stop - start, sizeof(struct static_call_site), - static_call_site_cmp, static_call_site_swap); -} - -static inline bool static_call_key_has_mods(struct static_call_key *key) -{ - return !(key->type & 1); -} - -static inline struct static_call_mod *static_call_key_next(struct static_call_key *key) -{ - if (!static_call_key_has_mods(key)) - return NULL; - - return key->mods; -} - -static inline struct static_call_site *static_call_key_sites(struct static_call_key *key) -{ - if (static_call_key_has_mods(key)) - return NULL; - - return (struct static_call_site *)(key->type & ~1); -} - -void __static_call_update(struct static_call_key *key, void *tramp, void *func) -{ - struct static_call_site *site, *stop; - struct static_call_mod *site_mod, first; - - cpus_read_lock(); - static_call_lock(); - - if (key->func == func) - goto done; - - key->func = func; - - arch_static_call_transform(NULL, tramp, func, false); - - /* - * If uninitialized, we'll not update the callsites, but they still - * point to the trampoline and we just patched that. - */ - if (WARN_ON_ONCE(!static_call_initialized)) - goto done; - - first = (struct static_call_mod){ - .next = static_call_key_next(key), - .mod = NULL, - .sites = static_call_key_sites(key), - }; - - for (site_mod = &first; site_mod; site_mod = site_mod->next) { - bool init = system_state < SYSTEM_RUNNING; - struct module *mod = site_mod->mod; - - if (!site_mod->sites) { - /* - * This can happen if the static call key is defined in - * a module which doesn't use it. - * - * It also happens in the has_mods case, where the - * 'first' entry has no sites associated with it. - */ - continue; - } - - stop = __stop_static_call_sites; - - if (mod) { -#ifdef CONFIG_MODULES - stop = mod->static_call_sites + - mod->num_static_call_sites; - init = mod->state == MODULE_STATE_COMING; -#endif - } - - for (site = site_mod->sites; - site < stop && static_call_key(site) == key; site++) { - void *site_addr = static_call_addr(site); - - if (!init && static_call_is_init(site)) - continue; - - if (!kernel_text_address((unsigned long)site_addr)) { - /* - * This skips patching built-in __exit, which - * is part of init_section_contains() but is - * not part of kernel_text_address(). - * - * Skipping built-in __exit is fine since it - * will never be executed. - */ - WARN_ONCE(!static_call_is_init(site), - "can't patch static call site at %pS", - site_addr); - continue; - } - - arch_static_call_transform(site_addr, NULL, func, - static_call_is_tail(site)); - } - } - -done: - static_call_unlock(); - cpus_read_unlock(); -} -EXPORT_SYMBOL_GPL(__static_call_update); - -static int __static_call_init(struct module *mod, - struct static_call_site *start, - struct static_call_site *stop) -{ - struct static_call_site *site; - struct static_call_key *key, *prev_key = NULL; - struct static_call_mod *site_mod; - - if (start == stop) - return 0; - - static_call_sort_entries(start, stop); - - for (site = start; site < stop; site++) { - void *site_addr = static_call_addr(site); - - if ((mod && within_module_init((unsigned long)site_addr, mod)) || - (!mod && init_section_contains(site_addr, 1))) - static_call_set_init(site); - - key = static_call_key(site); - if (key != prev_key) { - prev_key = key; - - /* - * For vmlinux (!mod) avoid the allocation by storing - * the sites pointer in the key itself. Also see - * __static_call_update()'s @first. - * - * This allows architectures (eg. x86) to call - * static_call_init() before memory allocation works. - */ - if (!mod) { - key->sites = site; - key->type |= 1; - goto do_transform; - } - - site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); - if (!site_mod) - return -ENOMEM; - - /* - * When the key has a direct sites pointer, extract - * that into an explicit struct static_call_mod, so we - * can have a list of modules. - */ - if (static_call_key_sites(key)) { - site_mod->mod = NULL; - site_mod->next = NULL; - site_mod->sites = static_call_key_sites(key); - - key->mods = site_mod; - - site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); - if (!site_mod) - return -ENOMEM; - } - - site_mod->mod = mod; - site_mod->sites = site; - site_mod->next = static_call_key_next(key); - key->mods = site_mod; - } - -do_transform: - arch_static_call_transform(site_addr, NULL, key->func, - static_call_is_tail(site)); - } - - return 0; -} - -static int addr_conflict(struct static_call_site *site, void *start, void *end) -{ - unsigned long addr = (unsigned long)static_call_addr(site); - - if (addr <= (unsigned long)end && - addr + CALL_INSN_SIZE > (unsigned long)start) - return 1; - - return 0; -} - -static int __static_call_text_reserved(struct static_call_site *iter_start, - struct static_call_site *iter_stop, - void *start, void *end, bool init) -{ - struct static_call_site *iter = iter_start; - - while (iter < iter_stop) { - if (init || !static_call_is_init(iter)) { - if (addr_conflict(iter, start, end)) - return 1; - } - iter++; - } - - return 0; -} - -#ifdef CONFIG_MODULES - -static int __static_call_mod_text_reserved(void *start, void *end) -{ - struct module *mod; - int ret; - - preempt_disable(); - mod = __module_text_address((unsigned long)start); - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - if (!try_module_get(mod)) - mod = NULL; - preempt_enable(); - - if (!mod) - return 0; - - ret = __static_call_text_reserved(mod->static_call_sites, - mod->static_call_sites + mod->num_static_call_sites, - start, end, mod->state == MODULE_STATE_COMING); - - module_put(mod); - - return ret; -} - -static unsigned long tramp_key_lookup(unsigned long addr) -{ - struct static_call_tramp_key *start = __start_static_call_tramp_key; - struct static_call_tramp_key *stop = __stop_static_call_tramp_key; - struct static_call_tramp_key *tramp_key; - - for (tramp_key = start; tramp_key != stop; tramp_key++) { - unsigned long tramp; - - tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp; - if (tramp == addr) - return (long)tramp_key->key + (long)&tramp_key->key; - } - - return 0; -} - -static int static_call_add_module(struct module *mod) -{ - struct static_call_site *start = mod->static_call_sites; - struct static_call_site *stop = start + mod->num_static_call_sites; - struct static_call_site *site; - - for (site = start; site != stop; site++) { - unsigned long s_key = __static_call_key(site); - unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS; - unsigned long key; - - /* - * Is the key is exported, 'addr' points to the key, which - * means modules are allowed to call static_call_update() on - * it. - * - * Otherwise, the key isn't exported, and 'addr' points to the - * trampoline so we need to lookup the key. - * - * We go through this dance to prevent crazy modules from - * abusing sensitive static calls. - */ - if (!kernel_text_address(addr)) - continue; - - key = tramp_key_lookup(addr); - if (!key) { - pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n", - static_call_addr(site)); - return -EINVAL; - } - - key |= s_key & STATIC_CALL_SITE_FLAGS; - site->key = key - (long)&site->key; - } - - return __static_call_init(mod, start, stop); -} - -static void static_call_del_module(struct module *mod) -{ - struct static_call_site *start = mod->static_call_sites; - struct static_call_site *stop = mod->static_call_sites + - mod->num_static_call_sites; - struct static_call_key *key, *prev_key = NULL; - struct static_call_mod *site_mod, **prev; - struct static_call_site *site; - - for (site = start; site < stop; site++) { - key = static_call_key(site); - if (key == prev_key) - continue; - - prev_key = key; - - for (prev = &key->mods, site_mod = key->mods; - site_mod && site_mod->mod != mod; - prev = &site_mod->next, site_mod = site_mod->next) - ; - - if (!site_mod) - continue; - - *prev = site_mod->next; - kfree(site_mod); - } -} - -static int static_call_module_notify(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct module *mod = data; - int ret = 0; - - cpus_read_lock(); - static_call_lock(); - - switch (val) { - case MODULE_STATE_COMING: - ret = static_call_add_module(mod); - if (ret) { - WARN(1, "Failed to allocate memory for static calls"); - static_call_del_module(mod); - } - break; - case MODULE_STATE_GOING: - static_call_del_module(mod); - break; - } - - static_call_unlock(); - cpus_read_unlock(); - - return notifier_from_errno(ret); -} - -static struct notifier_block static_call_module_nb = { - .notifier_call = static_call_module_notify, -}; - -#else - -static inline int __static_call_mod_text_reserved(void *start, void *end) -{ - return 0; -} - -#endif /* CONFIG_MODULES */ - -int static_call_text_reserved(void *start, void *end) -{ - bool init = system_state < SYSTEM_RUNNING; - int ret = __static_call_text_reserved(__start_static_call_sites, - __stop_static_call_sites, start, end, init); - - if (ret) - return ret; - - return __static_call_mod_text_reserved(start, end); -} - -int __init static_call_init(void) -{ - int ret; - - if (static_call_initialized) - return 0; - - cpus_read_lock(); - static_call_lock(); - ret = __static_call_init(NULL, __start_static_call_sites, - __stop_static_call_sites); - static_call_unlock(); - cpus_read_unlock(); - - if (ret) { - pr_err("Failed to allocate memory for static_call!\n"); - BUG(); - } - - static_call_initialized = true; - -#ifdef CONFIG_MODULES - register_module_notifier(&static_call_module_nb); -#endif - return 0; -} -early_initcall(static_call_init); long __static_call_return0(void) { return 0; } EXPORT_SYMBOL_GPL(__static_call_return0); - -#ifdef CONFIG_STATIC_CALL_SELFTEST - -static int func_a(int x) -{ - return x+1; -} - -static int func_b(int x) -{ - return x+2; -} - -DEFINE_STATIC_CALL(sc_selftest, func_a); - -static struct static_call_data { - int (*func)(int); - int val; - int expect; -} static_call_data [] __initdata = { - { NULL, 2, 3 }, - { func_b, 2, 4 }, - { func_a, 2, 3 } -}; - -static int __init test_static_call_init(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) { - struct static_call_data *scd = &static_call_data[i]; - - if (scd->func) - static_call_update(sc_selftest, scd->func); - - WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect); - } - - return 0; -} -early_initcall(test_static_call_init); - -#endif /* CONFIG_STATIC_CALL_SELFTEST */ diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c new file mode 100644 index 000000000000..dc5665b62814 --- /dev/null +++ b/kernel/static_call_inline.c @@ -0,0 +1,543 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/static_call.h> +#include <linux/bug.h> +#include <linux/smp.h> +#include <linux/sort.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/processor.h> +#include <asm/sections.h> + +extern struct static_call_site __start_static_call_sites[], + __stop_static_call_sites[]; +extern struct static_call_tramp_key __start_static_call_tramp_key[], + __stop_static_call_tramp_key[]; + +static bool static_call_initialized; + +/* mutex to protect key modules/sites */ +static DEFINE_MUTEX(static_call_mutex); + +static void static_call_lock(void) +{ + mutex_lock(&static_call_mutex); +} + +static void static_call_unlock(void) +{ + mutex_unlock(&static_call_mutex); +} + +static inline void *static_call_addr(struct static_call_site *site) +{ + return (void *)((long)site->addr + (long)&site->addr); +} + +static inline unsigned long __static_call_key(const struct static_call_site *site) +{ + return (long)site->key + (long)&site->key; +} + +static inline struct static_call_key *static_call_key(const struct static_call_site *site) +{ + return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS); +} + +/* These assume the key is word-aligned. */ +static inline bool static_call_is_init(struct static_call_site *site) +{ + return __static_call_key(site) & STATIC_CALL_SITE_INIT; +} + +static inline bool static_call_is_tail(struct static_call_site *site) +{ + return __static_call_key(site) & STATIC_CALL_SITE_TAIL; +} + +static inline void static_call_set_init(struct static_call_site *site) +{ + site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) - + (long)&site->key; +} + +static int static_call_site_cmp(const void *_a, const void *_b) +{ + const struct static_call_site *a = _a; + const struct static_call_site *b = _b; + const struct static_call_key *key_a = static_call_key(a); + const struct static_call_key *key_b = static_call_key(b); + + if (key_a < key_b) + return -1; + + if (key_a > key_b) + return 1; + + return 0; +} + +static void static_call_site_swap(void *_a, void *_b, int size) +{ + long delta = (unsigned long)_a - (unsigned long)_b; + struct static_call_site *a = _a; + struct static_call_site *b = _b; + struct static_call_site tmp = *a; + + a->addr = b->addr - delta; + a->key = b->key - delta; + + b->addr = tmp.addr + delta; + b->key = tmp.key + delta; +} + +static inline void static_call_sort_entries(struct static_call_site *start, + struct static_call_site *stop) +{ + sort(start, stop - start, sizeof(struct static_call_site), + static_call_site_cmp, static_call_site_swap); +} + +static inline bool static_call_key_has_mods(struct static_call_key *key) +{ + return !(key->type & 1); +} + +static inline struct static_call_mod *static_call_key_next(struct static_call_key *key) +{ + if (!static_call_key_has_mods(key)) + return NULL; + + return key->mods; +} + +static inline struct static_call_site *static_call_key_sites(struct static_call_key *key) +{ + if (static_call_key_has_mods(key)) + return NULL; + + return (struct static_call_site *)(key->type & ~1); +} + +void __static_call_update(struct static_call_key *key, void *tramp, void *func) +{ + struct static_call_site *site, *stop; + struct static_call_mod *site_mod, first; + + cpus_read_lock(); + static_call_lock(); + + if (key->func == func) + goto done; + + key->func = func; + + arch_static_call_transform(NULL, tramp, func, false); + + /* + * If uninitialized, we'll not update the callsites, but they still + * point to the trampoline and we just patched that. + */ + if (WARN_ON_ONCE(!static_call_initialized)) + goto done; + + first = (struct static_call_mod){ + .next = static_call_key_next(key), + .mod = NULL, + .sites = static_call_key_sites(key), + }; + + for (site_mod = &first; site_mod; site_mod = site_mod->next) { + bool init = system_state < SYSTEM_RUNNING; + struct module *mod = site_mod->mod; + + if (!site_mod->sites) { + /* + * This can happen if the static call key is defined in + * a module which doesn't use it. + * + * It also happens in the has_mods case, where the + * 'first' entry has no sites associated with it. + */ + continue; + } + + stop = __stop_static_call_sites; + + if (mod) { +#ifdef CONFIG_MODULES + stop = mod->static_call_sites + + mod->num_static_call_sites; + init = mod->state == MODULE_STATE_COMING; +#endif + } + + for (site = site_mod->sites; + site < stop && static_call_key(site) == key; site++) { + void *site_addr = static_call_addr(site); + + if (!init && static_call_is_init(site)) + continue; + + if (!kernel_text_address((unsigned long)site_addr)) { + /* + * This skips patching built-in __exit, which + * is part of init_section_contains() but is + * not part of kernel_text_address(). + * + * Skipping built-in __exit is fine since it + * will never be executed. + */ + WARN_ONCE(!static_call_is_init(site), + "can't patch static call site at %pS", + site_addr); + continue; + } + + arch_static_call_transform(site_addr, NULL, func, + static_call_is_tail(site)); + } + } + +done: + static_call_unlock(); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(__static_call_update); + +static int __static_call_init(struct module *mod, + struct static_call_site *start, + struct static_call_site *stop) +{ + struct static_call_site *site; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod; + + if (start == stop) + return 0; + + static_call_sort_entries(start, stop); + + for (site = start; site < stop; site++) { + void *site_addr = static_call_addr(site); + + if ((mod && within_module_init((unsigned long)site_addr, mod)) || + (!mod && init_section_contains(site_addr, 1))) + static_call_set_init(site); + + key = static_call_key(site); + if (key != prev_key) { + prev_key = key; + + /* + * For vmlinux (!mod) avoid the allocation by storing + * the sites pointer in the key itself. Also see + * __static_call_update()'s @first. + * + * This allows architectures (eg. x86) to call + * static_call_init() before memory allocation works. + */ + if (!mod) { + key->sites = site; + key->type |= 1; + goto do_transform; + } + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + + /* + * When the key has a direct sites pointer, extract + * that into an explicit struct static_call_mod, so we + * can have a list of modules. + */ + if (static_call_key_sites(key)) { + site_mod->mod = NULL; + site_mod->next = NULL; + site_mod->sites = static_call_key_sites(key); + + key->mods = site_mod; + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + } + + site_mod->mod = mod; + site_mod->sites = site; + site_mod->next = static_call_key_next(key); + key->mods = site_mod; + } + +do_transform: + arch_static_call_transform(site_addr, NULL, key->func, + static_call_is_tail(site)); + } + + return 0; +} + +static int addr_conflict(struct static_call_site *site, void *start, void *end) +{ + unsigned long addr = (unsigned long)static_call_addr(site); + + if (addr <= (unsigned long)end && + addr + CALL_INSN_SIZE > (unsigned long)start) + return 1; + + return 0; +} + +static int __static_call_text_reserved(struct static_call_site *iter_start, + struct static_call_site *iter_stop, + void *start, void *end, bool init) +{ + struct static_call_site *iter = iter_start; + + while (iter < iter_stop) { + if (init || !static_call_is_init(iter)) { + if (addr_conflict(iter, start, end)) + return 1; + } + iter++; + } + + return 0; +} + +#ifdef CONFIG_MODULES + +static int __static_call_mod_text_reserved(void *start, void *end) +{ + struct module *mod; + int ret; + + preempt_disable(); + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + preempt_enable(); + + if (!mod) + return 0; + + ret = __static_call_text_reserved(mod->static_call_sites, + mod->static_call_sites + mod->num_static_call_sites, + start, end, mod->state == MODULE_STATE_COMING); + + module_put(mod); + + return ret; +} + +static unsigned long tramp_key_lookup(unsigned long addr) +{ + struct static_call_tramp_key *start = __start_static_call_tramp_key; + struct static_call_tramp_key *stop = __stop_static_call_tramp_key; + struct static_call_tramp_key *tramp_key; + + for (tramp_key = start; tramp_key != stop; tramp_key++) { + unsigned long tramp; + + tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp; + if (tramp == addr) + return (long)tramp_key->key + (long)&tramp_key->key; + } + + return 0; +} + +static int static_call_add_module(struct module *mod) +{ + struct static_call_site *start = mod->static_call_sites; + struct static_call_site *stop = start + mod->num_static_call_sites; + struct static_call_site *site; + + for (site = start; site != stop; site++) { + unsigned long s_key = __static_call_key(site); + unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS; + unsigned long key; + + /* + * Is the key is exported, 'addr' points to the key, which + * means modules are allowed to call static_call_update() on + * it. + * + * Otherwise, the key isn't exported, and 'addr' points to the + * trampoline so we need to lookup the key. + * + * We go through this dance to prevent crazy modules from + * abusing sensitive static calls. + */ + if (!kernel_text_address(addr)) + continue; + + key = tramp_key_lookup(addr); + if (!key) { + pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n", + static_call_addr(site)); + return -EINVAL; + } + + key |= s_key & STATIC_CALL_SITE_FLAGS; + site->key = key - (long)&site->key; + } + + return __static_call_init(mod, start, stop); +} + +static void static_call_del_module(struct module *mod) +{ + struct static_call_site *start = mod->static_call_sites; + struct static_call_site *stop = mod->static_call_sites + + mod->num_static_call_sites; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod, **prev; + struct static_call_site *site; + + for (site = start; site < stop; site++) { + key = static_call_key(site); + if (key == prev_key) + continue; + + prev_key = key; + + for (prev = &key->mods, site_mod = key->mods; + site_mod && site_mod->mod != mod; + prev = &site_mod->next, site_mod = site_mod->next) + ; + + if (!site_mod) + continue; + + *prev = site_mod->next; + kfree(site_mod); + } +} + +static int static_call_module_notify(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + cpus_read_lock(); + static_call_lock(); + + switch (val) { + case MODULE_STATE_COMING: + ret = static_call_add_module(mod); + if (ret) { + WARN(1, "Failed to allocate memory for static calls"); + static_call_del_module(mod); + } + break; + case MODULE_STATE_GOING: + static_call_del_module(mod); + break; + } + + static_call_unlock(); + cpus_read_unlock(); + + return notifier_from_errno(ret); +} + +static struct notifier_block static_call_module_nb = { + .notifier_call = static_call_module_notify, +}; + +#else + +static inline int __static_call_mod_text_reserved(void *start, void *end) +{ + return 0; +} + +#endif /* CONFIG_MODULES */ + +int static_call_text_reserved(void *start, void *end) +{ + bool init = system_state < SYSTEM_RUNNING; + int ret = __static_call_text_reserved(__start_static_call_sites, + __stop_static_call_sites, start, end, init); + + if (ret) + return ret; + + return __static_call_mod_text_reserved(start, end); +} + +int __init static_call_init(void) +{ + int ret; + + if (static_call_initialized) + return 0; + + cpus_read_lock(); + static_call_lock(); + ret = __static_call_init(NULL, __start_static_call_sites, + __stop_static_call_sites); + static_call_unlock(); + cpus_read_unlock(); + + if (ret) { + pr_err("Failed to allocate memory for static_call!\n"); + BUG(); + } + + static_call_initialized = true; + +#ifdef CONFIG_MODULES + register_module_notifier(&static_call_module_nb); +#endif + return 0; +} +early_initcall(static_call_init); + +#ifdef CONFIG_STATIC_CALL_SELFTEST + +static int func_a(int x) +{ + return x+1; +} + +static int func_b(int x) +{ + return x+2; +} + +DEFINE_STATIC_CALL(sc_selftest, func_a); + +static struct static_call_data { + int (*func)(int); + int val; + int expect; +} static_call_data [] __initdata = { + { NULL, 2, 3 }, + { func_b, 2, 4 }, + { func_a, 2, 3 } +}; + +static int __init test_static_call_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) { + struct static_call_data *scd = &static_call_data[i]; + + if (scd->func) + static_call_update(sc_selftest, scd->func); + + WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect); + } + + return 0; +} +early_initcall(test_static_call_init); + +#endif /* CONFIG_STATIC_CALL_SELFTEST */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index cbc30271ea4d..cedb17ba158a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -535,8 +535,6 @@ void stop_machine_park(int cpu) kthread_park(stopper->thread); } -extern void sched_set_stop_task(int cpu, struct task_struct *stop); - static void cpu_stop_create(unsigned int cpu) { sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); @@ -633,6 +631,27 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) } EXPORT_SYMBOL_GPL(stop_machine); +#ifdef CONFIG_SCHED_SMT +int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + + struct multi_stop_data msdata = { + .fn = fn, + .data = data, + .num_threads = cpumask_weight(smt_mask), + .active_cpus = smt_mask, + }; + + lockdep_assert_cpus_held(); + + /* Set the initial state and stop all online cpus. */ + set_state(&msdata, MULTI_STOP_PREPARE); + return stop_cpus(smt_mask, multi_cpu_stop, &msdata); +} +EXPORT_SYMBOL_GPL(stop_core_cpuslocked); +#endif + /** * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU * @fn: the function to run diff --git a/kernel/sys.c b/kernel/sys.c index 9633229376a7..8a6432465dc5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -117,6 +117,12 @@ #ifndef SVE_GET_VL # define SVE_GET_VL() (-EINVAL) #endif +#ifndef SME_SET_VL +# define SME_SET_VL(a) (-EINVAL) +#endif +#ifndef SME_GET_VL +# define SME_GET_VL() (-EINVAL) +#endif #ifndef PAC_RESET_KEYS # define PAC_RESET_KEYS(a, b) (-EINVAL) #endif @@ -2541,6 +2547,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SVE_GET_VL: error = SVE_GET_VL(); break; + case PR_SME_SET_VL: + error = SME_SET_VL(arg2); + break; + case PR_SME_GET_VL: + error = SME_GET_VL(); + break; case PR_GET_SPECULATION_CTRL: if (arg3 || arg4 || arg5) return -EINVAL; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a492f159624f..860b2dcf3ac4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -277,6 +277,7 @@ COND_SYSCALL(landlock_restrict_self); /* mm/fadvise.c */ COND_SYSCALL(fadvise64_64); +COND_SYSCALL_COMPAT(fadvise64_64); /* mm/, CONFIG_MMU only */ COND_SYSCALL(swapon); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 830aaf8ca08e..205d605cacc5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,13 +61,9 @@ #include <linux/capability.h> #include <linux/binfmts.h> #include <linux/sched/sysctl.h> -#include <linux/kexec.h> -#include <linux/bpf.h> #include <linux/mount.h> #include <linux/userfaultfd_k.h> -#include <linux/latencytop.h> #include <linux/pid.h> -#include <linux/delayacct.h> #include "../lib/kstrtox.h" @@ -82,15 +78,9 @@ #ifdef CONFIG_SPARC #include <asm/setup.h> #endif -#ifdef CONFIG_BSD_PROCESS_ACCT -#include <linux/acct.h> -#endif #ifdef CONFIG_RT_MUTEXES #include <linux/rtmutex.h> #endif -#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) -#include <linux/lockdep.h> -#endif #if defined(CONFIG_SYSCTL) @@ -100,8 +90,6 @@ static const int six_hundred_forty_kb = 640 * 1024; #endif -/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -148,66 +136,6 @@ static const int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ -#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) -static int bpf_stats_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct static_key *key = (struct static_key *)table->data; - static int saved_val; - int val, ret; - struct ctl_table tmp = { - .data = &val, - .maxlen = sizeof(val), - .mode = table->mode, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&bpf_stats_enabled_mutex); - val = saved_val; - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret && val != saved_val) { - if (val) - static_key_slow_inc(key); - else - static_key_slow_dec(key); - saved_val = val; - } - mutex_unlock(&bpf_stats_enabled_mutex); - return ret; -} - -void __weak unpriv_ebpf_notify(int new_state) -{ -} - -static int bpf_unpriv_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret, unpriv_enable = *(int *)table->data; - bool locked_state = unpriv_enable == 1; - struct ctl_table tmp = *table; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - tmp.data = &unpriv_enable; - ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); - if (write && !ret) { - if (locked_state && unpriv_enable != 1) - return -EPERM; - *(int *)table->data = unpriv_enable; - } - - unpriv_ebpf_notify(unpriv_enable); - - return ret; -} -#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ - /* * /proc/sys support */ @@ -518,14 +446,14 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, if (*negp) { if (*lvalp > (unsigned long) INT_MAX + 1) return -EINVAL; - *valp = -*lvalp; + WRITE_ONCE(*valp, -*lvalp); } else { if (*lvalp > (unsigned long) INT_MAX) return -EINVAL; - *valp = *lvalp; + WRITE_ONCE(*valp, *lvalp); } } else { - int val = *valp; + int val = READ_ONCE(*valp); if (val < 0) { *negp = true; *lvalp = -(unsigned long)val; @@ -544,9 +472,9 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, if (write) { if (*lvalp > UINT_MAX) return -EINVAL; - *valp = *lvalp; + WRITE_ONCE(*valp, *lvalp); } else { - unsigned int val = *valp; + unsigned int val = READ_ONCE(*valp); *lvalp = (unsigned long)val; } return 0; @@ -564,12 +492,12 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, int *i, vleft, first = 1, err = 0; size_t left; char *p; - + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - + i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; @@ -801,7 +729,7 @@ int proc_dobool(struct ctl_table *table, int write, void *buffer, * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. + * values from/to the user buffer, treated as an ASCII string. * * Returns 0 on success. */ @@ -929,7 +857,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, if ((param->min && *param->min > tmp) || (param->max && *param->max < tmp)) return -EINVAL; - *valp = tmp; + WRITE_ONCE(*valp, tmp); } return 0; @@ -995,7 +923,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, (param->max && *param->max < tmp)) return -ERANGE; - *valp = tmp; + WRITE_ONCE(*valp, tmp); } return 0; @@ -1079,13 +1007,13 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write, tmp.maxlen = sizeof(val); tmp.data = &val; - val = *data; + val = READ_ONCE(*data); res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos, do_proc_douintvec_minmax_conv, ¶m); if (res) return res; if (write) - *data = val; + WRITE_ONCE(*data, val); return 0; } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); @@ -1162,9 +1090,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, err = -EINVAL; break; } - *i = val; + WRITE_ONCE(*i, val); } else { - val = convdiv * (*i) / convmul; + val = convdiv * READ_ONCE(*i) / convmul; if (!first) proc_put_char(&buffer, &left, '\t'); proc_put_long(&buffer, &left, val, false); @@ -1245,9 +1173,12 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, if (write) { if (*lvalp > INT_MAX / HZ) return 1; - *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); + if (*negp) + WRITE_ONCE(*valp, -*lvalp * HZ); + else + WRITE_ONCE(*valp, *lvalp * HZ); } else { - int val = *valp; + int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; @@ -1293,9 +1224,9 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, if (jif > INT_MAX) return 1; - *valp = (int)jif; + WRITE_ONCE(*valp, (int)jif); } else { - int val = *valp; + int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; @@ -1309,6 +1240,30 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, return 0; } +static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, int write, void *data) +{ + int tmp, ret; + struct do_proc_dointvec_minmax_conv_param *param = data; + /* + * If writing, first do so via a temporary local int so we can + * bounds-check it before touching *valp. + */ + int *ip = write ? &tmp : valp; + + ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data); + if (ret) + return ret; + + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) + return -EINVAL; + *valp = tmp; + } + return 0; +} + /** * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table @@ -1318,7 +1273,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. + * values from/to the user buffer, treated as an ASCII string. * The values read are assumed to be in seconds, and are converted into * jiffies. * @@ -1331,6 +1286,17 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, do_proc_dointvec_jiffies_conv,NULL); } +int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_ms_jiffies_minmax_conv, ¶m); +} + /** * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table @@ -1340,8 +1306,8 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, * @ppos: pointer to the file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/USER_HZ seconds, and + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and * are converted into jiffies. * * Returns 0 on success. @@ -1349,8 +1315,8 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_userhz_jiffies_conv,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_userhz_jiffies_conv, NULL); } /** @@ -1363,8 +1329,8 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * @ppos: the current position in the file * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. - * The values read are assumed to be in 1/1000 seconds, and + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and * are converted into jiffies. * * Returns 0 on success. @@ -1595,6 +1561,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -1659,35 +1631,6 @@ int proc_do_static_key(struct ctl_table *table, int write, } static struct ctl_table kern_table[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SCHEDSTATS - { - .procname = "sched_schedstats", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_schedstats, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SCHEDSTATS */ -#ifdef CONFIG_TASK_DELAY_ACCT - { - .procname = "task_delayacct", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_delayacct, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_TASK_DELAY_ACCT */ #ifdef CONFIG_NUMA_BALANCING { .procname = "numa_balancing", @@ -1700,103 +1643,6 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_NUMA_BALANCING */ { - .procname = "sched_rt_period_us", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_rt_runtime_us", - .data = &sysctl_sched_rt_runtime, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rt_handler, - }, - { - .procname = "sched_deadline_period_max_us", - .data = &sysctl_sched_dl_period_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_deadline_period_min_us", - .data = &sysctl_sched_dl_period_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sched_rr_timeslice_ms", - .data = &sysctl_sched_rr_timeslice, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sched_rr_handler, - }, -#ifdef CONFIG_UCLAMP_TASK - { - .procname = "sched_util_clamp_min", - .data = &sysctl_sched_uclamp_util_min, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, - { - .procname = "sched_util_clamp_max", - .data = &sysctl_sched_uclamp_util_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, - { - .procname = "sched_util_clamp_min_rt_default", - .data = &sysctl_sched_uclamp_util_min_rt_default, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_sched_uclamp_handler, - }, -#endif -#ifdef CONFIG_CFS_BANDWIDTH - { - .procname = "sched_cfs_bandwidth_slice_us", - .data = &sysctl_sched_cfs_bandwidth_slice, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, -#endif -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - { - .procname = "sched_energy_aware", - .data = &sysctl_sched_energy_aware, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_energy_aware_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", - .data = &prove_locking, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_LOCK_STAT - { - .procname = "lock_stat", - .data = &lock_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { .procname = "panic", .data = &panic_timeout, .maxlen = sizeof(int), @@ -1820,24 +1666,6 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif -#ifdef CONFIG_LATENCYTOP - { - .procname = "latencytop", - .data = &latencytop_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_latencytop, - }, -#endif -#ifdef CONFIG_BLK_DEV_INITRD - { - .procname = "real-root-dev", - .data = &real_root_dev, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "print-fatal-signals", .data = &print_fatal_signals, @@ -1895,22 +1723,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "ctrl-alt-del", - .data = &C_A_D, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_FUNCTION_TRACER - { - .procname = "ftrace_enabled", - .data = &ftrace_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = ftrace_enable_sysctl, - }, -#endif #ifdef CONFIG_STACK_TRACER { .procname = "stack_tracer_enabled", @@ -1943,18 +1755,6 @@ static struct ctl_table kern_table[] = { .proc_handler = tracepoint_printk_sysctl, }, #endif -#ifdef CONFIG_KEXEC_CORE - { - .procname = "kexec_load_disabled", - .data = &kexec_load_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_MODULES { .procname = "modprobe", @@ -1983,15 +1783,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dostring, }, #endif -#ifdef CONFIG_BSD_PROCESS_ACCT - { - .procname = "acct", - .data = &acct_parm, - .maxlen = 3*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_MAGIC_SYSRQ { .procname = "sysrq", @@ -2049,17 +1840,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SMP - { - .procname = "oops_all_cpu_backtrace", - .data = &sysctl_oops_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ { .procname = "pid_max", .data = &pid_max, @@ -2208,13 +1988,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "poweroff_cmd", - .data = &poweroff_cmd, - .maxlen = POWEROFF_CMD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, #ifdef CONFIG_KEYS { .procname = "keys", @@ -2288,36 +2061,7 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = timer_migration_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif -#ifdef CONFIG_BPF_SYSCALL - { - .procname = "unprivileged_bpf_disabled", - .data = &sysctl_unprivileged_bpf_disabled, - .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), - .mode = 0644, - .proc_handler = bpf_unpriv_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "bpf_stats_enabled", - .data = &bpf_stats_enabled_key.key, - .maxlen = sizeof(bpf_stats_enabled_key), - .mode = 0644, - .proc_handler = bpf_stats_handler, - }, -#endif -#if defined(CONFIG_TREE_RCU) +#ifdef CONFIG_TREE_RCU { .procname = "panic_on_rcu_stall", .data = &sysctl_panic_on_rcu_stall, @@ -2327,8 +2071,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#endif -#if defined(CONFIG_TREE_RCU) { .procname = "max_rcu_stall_to_panic", .data = &sysctl_max_rcu_stall_to_panic, @@ -2353,29 +2095,6 @@ static struct ctl_table vm_table[] = { .extra2 = SYSCTL_TWO, }, { - .procname = "panic_on_oom", - .data = &sysctl_panic_on_oom, - .maxlen = sizeof(sysctl_panic_on_oom), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "oom_kill_allocating_task", - .data = &sysctl_oom_kill_allocating_task, - .maxlen = sizeof(sysctl_oom_kill_allocating_task), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "oom_dump_tasks", - .data = &sysctl_oom_dump_tasks, - .maxlen = sizeof(sysctl_oom_dump_tasks), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), @@ -2398,55 +2117,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, { - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = dirty_background_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_background_bytes", - .data = &dirty_background_bytes, - .maxlen = sizeof(dirty_background_bytes), - .mode = 0644, - .proc_handler = dirty_background_bytes_handler, - .extra1 = SYSCTL_LONG_ONE, - }, - { - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = dirty_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_bytes", - .data = &vm_dirty_bytes, - .maxlen = sizeof(vm_dirty_bytes), - .mode = 0644, - .proc_handler = dirty_bytes_handler, - .extra1 = (void *)&dirty_bytes_min, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, .maxlen = sizeof(dirtytime_expire_interval), @@ -2463,6 +2133,17 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO_HUNDRED, }, +#ifdef CONFIG_NUMA + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif #ifdef CONFIG_HUGETLB_PAGE { .procname = "nr_hugepages", @@ -2479,15 +2160,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif { .procname = "hugetlb_shm_group", @@ -2618,13 +2290,6 @@ static struct ctl_table vm_table[] = { }, #endif { - .procname = "laptop_mode", - .data = &laptop_mode, - .maxlen = sizeof(laptop_mode), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, .maxlen = sizeof(sysctl_vfs_cache_pressure), @@ -2721,17 +2386,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif -#ifdef CONFIG_HIGHMEM - { - .procname = "highmem_is_dirtyable", - .data = &vm_highmem_is_dirtyable, - .maxlen = sizeof(vm_highmem_is_dirtyable), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_MEMORY_FAILURE { .procname = "memory_failure_early_kill", diff --git a/kernel/task_work.c b/kernel/task_work.c index c59e1a49bc40..dff75bcde151 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify - * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the - * it will interrupt the targeted task and run the task_work. @TWA_RESUME - * work is run only when the task exits the kernel and returns to user mode, - * or before entering guest mode. Fails if the @task is exiting/exited and thus - * it can't process this @work. Otherwise @work->func() will be called when the - * @task goes through one of the aforementioned transitions, or exits. + * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. + * + * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted + * task and run the task_work, regardless of whether the task is currently + * running in the kernel or userspace. + * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a + * reschedule IPI to force the targeted task to reschedule and run task_work. + * This can be advantageous if there's no strict requirement that the + * task_work be run as soon as possible, just whenever the task enters the + * kernel anyway. + * @TWA_RESUME work is run only when the task exits the kernel and returns to + * user mode, or before entering guest mode. + * + * Fails if the @task is exiting/exited and thus it can't process this @work. + * Otherwise @work->func() will be called when the @task goes through one of + * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism @@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL: set_notify_signal(task); break; + case TWA_SIGNAL_NO_IPI: + __set_notify_signal(task); + break; default: WARN_ON_ONCE(1); break; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index bcac5a9043aa..8ce3fa0c19e2 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/taskstats_kern.h> #include <linux/tsacct_kern.h> +#include <linux/acct.h> #include <linux/delayacct.h> #include <linux/cpumask.h> #include <linux/percpu.h> @@ -153,6 +154,23 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } +static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk) +{ + /* No idea if I'm allowed to access that here, now. */ + struct file *exe_file = get_task_exe_file(tsk); + + if (exe_file) { + /* Following cp_new_stat64() in stat.c . */ + stats->ac_exe_dev = + huge_encode_dev(exe_file->f_inode->i_sb->s_dev); + stats->ac_exe_inode = exe_file->f_inode->i_ino; + fput(exe_file); + } else { + stats->ac_exe_dev = 0; + stats->ac_exe_inode = 0; + } +} + static void fill_stats(struct user_namespace *user_ns, struct pid_namespace *pid_ns, struct task_struct *tsk, struct taskstats *stats) @@ -175,6 +193,9 @@ static void fill_stats(struct user_namespace *user_ns, /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); + + /* add executable info */ + exe_add_tsk(stats, tsk); } static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) @@ -620,6 +641,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) goto err; fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); + if (group_dead) + stats->ac_flag |= AGROUP; /* * Doesn't matter if tsk is the leader or the last group member leaving @@ -665,6 +688,8 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .resv_start_op = CGROUPSTATS_CMD_GET + 1, + .netnsok = true, }; /* Needed early in initialization */ diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 27b7868b5c30..a41753be1a2b 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -73,6 +73,15 @@ config TIME_KUNIT_TEST If unsure, say N. +config CONTEXT_TRACKING + bool + +config CONTEXT_TRACKING_IDLE + bool + select CONTEXT_TRACKING + help + Tracks idle state on behalf of RCU. + if GENERIC_CLOCKEVENTS menu "Timers subsystem" @@ -111,7 +120,7 @@ config NO_HZ_FULL # NO_HZ_COMMON dependency # We need at least one periodic CPU for timekeeping depends on SMP - depends on HAVE_CONTEXT_TRACKING + depends on HAVE_CONTEXT_TRACKING_USER # VIRT_CPU_ACCOUNTING_GEN dependency depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select NO_HZ_COMMON @@ -137,31 +146,37 @@ config NO_HZ_FULL endchoice -config CONTEXT_TRACKING - bool +config CONTEXT_TRACKING_USER + bool + depends on HAVE_CONTEXT_TRACKING_USER + select CONTEXT_TRACKING + help + Track transitions between kernel and user on behalf of RCU and + tickless cputime accounting. The former case relies on context + tracking to enter/exit RCU extended quiescent states. -config CONTEXT_TRACKING_FORCE - bool "Force context tracking" - depends on CONTEXT_TRACKING +config CONTEXT_TRACKING_USER_FORCE + bool "Force user context tracking" + depends on CONTEXT_TRACKING_USER default y if !NO_HZ_FULL help The major pre-requirement for full dynticks to work is to - support the context tracking subsystem. But there are also + support the user context tracking subsystem. But there are also other dependencies to provide in order to make the full dynticks working. This option stands for testing when an arch implements the - context tracking backend but doesn't yet fulfill all the + user context tracking backend but doesn't yet fulfill all the requirements to make the full dynticks feature working. Without the full dynticks, there is no way to test the support - for context tracking and the subsystems that rely on it: RCU + for user context tracking and the subsystems that rely on it: RCU userspace extended quiescent state and tickless cputime accounting. This option copes with the absence of the full - dynticks subsystem by forcing the context tracking on all + dynticks subsystem by forcing the user context tracking on all CPUs in the system. Say Y only if you're working on the development of an - architecture backend for the context tracking. + architecture backend for the user context tracking. Say N otherwise, this option brings an overhead that you don't want in production. diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 003ccf338d20..5d85014d59b5 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -690,7 +690,7 @@ static ssize_t unbind_device_store(struct device *dev, { char name[CS_NAME_LEN]; ssize_t ret = sysfs_get_uname(buf, name, count); - struct clock_event_device *ce; + struct clock_event_device *ce = NULL, *iter; if (ret < 0) return ret; @@ -698,9 +698,10 @@ static ssize_t unbind_device_store(struct device *dev, ret = -ENODEV; mutex_lock(&clockevents_mutex); raw_spin_lock_irq(&clockevents_lock); - list_for_each_entry(ce, &clockevent_devices, list) { - if (!strcmp(ce->name, name)) { - ret = __clockevents_try_unbind(ce, dev->id); + list_for_each_entry(iter, &clockevent_devices, list) { + if (!strcmp(iter->name, name)) { + ret = __clockevents_try_unbind(iter, dev->id); + ce = iter; break; } } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 95d7ca35bdf2..cee5da1e54c4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -343,7 +343,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cpus_read_lock(); preempt_disable(); clocksource_verify_choose_cpus(); - if (cpumask_weight(&cpus_chosen) == 0) { + if (cpumask_empty(&cpus_chosen)) { preempt_enable(); cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0ea8702eb516..23af5eca11b1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2311,6 +2311,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return !t.task ? 0 : -EINTR; } +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0a97193984db..cb925e8ef9a8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -870,7 +870,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) { if (tsk->dl.dl_overrun) { tsk->dl.dl_overrun = 0; - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + send_signal_locked(SIGXCPU, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); } } @@ -884,7 +884,7 @@ static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) rt ? "RT" : "CPU", hard ? "hard" : "soft", current->comm, task_pid_nr(current)); } - __group_send_sig_info(signo, SEND_SIG_PRIV, current); + send_signal_locked(signo, SEND_SIG_PRIV, current, PIDTYPE_TGID); return true; } @@ -958,7 +958,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, trace_itimer_expire(signo == SIGPROF ? ITIMER_PROF : ITIMER_VIRTUAL, task_tgid(tsk), cur_time); - __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + send_signal_locked(signo, SEND_SIG_PRIV, tsk, PIDTYPE_TGID); } if (it->expires && it->expires < *expires) diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index fcb3b21d8bdc..90ea5f373e50 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -70,7 +70,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, return do_sys_settimeofday64(&new_tp, NULL); } -int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) +static int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) { switch (which_clock) { case CLOCK_REALTIME: @@ -90,6 +90,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) return 0; } + SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1cd10b102c51..5dead89308b7 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1051,15 +1051,24 @@ retry_delete: } /* - * This is called by do_exit or de_thread, only when there are no more - * references to the shared signal_struct. + * This is called by do_exit or de_thread, only when nobody else can + * modify the signal->posix_timers list. Yet we need sighand->siglock + * to prevent the race with /proc/pid/timers. */ -void exit_itimers(struct signal_struct *sig) +void exit_itimers(struct task_struct *tsk) { + struct list_head timers; struct k_itimer *tmr; - while (!list_empty(&sig->posix_timers)) { - tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); + if (list_empty(&tsk->signal->posix_timers)) + return; + + spin_lock_irq(&tsk->sighand->siglock); + list_replace_init(&tsk->signal->posix_timers, &timers); + spin_unlock_irq(&tsk->sighand->siglock); + + while (!list_empty(&timers)) { + tmr = list_first_entry(&timers, struct k_itimer, list); itimer_delete(tmr); } } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index b1b9b12899f5..8464c5acc913 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -8,6 +8,7 @@ #include <linux/jiffies.h> #include <linux/ktime.h> #include <linux/kernel.h> +#include <linux/math.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/sched/clock.h> @@ -199,15 +200,13 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) r = rate; if (r >= 4000000) { - r /= 1000000; + r = DIV_ROUND_CLOSEST(r, 1000000); r_unit = 'M'; + } else if (r >= 4000) { + r = DIV_ROUND_CLOSEST(r, 1000); + r_unit = 'k'; } else { - if (r >= 1000) { - r /= 1000; - r_unit = 'k'; - } else { - r_unit = ' '; - } + r_unit = ' '; } /* Calculate the ns resolution of this counter */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2d76c91b85de..b0e3c9205946 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -188,7 +188,7 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL - WARN_ON(tick_nohz_full_running); + WARN_ON_ONCE(tick_nohz_full_running); #endif tick_do_timer_cpu = cpu; } @@ -526,7 +526,6 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } -EXPORT_SYMBOL_GPL(tick_nohz_full_setup); static int tick_nohz_cpu_down(unsigned int cpu) { @@ -571,7 +570,7 @@ void __init tick_nohz_init(void) } for_each_cpu(cpu, tick_nohz_full_mask) - context_tracking_cpu_set(cpu); + ct_cpu_track_user(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, @@ -928,6 +927,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); + else + tick_program_event(KTIME_MAX, 1); return; } @@ -1364,9 +1365,15 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); - /* No need to reprogram if we are running tickless */ - if (unlikely(ts->tick_stopped)) + if (unlikely(ts->tick_stopped)) { + /* + * The clockevent device is not reprogrammed, so change the + * clock event device to ONESHOT_STOPPED to avoid spurious + * interrupts on devices which might not be truly one shot. + */ + tick_program_event(KTIME_MAX, 1); return; + } hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); @@ -1538,7 +1545,7 @@ void tick_cancel_sched_timer(int cpu) } #endif -/** +/* * Async notification about clocksource changes */ void tick_clock_notify(void) @@ -1559,7 +1566,7 @@ void tick_oneshot_notify(void) set_bit(0, &ts->check_clocks); } -/** +/* * Check, if a change happened, which makes oneshot possible. * * Called cyclic from the hrtimer softirq (driven by the timer diff --git a/kernel/time/time.c b/kernel/time/time.c index 29923b20e0e4..526257b3727c 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,7 +449,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) +struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); struct __kernel_old_timeval tv; @@ -503,7 +503,7 @@ EXPORT_SYMBOL(set_normalized_timespec64); * * Returns the timespec64 representation of the nsec parameter. */ -struct timespec64 ns_to_timespec64(const s64 nsec) +struct timespec64 ns_to_timespec64(s64 nsec) { struct timespec64 ts = { 0, 0 }; s32 rem; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dcdcb85121e4..f72b9f1de178 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -17,11 +17,13 @@ #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> +#include <linux/timex.h> #include <linux/tick.h> #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> #include <linux/compiler.h> #include <linux/audit.h> +#include <linux/random.h> #include "tick-internal.h" #include "ntp_internal.h" @@ -429,6 +431,14 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, memcpy(base + 1, base, sizeof(*base)); } +static __always_inline u64 fast_tk_get_delta_ns(struct tk_read_base *tkr) +{ + u64 delta, cycles = tk_clock_read(tkr); + + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); +} + static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; @@ -439,12 +449,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); - - now += timekeeping_delta_to_ns(tkr, - clocksource_delta( - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); + now += fast_tk_get_delta_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; @@ -482,7 +487,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) * of the following timestamps. Callers need to be aware of that and * deal with it. */ -u64 ktime_get_mono_fast_ns(void) +u64 notrace ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } @@ -494,7 +499,7 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); * Contrary to ktime_get_mono_fast_ns() this is always correct because the * conversion factor is not affected by NTP/PTP correction. */ -u64 ktime_get_raw_fast_ns(void) +u64 notrace ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); } @@ -528,10 +533,27 @@ u64 notrace ktime_get_boot_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; - return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); + return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot))); } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); +/** + * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock. + * + * The same limitations as described for ktime_get_boot_fast_ns() apply. The + * mono time and the TAI offset are not read atomically which may yield wrong + * readouts. However, an update of the TAI offset is an rare event e.g., caused + * by settime or adjtimex with an offset. The user of this function has to deal + * with the possibility of wrong timestamps in post processing. + */ +u64 notrace ktime_get_tai_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai))); +} +EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); + static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) { struct tk_read_base *tkr; @@ -543,10 +565,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) tkr = tkf->base + (seq & 0x01); basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); - - delta = timekeeping_delta_to_ns(tkr, - clocksource_delta(tk_clock_read(tkr), - tkr->cycle_last, tkr->mask)); + delta = fast_tk_get_delta_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); if (mono) @@ -1325,8 +1344,10 @@ out: /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); - if (!ret) + if (!ret) { audit_tk_injoffset(ts_delta); + add_device_randomness(ts, sizeof(*ts)); + } return ret; } @@ -2380,6 +2401,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) return 0; } +/** + * random_get_entropy_fallback - Returns the raw clock source value, + * used by random.c for platforms with no valid random_get_entropy(). + */ +unsigned long random_get_entropy_fallback(void) +{ + struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; + struct clocksource *clock = READ_ONCE(tkr->clock); + + if (unlikely(timekeeping_suspended || !clock)) + return 0; + return clock->read(clock); +} +EXPORT_SYMBOL_GPL(random_get_entropy_fallback); /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function @@ -2398,6 +2433,7 @@ int do_adjtimex(struct __kernel_timex *txc) ret = timekeeping_validate_timex(txc); if (ret) return ret; + add_device_randomness(txc, sizeof(*txc)); if (txc->modes & ADJ_SETOFFSET) { struct timespec64 delta; @@ -2415,6 +2451,7 @@ int do_adjtimex(struct __kernel_timex *txc) audit_ntp_init(&ad); ktime_get_real_ts64(&ts); + add_device_randomness(&ts, sizeof(ts)); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 85f1021ad459..717fcb9fb14a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -44,6 +44,7 @@ #include <linux/slab.h> #include <linux/compat.h> #include <linux/random.h> +#include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -223,7 +224,7 @@ static void timer_update_keys(struct work_struct *work); static DECLARE_WORK(timer_update_work, timer_update_keys); #ifdef CONFIG_SMP -unsigned int sysctl_timer_migration = 1; +static unsigned int sysctl_timer_migration = 1; DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); @@ -234,7 +235,42 @@ static void timers_update_migration(void) else static_branch_disable(&timers_migration_enabled); } -#else + +#ifdef CONFIG_SYSCTL +static int timer_migration_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&timer_keys_mutex); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(); + mutex_unlock(&timer_keys_mutex); + return ret; +} + +static struct ctl_table timer_sysctl[] = { + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init timer_sysctl_init(void) +{ + register_sysctl("kernel", timer_sysctl); + return 0; +} +device_initcall(timer_sysctl_init); +#endif /* CONFIG_SYSCTL */ +#else /* CONFIG_SMP */ static inline void timers_update_migration(void) { } #endif /* !CONFIG_SMP */ @@ -251,19 +287,6 @@ void timers_update_nohz(void) schedule_work(&timer_update_work); } -int timer_migration_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int ret; - - mutex_lock(&timer_keys_mutex); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!ret && write) - timers_update_migration(); - mutex_unlock(&timer_keys_mutex); - return ret; -} - static inline bool is_timers_nohz_active(void) { return static_branch_unlikely(&timers_nohz_active); @@ -502,7 +525,7 @@ static inline unsigned calc_index(unsigned long expires, unsigned lvl, * * Round up with level granularity to prevent this. */ - expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); + expires = (expires >> LVL_SHIFT(lvl)) + 1; *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } @@ -615,9 +638,39 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer static const struct debug_obj_descr timer_debug_descr; +struct timer_hint { + void (*function)(struct timer_list *t); + long offset; +}; + +#define TIMER_HINT(fn, container, timr, hintfn) \ + { \ + .function = fn, \ + .offset = offsetof(container, hintfn) - \ + offsetof(container, timr) \ + } + +static const struct timer_hint timer_hints[] = { + TIMER_HINT(delayed_work_timer_fn, + struct delayed_work, timer, work.func), + TIMER_HINT(kthread_delayed_work_timer_fn, + struct kthread_delayed_work, timer, work.func), +}; + static void *timer_debug_hint(void *addr) { - return ((struct timer_list *) addr)->function; + struct timer_list *timer = addr; + int i; + + for (i = 0; i < ARRAY_SIZE(timer_hints); i++) { + if (timer_hints[i].function == timer->function) { + void (**fn)(void) = addr + timer_hints[i].offset; + + return *fn; + } + } + + return timer->function; } static bool timer_is_static_object(void *addr) @@ -1722,11 +1775,14 @@ static inline void __run_timers(struct timer_base *base) time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); /* - * The only possible reason for not finding any expired - * timer at this clk is that all matching timers have been - * dequeued. + * The two possible reasons for not finding any expired + * timer at this clk are that all matching timers have been + * dequeued or no timer has been queued since + * base::next_expiry was set to base::clk + + * NEXT_TIMER_MAX_DELTA. */ - WARN_ON_ONCE(!levels && !base->next_expiry_recalc); + WARN_ON_ONCE(!levels && !base->next_expiry_recalc + && base->timers_pending); base->clk++; base->next_expiry = __next_timer_interrupt(base); @@ -1777,8 +1833,6 @@ void update_process_times(int user_tick) { struct task_struct *p = current; - PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); - /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); @@ -1950,6 +2004,7 @@ int timers_prepare_cpu(unsigned int cpu) base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2c43e327a619..e9e95c790b8e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -51,6 +51,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS This allows for use of regs_get_kernel_argument() and kernel_stack_pointer(). +config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE + bool + help + If the architecture generates __patchable_function_entries sections + but does not want them included in the ftrace locations. + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -144,6 +150,7 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK + select TASKS_RCU if PREEMPTION config GENERIC_TRACER bool @@ -193,7 +200,8 @@ config FUNCTION_TRACER sequence is then dynamically patched into a tracer call when tracing is enabled by the administrator. If it's runtime disabled (the bootup default), then the overhead of the instructions is very - small and not measurable even in micro-benchmarks. + small and not measurable even in micro-benchmarks (at least on + x86, but may have impact on other architectures). config FUNCTION_GRAPH_TRACER bool "Kernel Function Graph Tracer" @@ -728,6 +736,7 @@ config FTRACE_MCOUNT_USE_OBJTOOL depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC depends on FTRACE_MCOUNT_RECORD + select OBJTOOL config FTRACE_MCOUNT_USE_RECORDMCOUNT def_bool y @@ -1103,4 +1112,6 @@ config HIST_TRIGGERS_DEBUG If unsure, say N. +source "kernel/trace/rv/Kconfig" + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d77cd8032213..c6651e16b557 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,6 +31,10 @@ ifdef CONFIG_GCOV_PROFILE_FTRACE GCOV_PROFILE := y endif +# Functions in this file could be invoked from early interrupt +# code and produce random code coverage. +KCOV_INSTRUMENT_trace_preemptirq.o := n + CFLAGS_bpf_trace.o := -I$(src) CFLAGS_trace_benchmark.o := -I$(src) @@ -102,5 +106,6 @@ obj-$(CONFIG_FPROBE) += fprobe.o obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o +obj-$(CONFIG_RV) += rv/ libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4d5629196d01..7f5eb295fe19 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -145,13 +145,14 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, - const char *fmt, ...) +void __blk_trace_note_message(struct blk_trace *bt, + struct cgroup_subsys_state *css, const char *fmt, ...) { int n; va_list args; unsigned long flags; char *buf; + u64 cgid = 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer_enabled)) @@ -170,17 +171,16 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - blkcg = NULL; #ifdef CONFIG_BLK_CGROUP - trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_id(blkcg->css.cgroup) : 1); -#else - trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0); + if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + cgid = cgroup_id(css->cgroup); + else + cgid = 1; #endif + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(__trace_note_message); +EXPORT_SYMBOL_GPL(__blk_trace_note_message); static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) @@ -205,7 +205,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), #define BLK_TC_PREFLUSH BLK_TC_FLUSH /* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ +#define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) << \ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) /* @@ -213,8 +213,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data, u64 cgid) + const blk_opf_t opf, u32 what, int error, + int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -227,16 +227,17 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int cpu; bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + const enum req_op op = opf & REQ_OP_MASK; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; what |= ddir_act[op_is_write(op) ? WRITE : READ]; - what |= MASK_TC_BIT(op_flags, SYNC); - what |= MASK_TC_BIT(op_flags, RAHEAD); - what |= MASK_TC_BIT(op_flags, META); - what |= MASK_TC_BIT(op_flags, PREFLUSH); - what |= MASK_TC_BIT(op_flags, FUA); + what |= MASK_TC_BIT(opf, SYNC); + what |= MASK_TC_BIT(opf, RAHEAD); + what |= MASK_TC_BIT(opf, META); + what |= MASK_TC_BIT(opf, PREFLUSH); + what |= MASK_TC_BIT(opf, FUA); if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) @@ -411,7 +412,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, NULL, "%s", msg); + __blk_trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; @@ -736,12 +737,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: - bdevname(bdev, b); + snprintf(b, sizeof(b), "%pg", bdev); ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: - bdevname(bdev, b); + snprintf(b, sizeof(b), "%pg", bdev); ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #endif @@ -770,19 +771,17 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { - mutex_lock(&q->debugfs_mutex); if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } - - mutex_unlock(&q->debugfs_mutex); } #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { + struct cgroup_subsys_state *blkcg_css; struct blk_trace *bt; /* We don't use the 'bt' value here except as an optimization... */ @@ -790,9 +789,10 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; - if (!bio->bi_blkg) + blkcg_css = bio_blkcg_css(bio); + if (!blkcg_css) return 0; - return cgroup_id(bio_blkcg(bio)->css.cgroup); + return cgroup_id(blkcg_css->cgroup); } #else static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) @@ -843,9 +843,8 @@ static void blk_add_trace_rq(struct request *rq, blk_status_t error, else what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, blk_status_to_errno(error), 0, - NULL, cgid); + __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags, + what, blk_status_to_errno(error), 0, NULL, cgid); rcu_read_unlock(); } @@ -904,7 +903,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, what, error, 0, NULL, + bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } @@ -950,7 +949,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); rcu_read_unlock(); } @@ -970,7 +969,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } rcu_read_unlock(); } @@ -986,8 +985,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, + bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT, blk_status_to_errno(bio->bi_status), sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); @@ -1023,7 +1021,7 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, + bio->bi_opf, BLK_TA_REMAP, blk_status_to_errno(bio->bi_status), sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); @@ -1059,7 +1057,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), 0, BLK_TA_REMAP, 0, + rq->cmd_flags, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } @@ -1085,7 +1083,7 @@ void blk_add_driver_data(struct request *rq, void *data, size_t len) return; } - __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, + __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); @@ -1868,17 +1866,6 @@ out_unlock_bdev: out: return ret ? ret : count; } - -int blk_trace_init_sysfs(struct device *dev) -{ - return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); -} - -void blk_trace_remove_sysfs(struct device *dev) -{ - sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); -} - #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING @@ -1886,21 +1873,21 @@ void blk_trace_remove_sysfs(struct device *dev) /** * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string. * @rwbs: buffer to be filled - * @op: REQ_OP_XXX for the tracepoint + * @opf: request operation type (REQ_OP_XXX) and flags for the tracepoint * * Description: - * Maps the REQ_OP_XXX to character and fills the buffer provided by the - * caller with resulting string. + * Maps each request operation and flag to a single character and fills the + * buffer provided by the caller with resulting string. * **/ -void blk_fill_rwbs(char *rwbs, unsigned int op) +void blk_fill_rwbs(char *rwbs, blk_opf_t opf) { int i = 0; - if (op & REQ_PREFLUSH) + if (opf & REQ_PREFLUSH) rwbs[i++] = 'F'; - switch (op & REQ_OP_MASK) { + switch (opf & REQ_OP_MASK) { case REQ_OP_WRITE: rwbs[i++] = 'W'; break; @@ -1921,13 +1908,13 @@ void blk_fill_rwbs(char *rwbs, unsigned int op) rwbs[i++] = 'N'; } - if (op & REQ_FUA) + if (opf & REQ_FUA) rwbs[i++] = 'F'; - if (op & REQ_RAHEAD) + if (opf & REQ_RAHEAD) rwbs[i++] = 'A'; - if (op & REQ_SYNC) + if (opf & REQ_SYNC) rwbs[i++] = 'S'; - if (op & REQ_META) + if (opf & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7fa2ebc07f60..688552df95ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -20,6 +20,8 @@ #include <linux/fprobe.h> #include <linux/bsearch.h> #include <linux/sort.h> +#include <linux/key.h> +#include <linux/verification.h> #include <net/bpf_sk_storage.h> @@ -129,7 +131,10 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ - ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run); + rcu_read_lock(); + ret = bpf_prog_run_array(rcu_dereference(call->prog_array), + ctx, bpf_prog_run); + rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); @@ -1023,11 +1028,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; +#ifdef CONFIG_X86_KERNEL_IBT +static unsigned long get_entry_ip(unsigned long fentry_ip) +{ + u32 instr; + + /* Being extra safe in here in case entry ip is on the page-edge. */ + if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1)) + return fentry_ip; + if (is_endbr(instr)) + fentry_ip -= ENDBR_INSN_SIZE; + return fentry_ip; +} +#else +#define get_entry_ip(fentry_ip) fentry_ip +#endif + BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct kprobe *kp = kprobe_running(); - return kp ? (uintptr_t)kp->addr : 0; + if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY)) + return 0; + + return get_entry_ip((uintptr_t)kp->addr); } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { @@ -1088,6 +1112,21 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { + .func = bpf_get_attach_cookie_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { #ifndef CONFIG_X86 @@ -1163,6 +1202,184 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +#ifdef CONFIG_KEYS +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "kfuncs which will be used in BPF programs"); + +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure. + * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. + */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such way is currently understood only by + * verify_pkcs7_signature(). + * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kerned image and, possibly, the initramfs signature). + * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise + */ +struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +/** + * bpf_verify_pkcs7_signature - verify a PKCS#7 signature + * @data_ptr: data to verify + * @sig_ptr: signature of the data + * @trusted_keyring: keyring with keys trusted for signature verification + * + * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* + * with keys in a keyring referenced by *trusted_keyring*. + * + * Return: 0 on success, a negative value on error. + */ +int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, + struct bpf_dynptr_kern *sig_ptr, + struct bpf_key *trusted_keyring) +{ + int ret; + + if (trusted_keyring->has_ref) { + /* + * Do the permission check deferred in bpf_lookup_user_key(). + * See bpf_lookup_user_key() for more details. + * + * A call to key_task_permission() here would be redundant, as + * it is already done by keyring_search() called by + * find_asymmetric_key(). + */ + ret = key_validate(trusted_keyring->key); + if (ret < 0) + return ret; + } + + return verify_pkcs7_signature(data_ptr->data, + bpf_dynptr_get_size(data_ptr), + sig_ptr->data, + bpf_dynptr_get_size(sig_ptr), + trusted_keyring->key, + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + NULL); +} +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ + +__diag_pop(); + +BTF_SET8_START(key_sig_kfunc_set) +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) +#endif +BTF_SET8_END(key_sig_kfunc_set) + +static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { + .owner = THIS_MODULE, + .set = &key_sig_kfunc_set, +}; + +static int __init bpf_key_sig_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &bpf_key_sig_kfunc_set); +} + +late_initcall(bpf_key_sig_kfuncs_init); +#endif /* CONFIG_KEYS */ + static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1179,6 +1396,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: @@ -1685,6 +1904,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_udp6_sock_proto; case BPF_FUNC_skc_to_unix_sock: return &bpf_skc_to_unix_sock_proto; + case BPF_FUNC_skc_to_mptcp_sock: + return &bpf_skc_to_mptcp_sock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_tracing_proto; case BPF_FUNC_sk_storage_delete: @@ -1716,6 +1937,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) @@ -1912,7 +2135,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, event->prog = prog; event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); unlock: mutex_unlock(&bpf_event_mutex); @@ -1938,7 +2161,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_array_delete_safe(old_array, event->prog); } else { rcu_assign_pointer(event->tp_event->prog_array, new_array); - bpf_prog_array_free(old_array); + bpf_prog_array_free_sleepable(old_array); } bpf_prog_put(event->prog); @@ -2018,9 +2241,15 @@ static __always_inline void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { cant_sleep(); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + bpf_prog_inc_misses_counter(prog); + goto out; + } rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); +out: + this_cpu_dec(*(prog->active)); } #define UNPACK(...) __VA_ARGS__ @@ -2226,6 +2455,59 @@ struct bpf_kprobe_multi_run_ctx { unsigned long entry_ip; }; +struct user_syms { + const char **syms; + char *buf; +}; + +static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) +{ + unsigned long __user usymbol; + const char **syms = NULL; + char *buf = NULL, *p; + int err = -ENOMEM; + unsigned int i; + + syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL); + if (!syms) + goto error; + + buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL); + if (!buf) + goto error; + + for (p = buf, i = 0; i < cnt; i++) { + if (__get_user(usymbol, usyms + i)) { + err = -EFAULT; + goto error; + } + err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN); + if (err == KSYM_NAME_LEN) + err = -E2BIG; + if (err < 0) + goto error; + syms[i] = p; + p += err + 1; + } + + us->syms = syms; + us->buf = buf; + return 0; + +error: + if (err) { + kvfree(syms); + kvfree(buf); + } + return err; +} + +static void free_user_syms(struct user_syms *us) +{ + kvfree(us->syms); + kvfree(us->buf); +} + static void bpf_kprobe_multi_link_release(struct bpf_link *link) { struct bpf_kprobe_multi_link *kmulti_link; @@ -2254,15 +2536,13 @@ static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void const struct bpf_kprobe_multi_link *link = priv; unsigned long *addr_a = a, *addr_b = b; u64 *cookie_a, *cookie_b; - unsigned long tmp1; - u64 tmp2; cookie_a = link->cookies + (addr_a - link->addrs); cookie_b = link->cookies + (addr_b - link->addrs); /* swap addr_a/addr_b and cookie_a/cookie_b values */ - tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1; - tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2; + swap(*addr_a, *addr_b); + swap(*cookie_a, *cookie_b); } static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) @@ -2339,62 +2619,43 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, } static void -kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, +kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, struct pt_regs *regs) { struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, entry_ip, regs); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); } -static int -kprobe_multi_resolve_syms(const void *usyms, u32 cnt, - unsigned long *addrs) +static int symbols_cmp_r(const void *a, const void *b, const void *priv) { - unsigned long addr, size; - const char **syms; - int err = -ENOMEM; - unsigned int i; - char *func; + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; - size = cnt * sizeof(*syms); - syms = kvzalloc(size, GFP_KERNEL); - if (!syms) - return -ENOMEM; + return strcmp(*str_a, *str_b); +} - func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); - if (!func) - goto error; +struct multi_symbols_sort { + const char **funcs; + u64 *cookies; +}; - if (copy_from_user(syms, usyms, size)) { - err = -EFAULT; - goto error; - } +static void symbols_swap_r(void *a, void *b, int size, const void *priv) +{ + const struct multi_symbols_sort *data = priv; + const char **name_a = a, **name_b = b; - for (i = 0; i < cnt; i++) { - err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN); - if (err == KSYM_NAME_LEN) - err = -E2BIG; - if (err < 0) - goto error; - err = -EINVAL; - addr = kallsyms_lookup_name(func); - if (!addr) - goto error; - if (!kallsyms_lookup_size_offset(addr, &size, NULL)) - goto error; - addr = ftrace_location_range(addr, addr + size - 1); - if (!addr) - goto error; - addrs[i] = addr; - } + swap(*name_a, *name_b); - err = 0; -error: - kvfree(syms); - kfree(func); - return err; + /* If defined, swap also related cookies. */ + if (data->cookies) { + u64 *cookie_a, *cookie_b; + + cookie_a = data->cookies + (name_a - data->funcs); + cookie_b = data->cookies + (name_b - data->funcs); + swap(*cookie_a, *cookie_b); + } } int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -2430,24 +2691,13 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr return -EINVAL; size = cnt * sizeof(*addrs); - addrs = kvmalloc(size, GFP_KERNEL); + addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; - if (uaddrs) { - if (copy_from_user(addrs, uaddrs, size)) { - err = -EFAULT; - goto error; - } - } else { - err = kprobe_multi_resolve_syms(usyms, cnt, addrs); - if (err) - goto error; - } - ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); if (ucookies) { - cookies = kvmalloc(size, GFP_KERNEL); + cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!cookies) { err = -ENOMEM; goto error; @@ -2458,6 +2708,33 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } } + if (uaddrs) { + if (copy_from_user(addrs, uaddrs, size)) { + err = -EFAULT; + goto error; + } + } else { + struct multi_symbols_sort data = { + .cookies = cookies, + }; + struct user_syms us; + + err = copy_user_syms(&us, usyms, cnt); + if (err) + goto error; + + if (cookies) + data.funcs = us.syms; + + sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r, + symbols_swap_r, &data); + + err = ftrace_lookup_symbols(us.syms, cnt, addrs); + free_user_syms(&us); + if (err) + goto error; + } + link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { err = -ENOMEM; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8f4fb328133a..218cd95bf8e4 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -30,6 +30,26 @@ int ftrace_graph_active; /* Both enabled by default (can be cleared by function_graph tracer flags */ static bool fgraph_sleep_time = true; +#ifdef CONFIG_DYNAMIC_FTRACE +/* + * archs can override this function if they must do something + * to enable hook for graph tracer. + */ +int __weak ftrace_enable_ftrace_graph_caller(void) +{ + return 0; +} + +/* + * archs can override this function if they must do something + * to disable hook for graph tracer. + */ +int __weak ftrace_disable_ftrace_graph_caller(void) +{ + return 0; +} +#endif + /** * ftrace_graph_stop - set to permanently disable function graph tracing * @@ -404,9 +424,9 @@ free: static void ftrace_graph_probe_sched_switch(void *ignore, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) { unsigned long long timestamp; int index; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 89d9f994ebb0..aac63ca9c3d1 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -85,39 +85,31 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data, } NOKPROBE_SYMBOL(fprobe_exit_handler); +static int symbols_cmp(const void *a, const void *b) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + /* Convert ftrace location address from symbols */ static unsigned long *get_ftrace_locations(const char **syms, int num) { - unsigned long addr, size; unsigned long *addrs; - int i; /* Convert symbols to symbol address */ addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); if (!addrs) return ERR_PTR(-ENOMEM); - for (i = 0; i < num; i++) { - addr = kallsyms_lookup_name(syms[i]); - if (!addr) /* Maybe wrong symbol */ - goto error; - - /* Convert symbol address to ftrace location. */ - if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size) - goto error; + /* ftrace_lookup_symbols expects sorted symbols */ + sort(syms, num, sizeof(*syms), symbols_cmp, NULL); - addr = ftrace_location_range(addr, addr + size - 1); - if (!addr) /* No dynamic ftrace there. */ - goto error; + if (!ftrace_lookup_symbols(syms, num, addrs)) + return addrs; - addrs[i] = addr; - } - - return addrs; - -error: kfree(addrs); - return ERR_PTR(-ENOENT); } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f1d2f5e7263..447d2e2a8549 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -45,6 +45,8 @@ #include "trace_output.h" #include "trace_stat.h" +#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__" + #define FTRACE_WARN_ON(cond) \ ({ \ int ___r = cond; \ @@ -86,7 +88,7 @@ struct ftrace_ops ftrace_list_end __read_mostly = { /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; -static int last_ftrace_enabled; +static int __maybe_unused last_ftrace_enabled; /* Current function tracing op */ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; @@ -119,7 +121,7 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; struct ftrace_ops global_ops; -/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */ +/* Defined by vmlinux.lds.h see the comment above arch_ftrace_ops_list_func for details */ void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); @@ -952,7 +954,6 @@ static struct tracer_stat function_stats __initdata = { static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; - struct dentry *entry; char *name; int ret; int cpu; @@ -983,11 +984,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } } - entry = tracefs_create_file("function_profile_enabled", - TRACE_MODE_WRITE, d_tracer, NULL, - &ftrace_profile_fops); - if (!entry) - pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); + trace_create_file("function_profile_enabled", + TRACE_MODE_WRITE, d_tracer, NULL, + &ftrace_profile_fops); } #else /* CONFIG_FUNCTION_PROFILER */ @@ -1870,6 +1869,13 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected) * - If the hash is EMPTY_HASH, it hits nothing * - Anything else hits the recs which match the hash entries. + * + * DIRECT ops does not have IPMODIFY flag, but we still need to check it + * against functions with FTRACE_FL_IPMODIFY. If there is any overlap, call + * ops_func(SHARE_IPMODIFY_SELF) to make sure current ops can share with + * IPMODIFY. If ops_func(SHARE_IPMODIFY_SELF) returns non-zero, propagate + * the return value to the caller and eventually to the owner of the DIRECT + * ops. */ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_hash *old_hash, @@ -1878,17 +1884,26 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_page *pg; struct dyn_ftrace *rec, *end = NULL; int in_old, in_new; + bool is_ipmodify, is_direct; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) return 0; - if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY; + is_direct = ops->flags & FTRACE_OPS_FL_DIRECT; + + /* neither IPMODIFY nor DIRECT, skip */ + if (!is_ipmodify && !is_direct) + return 0; + + if (WARN_ON_ONCE(is_ipmodify && is_direct)) return 0; /* - * Since the IPMODIFY is a very address sensitive action, we do not - * allow ftrace_ops to set all functions to new hash. + * Since the IPMODIFY and DIRECT are very address sensitive + * actions, we do not allow ftrace_ops to set all functions to new + * hash. */ if (!new_hash || !old_hash) return -EINVAL; @@ -1906,12 +1921,32 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, continue; if (in_new) { - /* New entries must ensure no others are using it */ - if (rec->flags & FTRACE_FL_IPMODIFY) - goto rollback; - rec->flags |= FTRACE_FL_IPMODIFY; - } else /* Removed entry */ + if (rec->flags & FTRACE_FL_IPMODIFY) { + int ret; + + /* Cannot have two ipmodify on same rec */ + if (is_ipmodify) + goto rollback; + + FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + + /* + * Another ops with IPMODIFY is already + * attached. We are now attaching a direct + * ops. Run SHARE_IPMODIFY_SELF, to check + * whether sharing is supported. + */ + if (!ops->ops_func) + return -EBUSY; + ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); + if (ret) + return ret; + } else if (is_ipmodify) { + rec->flags |= FTRACE_FL_IPMODIFY; + } + } else if (is_ipmodify) { rec->flags &= ~FTRACE_FL_IPMODIFY; + } } while_for_each_ftrace_rec(); return 0; @@ -2455,8 +2490,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops direct_ops = { .func = call_direct_funcs, - .flags = FTRACE_OPS_FL_IPMODIFY - | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS + .flags = FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, /* * By declaring the main trampoline as this trampoline @@ -2707,18 +2741,16 @@ ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec) * archs can override this function if they must do something * before the modifying code is performed. */ -int __weak ftrace_arch_code_modify_prepare(void) +void __weak ftrace_arch_code_modify_prepare(void) { - return 0; } /* * archs can override this function if they must do something * after the modifying code is performed. */ -int __weak ftrace_arch_code_modify_post_process(void) +void __weak ftrace_arch_code_modify_post_process(void) { - return 0; } void ftrace_modify_all_code(int command) @@ -2804,12 +2836,7 @@ void __weak arch_ftrace_update_code(int command) static void ftrace_run_update_code(int command) { - int ret; - - ret = ftrace_arch_code_modify_prepare(); - FTRACE_WARN_ON(ret); - if (ret) - return; + ftrace_arch_code_modify_prepare(); /* * By default we use stop_machine() to modify the code. @@ -2819,8 +2846,7 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); - ret = ftrace_arch_code_modify_post_process(); - FTRACE_WARN_ON(ret); + ftrace_arch_code_modify_post_process(); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, @@ -2946,6 +2972,16 @@ int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_startup_enable(command); + /* + * If ftrace is in an undefined state, we just remove ops from list + * to prevent the NULL pointer, instead of totally rolling it back and + * free trampoline, because those actions could cause further damage. + */ + if (unlikely(ftrace_disabled)) { + __unregister_ftrace_function(ops); + return -ENODEV; + } + ops->flags &= ~FTRACE_OPS_FL_ADDING; return 0; @@ -3065,40 +3101,6 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) return 0; } -static void ftrace_startup_sysctl(void) -{ - int command; - - if (unlikely(ftrace_disabled)) - return; - - /* Force update next time */ - saved_ftrace_func = NULL; - /* ftrace_start_up is true if we want ftrace running */ - if (ftrace_start_up) { - command = FTRACE_UPDATE_CALLS; - if (ftrace_graph_active) - command |= FTRACE_START_FUNC_RET; - ftrace_startup_enable(command); - } -} - -static void ftrace_shutdown_sysctl(void) -{ - int command; - - if (unlikely(ftrace_disabled)) - return; - - /* ftrace_start_up is true if ftrace is running */ - if (ftrace_start_up) { - command = FTRACE_DISABLE_CALLS; - if (ftrace_graph_active) - command |= FTRACE_STOP_FUNC_RET; - ftrace_run_update_code(command); - } -} - static u64 ftrace_update_time; unsigned long ftrace_update_tot_cnt; unsigned long ftrace_number_of_pages; @@ -3114,36 +3116,6 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) ftrace_hash_empty(ops->func_hash->notrace_hash); } -/* - * Check if the current ops references the record. - * - * If the ops traces all functions, then it was already accounted for. - * If the ops does not trace the current record function, skip it. - * If the ops ignores the function via notrace filter, skip it. - */ -static inline bool -ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) -{ - /* If ops isn't enabled, ignore it */ - if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return false; - - /* If ops traces all then it includes this function */ - if (ops_traces_mod(ops)) - return true; - - /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->func_hash->filter_hash) && - !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) - return false; - - /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) - return false; - - return true; -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { bool init_nop = ftrace_need_init_nop(); @@ -3665,6 +3637,105 @@ static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, seq_printf(m, " ->%pS", ptr); } +#ifdef FTRACE_MCOUNT_MAX_OFFSET +/* + * Weak functions can still have an mcount/fentry that is saved in + * the __mcount_loc section. These can be detected by having a + * symbol offset of greater than FTRACE_MCOUNT_MAX_OFFSET, as the + * symbol found by kallsyms is not the function that the mcount/fentry + * is part of. The offset is much greater in these cases. + * + * Test the record to make sure that the ip points to a valid kallsyms + * and if not, mark it disabled. + */ +static int test_for_valid_rec(struct dyn_ftrace *rec) +{ + char str[KSYM_SYMBOL_LEN]; + unsigned long offset; + const char *ret; + + ret = kallsyms_lookup(rec->ip, NULL, &offset, NULL, str); + + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + rec->flags |= FTRACE_FL_DISABLED; + return 0; + } + return 1; +} + +static struct workqueue_struct *ftrace_check_wq __initdata; +static struct work_struct ftrace_check_work __initdata; + +/* + * Scan all the mcount/fentry entries to make sure they are valid. + */ +static __init void ftrace_check_work_func(struct work_struct *work) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + test_for_valid_rec(rec); + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static int __init ftrace_check_for_weak_functions(void) +{ + INIT_WORK(&ftrace_check_work, ftrace_check_work_func); + + ftrace_check_wq = alloc_workqueue("ftrace_check_wq", WQ_UNBOUND, 0); + + queue_work(ftrace_check_wq, &ftrace_check_work); + return 0; +} + +static int __init ftrace_check_sync(void) +{ + /* Make sure the ftrace_check updates are finished */ + if (ftrace_check_wq) + destroy_workqueue(ftrace_check_wq); + return 0; +} + +late_initcall_sync(ftrace_check_sync); +subsys_initcall(ftrace_check_for_weak_functions); + +static int print_rec(struct seq_file *m, unsigned long ip) +{ + unsigned long offset; + char str[KSYM_SYMBOL_LEN]; + char *modname; + const char *ret; + + ret = kallsyms_lookup(ip, NULL, &offset, &modname, str); + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + snprintf(str, KSYM_SYMBOL_LEN, "%s_%ld", + FTRACE_INVALID_FUNCTION, offset); + ret = NULL; + } + + seq_puts(m, str); + if (modname) + seq_printf(m, " [%s]", modname); + return ret == NULL ? -1 : 0; +} +#else +static inline int test_for_valid_rec(struct dyn_ftrace *rec) +{ + return 1; +} + +static inline int print_rec(struct seq_file *m, unsigned long ip) +{ + seq_printf(m, "%ps", (void *)ip); + return 0; +} +#endif + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -3689,7 +3760,13 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%ps", (void *)rec->ip); + if (print_rec(m, rec->ip)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED)); + seq_putc(m, '\n'); + return 0; + } + if (iter->flags & FTRACE_ITER_ENABLED) { struct ftrace_ops *ops; @@ -4007,6 +4084,24 @@ add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, return 0; } +#ifdef FTRACE_MCOUNT_MAX_OFFSET +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + unsigned long offset; + + kallsyms_lookup(ip, NULL, &offset, modname, str); + if (offset > FTRACE_MCOUNT_MAX_OFFSET) + return -1; + return 0; +} +#else +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + kallsyms_lookup(ip, NULL, NULL, modname, str); + return 0; +} +#endif + static int ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, struct ftrace_glob *mod_g, int exclude_mod) @@ -4014,7 +4109,12 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, char str[KSYM_SYMBOL_LEN]; char *modname; - kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + if (lookup_ip(rec->ip, &modname, str)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(system_state == SYSTEM_RUNNING && + !(rec->flags & FTRACE_FL_DISABLED)); + return 0; + } if (mod_g) { int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0; @@ -4465,7 +4565,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, * @ip: The instruction pointer address to remove the data from * * Returns the data if it is found, otherwise NULL. - * Note, if the data pointer is used as the data itself, (see + * Note, if the data pointer is used as the data itself, (see * ftrace_func_mapper_find_ip(), then the return value may be meaningless, * if the data pointer was set to zero. */ @@ -4560,8 +4660,8 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, struct ftrace_probe_ops *probe_ops, void *data) { + struct ftrace_func_probe *probe = NULL, *iter; struct ftrace_func_entry *entry; - struct ftrace_func_probe *probe; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; struct ftrace_hash *hash; @@ -4580,11 +4680,13 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, mutex_lock(&ftrace_lock); /* Check if the probe_ops is already registered */ - list_for_each_entry(probe, &tr->func_probes, list) { - if (probe->probe_ops == probe_ops) + list_for_each_entry(iter, &tr->func_probes, list) { + if (iter->probe_ops == probe_ops) { + probe = iter; break; + } } - if (&probe->list == &tr->func_probes) { + if (!probe) { probe = kzalloc(sizeof(*probe), GFP_KERNEL); if (!probe) { mutex_unlock(&ftrace_lock); @@ -4702,9 +4804,9 @@ int unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, struct ftrace_probe_ops *probe_ops) { + struct ftrace_func_probe *probe = NULL, *iter; struct ftrace_ops_hash old_hash_ops; struct ftrace_func_entry *entry; - struct ftrace_func_probe *probe; struct ftrace_glob func_g; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; @@ -4732,11 +4834,13 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, mutex_lock(&ftrace_lock); /* Check if the probe_ops is already registered */ - list_for_each_entry(probe, &tr->func_probes, list) { - if (probe->probe_ops == probe_ops) + list_for_each_entry(iter, &tr->func_probes, list) { + if (iter->probe_ops == probe_ops) { + probe = iter; break; + } } - if (&probe->list == &tr->func_probes) + if (!probe) goto err_unlock_ftrace; ret = -EINVAL; @@ -5126,6 +5230,8 @@ static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr) return direct; } +static int register_ftrace_function_nolock(struct ftrace_ops *ops); + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -5195,16 +5301,15 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) goto out_unlock; ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); - if (ret) - remove_hash_entry(direct_functions, entry); if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { - ret = register_ftrace_function(&direct_ops); + ret = register_ftrace_function_nolock(&direct_ops); if (ret) ftrace_set_filter_ip(&direct_ops, ip, 1, 0); } if (ret) { + remove_hash_entry(direct_functions, entry); kfree(entry); if (!direct->count) { list_del_rcu(&direct->next); @@ -5457,8 +5562,7 @@ int modify_ftrace_direct(unsigned long ip, } EXPORT_SYMBOL_GPL(modify_ftrace_direct); -#define MULTI_FLAGS (FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | \ - FTRACE_OPS_FL_SAVE_REGS) +#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS) static int check_direct_multi(struct ftrace_ops *ops) { @@ -5551,7 +5655,7 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) ops->flags = MULTI_FLAGS; ops->trampoline = FTRACE_REGS_ADDR; - err = register_ftrace_function(ops); + err = register_ftrace_function_nolock(ops); out_remove: if (err) @@ -5603,22 +5707,8 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) } EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); -/** - * modify_ftrace_direct_multi - Modify an existing direct 'multi' call - * to call something else - * @ops: The address of the struct ftrace_ops object - * @addr: The address of the new trampoline to call at @ops functions - * - * This is used to unregister currently registered direct caller and - * register new one @addr on functions registered in @ops object. - * - * Note there's window between ftrace_shutdown and ftrace_startup calls - * where there will be no callbacks called. - * - * Returns: zero on success. Non zero on error, which includes: - * -EINVAL - The @ops object was not properly registered. - */ -int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +static int +__modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) { struct ftrace_hash *hash; struct ftrace_func_entry *entry, *iter; @@ -5629,20 +5719,15 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) int i, size; int err; - if (check_direct_multi(ops)) - return -EINVAL; - if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return -EINVAL; - - mutex_lock(&direct_mutex); + lockdep_assert_held_once(&direct_mutex); /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; - err = register_ftrace_function(&tmp_ops); + err = register_ftrace_function_nolock(&tmp_ops); if (err) - goto out_direct; + return err; /* * Now the ftrace_ops_list_func() is called to do the direct callers. @@ -5666,7 +5751,64 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) /* Removing the tmp_ops will add the updated direct callers to the functions */ unregister_ftrace_function(&tmp_ops); - out_direct: + return err; +} + +/** + * modify_ftrace_direct_multi_nolock - Modify an existing direct 'multi' call + * to call something else + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the new trampoline to call at @ops functions + * + * This is used to unregister currently registered direct caller and + * register new one @addr on functions registered in @ops object. + * + * Note there's window between ftrace_shutdown and ftrace_startup calls + * where there will be no callbacks called. + * + * Caller should already have direct_mutex locked, so we don't lock + * direct_mutex here. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @ops object was not properly registered. + */ +int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr) +{ + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + return __modify_ftrace_direct_multi(ops, addr); +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock); + +/** + * modify_ftrace_direct_multi - Modify an existing direct 'multi' call + * to call something else + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the new trampoline to call at @ops functions + * + * This is used to unregister currently registered direct caller and + * register new one @addr on functions registered in @ops object. + * + * Note there's window between ftrace_shutdown and ftrace_startup calls + * where there will be no callbacks called. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @ops object was not properly registered. + */ +int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + int err; + + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + mutex_lock(&direct_mutex); + err = __modify_ftrace_direct_multi(ops, addr); mutex_unlock(&direct_mutex); return err; } @@ -6635,6 +6777,38 @@ static int ftrace_get_trampoline_kallsym(unsigned int symnum, return -ERANGE; } +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) || defined(CONFIG_MODULES) +/* + * Check if the current ops references the given ip. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. + */ +static bool +ops_references_ip(struct ftrace_ops *ops, unsigned long ip) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return false; + + /* If ops traces all then it includes this function */ + if (ops_traces_mod(ops)) + return true; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) + return false; + + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) + return false; + + return true; +} +#endif + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6647,7 +6821,7 @@ static int referenced_filters(struct dyn_ftrace *rec) int cnt = 0; for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) { + if (ops_references_ip(ops, rec->ip)) { if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT)) continue; if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY)) @@ -6827,6 +7001,13 @@ void ftrace_module_enable(struct module *mod) !within_module_init(rec->ip, mod)) break; + /* Weak functions should still be ignored */ + if (!test_for_valid_rec(rec)) { + /* Clear all other flags. Should not be enabled anyway */ + rec->flags = FTRACE_FL_DISABLED; + continue; + } + cnt = 0; /* @@ -6863,11 +7044,16 @@ void ftrace_module_enable(struct module *mod) void ftrace_module_init(struct module *mod) { + int ret; + if (ftrace_disabled || !mod->num_ftrace_callsites) return; - ftrace_process_locs(mod, mod->ftrace_callsites, - mod->ftrace_callsites + mod->num_ftrace_callsites); + ret = ftrace_process_locs(mod, mod->ftrace_callsites, + mod->ftrace_callsites + mod->num_ftrace_callsites); + if (ret) + pr_warn("ftrace: failed to allocate entries for module '%s' functions\n", + mod->name); } static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, @@ -7200,15 +7386,19 @@ void __init ftrace_init(void) pr_info("ftrace: allocating %ld entries in %ld pages\n", count, count / ENTRIES_PER_PAGE + 1); - last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); + if (ret) { + pr_warn("ftrace: failed to allocate entries for functions\n"); + goto failed; + } pr_info("ftrace: allocated %ld pages with %ld groups\n", ftrace_number_of_pages, ftrace_number_of_groups); + last_ftrace_enabled = ftrace_enabled = 1; + set_ftrace_early_filters(); return; @@ -7267,9 +7457,6 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_all(int command) { } -# define ftrace_startup_sysctl() do { } while (0) -# define ftrace_shutdown_sysctl() do { } while (0) - static void ftrace_update_trampoline(struct ftrace_ops *ops) { } @@ -7420,9 +7607,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) static void ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *pid_list; @@ -7864,6 +8051,143 @@ int ftrace_is_dead(void) return ftrace_disabled; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When registering ftrace_ops with IPMODIFY, it is necessary to make sure + * it doesn't conflict with any direct ftrace_ops. If there is existing + * direct ftrace_ops on a kernel function being patched, call + * FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER on it to enable sharing. + * + * @ops: ftrace_ops being registered. + * + * Returns: + * 0 on success; + * Negative on failure. + */ +static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *hash; + struct ftrace_ops *op; + int size, i, ret; + + lockdep_assert_held_once(&direct_mutex); + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return 0; + + hash = ops->func_hash->filter_hash; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + unsigned long ip = entry->ip; + bool found_op = false; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (!(op->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (ops_references_ip(op, ip)) { + found_op = true; + break; + } + } while_for_each_ftrace_op(op); + mutex_unlock(&ftrace_lock); + + if (found_op) { + if (!op->ops_func) + return -EBUSY; + + ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); + if (ret) + return ret; + } + } + } + + return 0; +} + +/* + * Similar to prepare_direct_functions_for_ipmodify, clean up after ops + * with IPMODIFY is unregistered. The cleanup is optional for most DIRECT + * ops. + */ +static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *hash; + struct ftrace_ops *op; + int size, i; + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return; + + mutex_lock(&direct_mutex); + + hash = ops->func_hash->filter_hash; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + unsigned long ip = entry->ip; + bool found_op = false; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (!(op->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (ops_references_ip(op, ip)) { + found_op = true; + break; + } + } while_for_each_ftrace_op(op); + mutex_unlock(&ftrace_lock); + + /* The cleanup is optional, ignore any errors */ + if (found_op && op->ops_func) + op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); + } + } + mutex_unlock(&direct_mutex); +} + +#define lock_direct_mutex() mutex_lock(&direct_mutex) +#define unlock_direct_mutex() mutex_unlock(&direct_mutex) + +#else /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) +{ + return 0; +} + +static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) +{ +} + +#define lock_direct_mutex() do { } while (0) +#define unlock_direct_mutex() do { } while (0) + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +/* + * Similar to register_ftrace_function, except we don't lock direct_mutex. + */ +static int register_ftrace_function_nolock(struct ftrace_ops *ops) +{ + int ret; + + ftrace_ops_init(ops); + + mutex_lock(&ftrace_lock); + + ret = ftrace_startup(ops, 0); + + mutex_unlock(&ftrace_lock); + + return ret; +} + /** * register_ftrace_function - register a function for profiling * @ops: ops structure that holds the function for profiling. @@ -7879,14 +8203,15 @@ int register_ftrace_function(struct ftrace_ops *ops) { int ret; - ftrace_ops_init(ops); - - mutex_lock(&ftrace_lock); - - ret = ftrace_startup(ops, 0); + lock_direct_mutex(); + ret = prepare_direct_functions_for_ipmodify(ops); + if (ret < 0) + goto out_unlock; - mutex_unlock(&ftrace_lock); + ret = register_ftrace_function_nolock(ops); +out_unlock: + unlock_direct_mutex(); return ret; } EXPORT_SYMBOL_GPL(register_ftrace_function); @@ -7905,10 +8230,122 @@ int unregister_ftrace_function(struct ftrace_ops *ops) ret = ftrace_shutdown(ops, 0); mutex_unlock(&ftrace_lock); + cleanup_direct_functions_after_ipmodify(ops); return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_function); +static int symbols_cmp(const void *a, const void *b) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + +struct kallsyms_data { + unsigned long *addrs; + const char **syms; + size_t cnt; + size_t found; +}; + +static int kallsyms_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct kallsyms_data *args = data; + const char **sym; + int idx; + + sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp); + if (!sym) + return 0; + + idx = sym - args->syms; + if (args->addrs[idx]) + return 0; + + if (!ftrace_location(addr)) + return 0; + + args->addrs[idx] = addr; + args->found++; + return args->found == args->cnt ? 1 : 0; +} + +/** + * ftrace_lookup_symbols - Lookup addresses for array of symbols + * + * @sorted_syms: array of symbols pointers symbols to resolve, + * must be alphabetically sorted + * @cnt: number of symbols/addresses in @syms/@addrs arrays + * @addrs: array for storing resulting addresses + * + * This function looks up addresses for array of symbols provided in + * @syms array (must be alphabetically sorted) and stores them in + * @addrs array, which needs to be big enough to store at least @cnt + * addresses. + * + * This function returns 0 if all provided symbols are found, + * -ESRCH otherwise. + */ +int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) +{ + struct kallsyms_data args; + int err; + + memset(addrs, 0, sizeof(*addrs) * cnt); + args.addrs = addrs; + args.syms = sorted_syms; + args.cnt = cnt; + args.found = 0; + err = kallsyms_on_each_symbol(kallsyms_callback, &args); + if (err < 0) + return err; + return args.found == args.cnt ? 0 : -ESRCH; +} + +#ifdef CONFIG_SYSCTL + +#ifdef CONFIG_DYNAMIC_FTRACE +static void ftrace_startup_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* Force update next time */ + saved_ftrace_func = NULL; + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) { + command = FTRACE_UPDATE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_START_FUNC_RET; + ftrace_startup_enable(command); + } +} + +static void ftrace_shutdown_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) { + command = FTRACE_DISABLE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_STOP_FUNC_RET; + ftrace_run_update_code(command); + } +} +#else +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + static bool is_permanent_ops_registered(void) { struct ftrace_ops *op; @@ -7921,7 +8358,7 @@ static bool is_permanent_ops_registered(void) return false; } -int +static int ftrace_enable_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -7964,3 +8401,22 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_unlock(&ftrace_lock); return ret; } + +static struct ctl_table ftrace_sysctls[] = { + { + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ftrace_enable_sysctl, + }, + {} +}; + +static int __init ftrace_sysctl_init(void) +{ + register_sysctl_init("kernel", ftrace_sysctls); + return 0; +} +late_initcall(ftrace_sysctl_init); +#endif diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index a2ef1d18126a..95106d02b32d 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -118,9 +118,9 @@ static inline unsigned int pid_join(unsigned int upper1, /** * trace_pid_list_is_set - test if the pid is set in the list * @pid_list: The pid list to test - * @pid: The pid to to see if set in the list. + * @pid: The pid to see if set in the list. * - * Tests if @pid is is set in the @pid_list. This is usually called + * Tests if @pid is set in the @pid_list. This is usually called * from the scheduler when a task is scheduled. Its pid is checked * if it should be traced or not. * diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index ab463a4d2b23..c69d82273ce7 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -65,7 +65,7 @@ static void rethook_free_rcu(struct rcu_head *head) */ void rethook_free(struct rethook *rh) { - rcu_assign_pointer(rh->handler, NULL); + WRITE_ONCE(rh->handler, NULL); call_rcu(&rh->rcu, rethook_free_rcu); } @@ -154,6 +154,15 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!handler)) return NULL; + /* + * This expects the caller will set up a rethook on a function entry. + * When the function returns, the rethook will eventually be reclaimed + * or released in the rethook_recycle() with call_rcu(). + * This means the caller must be run in the RCU-availabe context. + */ + if (unlikely(!rcu_is_watching())) + return NULL; + fn = freelist_try_get(&rh->pool); if (!fn) return NULL; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 05dfc7a12d3d..d59b6a328b7f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -29,6 +29,14 @@ #include <asm/local.h> +/* + * The "absolute" timestamp in the buffer is only 59 bits. + * If a clock has the 5 MSBs set, it needs to be saved and + * reinserted. + */ +#define TS_MSB (0xf8ULL << 56) +#define ABS_TS_MASK (~TS_MSB) + static void update_pages_handler(struct work_struct *work); /* @@ -468,6 +476,7 @@ struct rb_time_struct { local_t cnt; local_t top; local_t bottom; + local_t msb; }; #else #include <asm/local64.h> @@ -569,7 +578,6 @@ struct ring_buffer_iter { * For the ring buffer, 64 bit required operations for the time is * the following: * - * - Only need 59 bits (uses 60 to make it even). * - Reads may fail if it interrupted a modification of the time stamp. * It will succeed if it did not interrupt another write even if * the read itself is interrupted by a write. @@ -594,6 +602,7 @@ struct ring_buffer_iter { */ #define RB_TIME_SHIFT 30 #define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) +#define RB_TIME_MSB_SHIFT 60 static inline int rb_time_cnt(unsigned long val) { @@ -613,7 +622,7 @@ static inline u64 rb_time_val(unsigned long top, unsigned long bottom) static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) { - unsigned long top, bottom; + unsigned long top, bottom, msb; unsigned long c; /* @@ -625,6 +634,7 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) c = local_read(&t->cnt); top = local_read(&t->top); bottom = local_read(&t->bottom); + msb = local_read(&t->msb); } while (c != local_read(&t->cnt)); *cnt = rb_time_cnt(top); @@ -633,7 +643,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) if (*cnt != rb_time_cnt(bottom)) return false; - *ret = rb_time_val(top, bottom); + /* The shift to msb will lose its cnt bits */ + *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); return true; } @@ -649,10 +660,12 @@ static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); } -static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom) +static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom, + unsigned long *msb) { *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); + *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT); } static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) @@ -663,15 +676,16 @@ static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long static void rb_time_set(rb_time_t *t, u64 val) { - unsigned long cnt, top, bottom; + unsigned long cnt, top, bottom, msb; - rb_time_split(val, &top, &bottom); + rb_time_split(val, &top, &bottom, &msb); /* Writes always succeed with a valid number even if it gets interrupted. */ do { cnt = local_inc_return(&t->cnt); rb_time_val_set(&t->top, top, cnt); rb_time_val_set(&t->bottom, bottom, cnt); + rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); } while (cnt != local_read(&t->cnt)); } @@ -686,8 +700,8 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) { - unsigned long cnt, top, bottom; - unsigned long cnt2, top2, bottom2; + unsigned long cnt, top, bottom, msb; + unsigned long cnt2, top2, bottom2, msb2; u64 val; /* The cmpxchg always fails if it interrupted an update */ @@ -703,16 +717,18 @@ static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) cnt2 = cnt + 1; - rb_time_split(val, &top, &bottom); + rb_time_split(val, &top, &bottom, &msb); top = rb_time_val_cnt(top, cnt); bottom = rb_time_val_cnt(bottom, cnt); - rb_time_split(set, &top2, &bottom2); + rb_time_split(set, &top2, &bottom2, &msb2); top2 = rb_time_val_cnt(top2, cnt2); bottom2 = rb_time_val_cnt(bottom2, cnt2); if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) return false; + if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) + return false; if (!rb_time_read_cmpxchg(&t->top, top, top2)) return false; if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) @@ -783,6 +799,24 @@ static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, } #endif +/* + * The absolute time stamp drops the 5 MSBs and some clocks may + * require them. The rb_fix_abs_ts() will take a previous full + * time stamp, and add the 5 MSB of that time stamp on to the + * saved absolute time stamp. Then they are compared in case of + * the unlikely event that the latest time stamp incremented + * the 5 MSB. + */ +static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts) +{ + if (save_ts & TS_MSB) { + abs |= save_ts & TS_MSB; + /* Check for overflow */ + if (unlikely(abs < save_ts)) + abs += 1ULL << 59; + } + return abs; +} static inline u64 rb_time_stamp(struct trace_buffer *buffer); @@ -811,8 +845,10 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, u64 ts; /* If the event includes an absolute time, then just use that */ - if (event->type_len == RINGBUF_TYPE_TIME_STAMP) - return rb_event_time_stamp(event); + if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { + ts = rb_event_time_stamp(event); + return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp); + } nest = local_read(&cpu_buffer->committing); verify_event(cpu_buffer, event); @@ -2754,8 +2790,15 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); if (unlikely(info->delta > (1ULL << 59))) { + /* + * Some timers can use more than 59 bits, and when a timestamp + * is added to the buffer, it will lose those bits. + */ + if (abs && (info->ts & TS_MSB)) { + info->delta &= ABS_TS_MASK; + /* did the clock go backwards */ - if (info->before == info->after && info->before > info->ts) { + } else if (info->before == info->after && info->before > info->ts) { /* not interrupted */ static int once; @@ -3304,7 +3347,7 @@ static void dump_buffer_page(struct buffer_data_page *bpage, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = delta; + ts = rb_fix_abs_ts(delta, ts); pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); break; @@ -3380,7 +3423,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = delta; + ts = rb_fix_abs_ts(delta, ts); break; case RINGBUF_TYPE_PADDING: @@ -4367,6 +4410,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp); cpu_buffer->read_stamp = delta; return; @@ -4397,6 +4441,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, iter->read_stamp); iter->read_stamp = delta; return; @@ -4650,6 +4695,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); + *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -4741,6 +4787,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_TIME_STAMP: if (ts) { *ts = rb_event_time_stamp(event); + *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -6011,10 +6058,10 @@ static __init int test_ringbuffer(void) pr_info(" total events: %ld\n", total_lost + total_read); pr_info(" recorded len bytes: %ld\n", total_len); pr_info(" recorded size bytes: %ld\n", total_size); - if (total_lost) + if (total_lost) { pr_info(" With dropped events, record len and size may not match\n" " alloced and written from above\n"); - if (!total_lost) { + } else { if (RB_WARN_ON(buffer, total_len != total_alloc || total_size != total_written)) break; diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig new file mode 100644 index 000000000000..831779607e84 --- /dev/null +++ b/kernel/trace/rv/Kconfig @@ -0,0 +1,78 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config DA_MON_EVENTS + bool + +config DA_MON_EVENTS_IMPLICIT + select DA_MON_EVENTS + bool + +config DA_MON_EVENTS_ID + select DA_MON_EVENTS + bool + +menuconfig RV + bool "Runtime Verification" + depends on TRACING + help + Enable the kernel runtime verification infrastructure. RV is a + lightweight (yet rigorous) method that complements classical + exhaustive verification techniques (such as model checking and + theorem proving). RV works by analyzing the trace of the system's + actual execution, comparing it against a formal specification of + the system behavior. + + For further information, see: + Documentation/trace/rv/runtime-verification.rst + +config RV_MON_WIP + depends on RV + depends on PREEMPT_TRACER + select DA_MON_EVENTS_IMPLICIT + bool "wip monitor" + help + Enable wip (wakeup in preemptive) sample monitor that illustrates + the usage of per-cpu monitors, and one limitation of the + preempt_disable/enable events. + + For further information, see: + Documentation/trace/rv/monitor_wip.rst + +config RV_MON_WWNR + depends on RV + select DA_MON_EVENTS_ID + bool "wwnr monitor" + help + Enable wwnr (wakeup while not running) sample monitor, this is a + sample monitor that illustrates the usage of per-task monitor. + The model is borken on purpose: it serves to test reactors. + + For further information, see: + Documentation/trace/rv/monitor_wwnr.rst + +config RV_REACTORS + bool "Runtime verification reactors" + default y + depends on RV + help + Enables the online runtime verification reactors. A runtime + monitor can cause a reaction to the detection of an exception + on the model's execution. By default, the monitors have + tracing reactions, printing the monitor output via tracepoints, + but other reactions can be added (on-demand) via this interface. + +config RV_REACT_PRINTK + bool "Printk reactor" + depends on RV_REACTORS + default y + help + Enables the printk reactor. The printk reactor emits a printk() + message if an exception is found. + +config RV_REACT_PANIC + bool "Panic reactor" + depends on RV_REACTORS + default y + help + Enables the panic reactor. The panic reactor emits a printk() + message if an exception is found and panic()s the system. diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile new file mode 100644 index 000000000000..963d14875b45 --- /dev/null +++ b/kernel/trace/rv/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_RV) += rv.o +obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o +obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +obj-$(CONFIG_RV_REACTORS) += rv_reactors.o +obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o +obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c new file mode 100644 index 000000000000..83cace53b9fa --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "wip" + +#include <trace/events/rv.h> +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> + +#include "wip.h" + +struct rv_monitor rv_wip; +DECLARE_DA_MON_PER_CPU(wip, unsigned char); + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_wip(preempt_disable_wip); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_wip(preempt_enable_wip); +} + +static void handle_sched_waking(void *data, struct task_struct *task) +{ + da_handle_event_wip(sched_waking_wip); +} + +static int enable_wip(void) +{ + int retval; + + retval = da_monitor_init_wip(); + if (retval) + return retval; + + rv_attach_trace_probe("wip", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("wip", sched_waking, handle_sched_waking); + rv_attach_trace_probe("wip", preempt_disable, handle_preempt_disable); + + return 0; +} + +static void disable_wip(void) +{ + rv_wip.enabled = 0; + + rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("wip", sched_waking, handle_sched_waking); + + da_monitor_destroy_wip(); +} + +struct rv_monitor rv_wip = { + .name = "wip", + .description = "wakeup in preemptive per-cpu testing monitor.", + .enable = enable_wip, + .disable = disable_wip, + .reset = da_monitor_reset_all_wip, + .enabled = 0, +}; + +static int register_wip(void) +{ + rv_register_monitor(&rv_wip); + return 0; +} + +static void unregister_wip(void) +{ + rv_unregister_monitor(&rv_wip); +} + +module_init(register_wip); +module_exit(unregister_wip); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>"); +MODULE_DESCRIPTION("wip: wakeup in preemptive - per-cpu sample monitor."); diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h new file mode 100644 index 000000000000..dacc37b62a2c --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -0,0 +1,46 @@ +/* + * Automatically generated C representation of wip automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_wip { + preemptive_wip = 0, + non_preemptive_wip, + state_max_wip +}; + +#define INVALID_STATE state_max_wip + +enum events_wip { + preempt_disable_wip = 0, + preempt_enable_wip, + sched_waking_wip, + event_max_wip +}; + +struct automaton_wip { + char *state_names[state_max_wip]; + char *event_names[event_max_wip]; + unsigned char function[state_max_wip][event_max_wip]; + unsigned char initial_state; + bool final_states[state_max_wip]; +}; + +static struct automaton_wip automaton_wip = { + .state_names = { + "preemptive", + "non_preemptive" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "sched_waking" + }, + .function = { + { non_preemptive_wip, INVALID_STATE, INVALID_STATE }, + { INVALID_STATE, preemptive_wip, non_preemptive_wip }, + }, + .initial_state = preemptive_wip, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c new file mode 100644 index 000000000000..599225d9cf38 --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "wwnr" + +#include <trace/events/rv.h> +#include <trace/events/sched.h> + +#include "wwnr.h" + +struct rv_monitor rv_wwnr; +DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); + +static void handle_switch(void *data, bool preempt, struct task_struct *p, + struct task_struct *n, unsigned int prev_state) +{ + /* start monitoring only after the first suspension */ + if (prev_state == TASK_INTERRUPTIBLE) + da_handle_start_event_wwnr(p, switch_out_wwnr); + else + da_handle_event_wwnr(p, switch_out_wwnr); + + da_handle_event_wwnr(n, switch_in_wwnr); +} + +static void handle_wakeup(void *data, struct task_struct *p) +{ + da_handle_event_wwnr(p, wakeup_wwnr); +} + +static int enable_wwnr(void) +{ + int retval; + + retval = da_monitor_init_wwnr(); + if (retval) + return retval; + + rv_attach_trace_probe("wwnr", sched_switch, handle_switch); + rv_attach_trace_probe("wwnr", sched_wakeup, handle_wakeup); + + return 0; +} + +static void disable_wwnr(void) +{ + rv_wwnr.enabled = 0; + + rv_detach_trace_probe("wwnr", sched_switch, handle_switch); + rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup); + + da_monitor_destroy_wwnr(); +} + +struct rv_monitor rv_wwnr = { + .name = "wwnr", + .description = "wakeup while not running per-task testing model.", + .enable = enable_wwnr, + .disable = disable_wwnr, + .reset = da_monitor_reset_all_wwnr, + .enabled = 0, +}; + +static int register_wwnr(void) +{ + rv_register_monitor(&rv_wwnr); + return 0; +} + +static void unregister_wwnr(void) +{ + rv_unregister_monitor(&rv_wwnr); +} + +module_init(register_wwnr); +module_exit(unregister_wwnr); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>"); +MODULE_DESCRIPTION("wwnr: wakeup while not running monitor"); diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h new file mode 100644 index 000000000000..118e576b91b4 --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -0,0 +1,46 @@ +/* + * Automatically generated C representation of wwnr automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_wwnr { + not_running_wwnr = 0, + running_wwnr, + state_max_wwnr +}; + +#define INVALID_STATE state_max_wwnr + +enum events_wwnr { + switch_in_wwnr = 0, + switch_out_wwnr, + wakeup_wwnr, + event_max_wwnr +}; + +struct automaton_wwnr { + char *state_names[state_max_wwnr]; + char *event_names[event_max_wwnr]; + unsigned char function[state_max_wwnr][event_max_wwnr]; + unsigned char initial_state; + bool final_states[state_max_wwnr]; +}; + +static struct automaton_wwnr automaton_wwnr = { + .state_names = { + "not_running", + "running" + }, + .event_names = { + "switch_in", + "switch_out", + "wakeup" + }, + .function = { + { running_wwnr, INVALID_STATE, not_running_wwnr }, + { INVALID_STATE, not_running_wwnr, INVALID_STATE }, + }, + .initial_state = not_running_wwnr, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c new file mode 100644 index 000000000000..d65f6c25a87c --- /dev/null +++ b/kernel/trace/rv/reactor_panic.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * Panic RV reactor: + * Prints the exception msg to the kernel message log and panic(). + */ + +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +static void rv_panic_reaction(char *msg) +{ + panic(msg); +} + +static struct rv_reactor rv_panic = { + .name = "panic", + .description = "panic the system if an exception is found.", + .react = rv_panic_reaction +}; + +static int __init register_react_panic(void) +{ + rv_register_reactor(&rv_panic); + return 0; +} + +static void __exit unregister_react_panic(void) +{ + rv_unregister_reactor(&rv_panic); +} + +module_init(register_react_panic); +module_exit(unregister_react_panic); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira"); +MODULE_DESCRIPTION("panic rv reactor: panic if an exception is found."); diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c new file mode 100644 index 000000000000..4b6b7106a477 --- /dev/null +++ b/kernel/trace/rv/reactor_printk.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * Printk RV reactor: + * Prints the exception msg to the kernel message log. + */ +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +static void rv_printk_reaction(char *msg) +{ + printk_deferred(msg); +} + +static struct rv_reactor rv_printk = { + .name = "printk", + .description = "prints the exception msg to the kernel message log.", + .react = rv_printk_reaction +}; + +static int __init register_react_printk(void) +{ + rv_register_reactor(&rv_printk); + return 0; +} + +static void __exit unregister_react_printk(void) +{ + rv_unregister_reactor(&rv_printk); +} + +module_init(register_react_printk); +module_exit(unregister_react_printk); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira"); +MODULE_DESCRIPTION("printk rv reactor: printk if an exception is hit."); diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c new file mode 100644 index 000000000000..6c97cc2d754a --- /dev/null +++ b/kernel/trace/rv/rv.c @@ -0,0 +1,799 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * This is the online Runtime Verification (RV) interface. + * + * RV is a lightweight (yet rigorous) method that complements classical + * exhaustive verification techniques (such as model checking and + * theorem proving) with a more practical approach to complex systems. + * + * RV works by analyzing the trace of the system's actual execution, + * comparing it against a formal specification of the system behavior. + * RV can give precise information on the runtime behavior of the + * monitored system while enabling the reaction for unexpected + * events, avoiding, for example, the propagation of a failure on + * safety-critical systems. + * + * The development of this interface roots in the development of the + * paper: + * + * De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo + * Silva. Efficient formal verification for the Linux kernel. In: + * International Conference on Software Engineering and Formal Methods. + * Springer, Cham, 2019. p. 315-332. + * + * And: + * + * De Oliveira, Daniel Bristot, et al. Automata-based formal analysis + * and verification of the real-time Linux kernel. PhD Thesis, 2020. + * + * == Runtime monitor interface == + * + * A monitor is the central part of the runtime verification of a system. + * + * The monitor stands in between the formal specification of the desired + * (or undesired) behavior, and the trace of the actual system. + * + * In Linux terms, the runtime verification monitors are encapsulated + * inside the "RV monitor" abstraction. A RV monitor includes a reference + * model of the system, a set of instances of the monitor (per-cpu monitor, + * per-task monitor, and so on), and the helper functions that glue the + * monitor to the system via trace. Generally, a monitor includes some form + * of trace output as a reaction for event parsing and exceptions, + * as depicted bellow: + * + * Linux +----- RV Monitor ----------------------------------+ Formal + * Realm | | Realm + * +-------------------+ +----------------+ +-----------------+ + * | Linux kernel | | Monitor | | Reference | + * | Tracing | -> | Instance(s) | <- | Model | + * | (instrumentation) | | (verification) | | (specification) | + * +-------------------+ +----------------+ +-----------------+ + * | | | + * | V | + * | +----------+ | + * | | Reaction | | + * | +--+--+--+-+ | + * | | | | | + * | | | +-> trace output ? | + * +------------------------|--|----------------------+ + * | +----> panic ? + * +-------> <user-specified> + * + * This file implements the interface for loading RV monitors, and + * to control the verification session. + * + * == Registering monitors == + * + * The struct rv_monitor defines a set of callback functions to control + * a verification session. For instance, when a given monitor is enabled, + * the "enable" callback function is called to hook the instrumentation + * functions to the kernel trace events. The "disable" function is called + * when disabling the verification session. + * + * A RV monitor is registered via: + * int rv_register_monitor(struct rv_monitor *monitor); + * And unregistered via: + * int rv_unregister_monitor(struct rv_monitor *monitor); + * + * == User interface == + * + * The user interface resembles kernel tracing interface. It presents + * these files: + * + * "available_monitors" + * - List the available monitors, one per line. + * + * For example: + * # cat available_monitors + * wip + * wwnr + * + * "enabled_monitors" + * - Lists the enabled monitors, one per line; + * - Writing to it enables a given monitor; + * - Writing a monitor name with a '!' prefix disables it; + * - Truncating the file disables all enabled monitors. + * + * For example: + * # cat enabled_monitors + * # echo wip > enabled_monitors + * # echo wwnr >> enabled_monitors + * # cat enabled_monitors + * wip + * wwnr + * # echo '!wip' >> enabled_monitors + * # cat enabled_monitors + * wwnr + * # echo > enabled_monitors + * # cat enabled_monitors + * # + * + * Note that more than one monitor can be enabled concurrently. + * + * "monitoring_on" + * - It is an on/off general switcher for monitoring. Note + * that it does not disable enabled monitors or detach events, + * but stops the per-entity monitors from monitoring the events + * received from the instrumentation. It resembles the "tracing_on" + * switcher. + * + * "monitors/" + * Each monitor will have its own directory inside "monitors/". There + * the monitor specific files will be presented. + * The "monitors/" directory resembles the "events" directory on + * tracefs. + * + * For example: + * # cd monitors/wip/ + * # ls + * desc enable + * # cat desc + * auto-generated wakeup in preemptive monitor. + * # cat enable + * 0 + * + * For further information, see: + * Documentation/trace/rv/runtime-verification.rst + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> + +#ifdef CONFIG_DA_MON_EVENTS +#define CREATE_TRACE_POINTS +#include <trace/events/rv.h> +#endif + +#include "rv.h" + +DEFINE_MUTEX(rv_interface_lock); + +static struct rv_interface rv_root; + +struct dentry *get_monitors_root(void) +{ + return rv_root.monitors_dir; +} + +/* + * Interface for the monitor register. + */ +static LIST_HEAD(rv_monitors_list); + +static int task_monitor_count; +static bool task_monitor_slots[RV_PER_TASK_MONITORS]; + +int rv_get_task_monitor_slot(void) +{ + int i; + + lockdep_assert_held(&rv_interface_lock); + + if (task_monitor_count == RV_PER_TASK_MONITORS) + return -EBUSY; + + task_monitor_count++; + + for (i = 0; i < RV_PER_TASK_MONITORS; i++) { + if (task_monitor_slots[i] == false) { + task_monitor_slots[i] = true; + return i; + } + } + + WARN_ONCE(1, "RV task_monitor_count and slots are out of sync\n"); + + return -EINVAL; +} + +void rv_put_task_monitor_slot(int slot) +{ + lockdep_assert_held(&rv_interface_lock); + + if (slot < 0 || slot >= RV_PER_TASK_MONITORS) { + WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot); + return; + } + + WARN_ONCE(!task_monitor_slots[slot], "RV releasing unused task_monitor_slots: %d\n", + slot); + + task_monitor_count--; + task_monitor_slots[slot] = false; +} + +/* + * This section collects the monitor/ files and folders. + */ +static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, + loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + const char *buff; + + buff = mdef->monitor->enabled ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); +} + +/* + * __rv_disable_monitor - disabled an enabled monitor + */ +static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) +{ + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) { + mdef->monitor->enabled = 0; + mdef->monitor->disable(); + + /* + * Wait for the execution of all events to finish. + * Otherwise, the data used by the monitor could + * be inconsistent. i.e., if the monitor is re-enabled. + */ + if (sync) + tracepoint_synchronize_unregister(); + return 1; + } + return 0; +} + +/** + * rv_disable_monitor - disable a given runtime monitor + * + * Returns 0 on success. + */ +int rv_disable_monitor(struct rv_monitor_def *mdef) +{ + __rv_disable_monitor(mdef, true); + return 0; +} + +/** + * rv_enable_monitor - enable a given runtime monitor + * + * Returns 0 on success, error otherwise. + */ +int rv_enable_monitor(struct rv_monitor_def *mdef) +{ + int retval; + + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) + return 0; + + retval = mdef->monitor->enable(); + + if (!retval) + mdef->monitor->enabled = 1; + + return retval; +} + +/* + * interface for enabling/disabling a monitor. + */ +static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + retval = count; + + mutex_lock(&rv_interface_lock); + + if (val) + retval = rv_enable_monitor(mdef); + else + retval = rv_disable_monitor(mdef); + + mutex_unlock(&rv_interface_lock); + + return retval ? : count; +} + +static const struct file_operations interface_enable_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = monitor_enable_write_data, + .read = monitor_enable_read_data, +}; + +/* + * Interface to read monitors description. + */ +static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count, + loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + char buff[256]; + + memset(buff, 0, sizeof(buff)); + + snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description); + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); +} + +static const struct file_operations interface_desc_fops = { + .open = simple_open, + .llseek = no_llseek, + .read = monitor_desc_read_data, +}; + +/* + * During the registration of a monitor, this function creates + * the monitor dir, where the specific options of the monitor + * are exposed. + */ +static int create_monitor_dir(struct rv_monitor_def *mdef) +{ + struct dentry *root = get_monitors_root(); + const char *name = mdef->monitor->name; + struct dentry *tmp; + int retval; + + mdef->root_d = rv_create_dir(name, root); + if (!mdef->root_d) + return -ENOMEM; + + tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops); + if (!tmp) { + retval = -ENOMEM; + goto out_remove_root; + } + + tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops); + if (!tmp) { + retval = -ENOMEM; + goto out_remove_root; + } + + retval = reactor_populate_monitor(mdef); + if (retval) + goto out_remove_root; + + return 0; + +out_remove_root: + rv_remove(mdef->root_d); + return retval; +} + +/* + * Available/Enable monitor shared seq functions. + */ +static int monitors_show(struct seq_file *m, void *p) +{ + struct rv_monitor_def *mon_def = p; + + seq_printf(m, "%s\n", mon_def->monitor->name); + return 0; +} + +/* + * Used by the seq file operations at the end of a read + * operation. + */ +static void monitors_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&rv_interface_lock); +} + +/* + * Available monitor seq functions. + */ +static void *available_monitors_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&rv_interface_lock); + return seq_list_start(&rv_monitors_list, *pos); +} + +static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &rv_monitors_list, pos); +} + +/* + * Enable monitor seq functions. + */ +static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct rv_monitor_def *m_def = p; + + (*pos)++; + + list_for_each_entry_continue(m_def, &rv_monitors_list, list) { + if (m_def->monitor->enabled) + return m_def; + } + + return NULL; +} + +static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) +{ + struct rv_monitor_def *m_def; + loff_t l; + + mutex_lock(&rv_interface_lock); + + if (list_empty(&rv_monitors_list)) + return NULL; + + m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list); + + for (l = 0; l <= *pos; ) { + m_def = enabled_monitors_next(m, m_def, &l); + if (!m_def) + break; + } + + return m_def; +} + +/* + * available/enabled monitors seq definition. + */ +static const struct seq_operations available_monitors_seq_ops = { + .start = available_monitors_start, + .next = available_monitors_next, + .stop = monitors_stop, + .show = monitors_show +}; + +static const struct seq_operations enabled_monitors_seq_ops = { + .start = enabled_monitors_start, + .next = enabled_monitors_next, + .stop = monitors_stop, + .show = monitors_show +}; + +/* + * available_monitors interface. + */ +static int available_monitors_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &available_monitors_seq_ops); +}; + +static const struct file_operations available_monitors_ops = { + .open = available_monitors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * enabled_monitors interface. + */ +static void disable_all_monitors(void) +{ + struct rv_monitor_def *mdef; + int enabled = 0; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry(mdef, &rv_monitors_list, list) + enabled += __rv_disable_monitor(mdef, false); + + if (enabled) { + /* + * Wait for the execution of all events to finish. + * Otherwise, the data used by the monitor could + * be inconsistent. i.e., if the monitor is re-enabled. + */ + tracepoint_synchronize_unregister(); + } + + mutex_unlock(&rv_interface_lock); +} + +static int enabled_monitors_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + disable_all_monitors(); + + return seq_open(file, &enabled_monitors_seq_ops); +}; + +static ssize_t enabled_monitors_write(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buff[MAX_RV_MONITOR_NAME_SIZE + 2]; + struct rv_monitor_def *mdef; + int retval = -EINVAL; + bool enable = true; + char *ptr = buff; + int len; + + if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) + return -EINVAL; + + memset(buff, 0, sizeof(buff)); + + retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count); + if (retval < 0) + return -EFAULT; + + ptr = strim(buff); + + if (ptr[0] == '!') { + enable = false; + ptr++; + } + + len = strlen(ptr); + if (!len) + return count; + + mutex_lock(&rv_interface_lock); + + retval = -EINVAL; + + list_for_each_entry(mdef, &rv_monitors_list, list) { + if (strcmp(ptr, mdef->monitor->name) != 0) + continue; + + /* + * Monitor found! + */ + if (enable) + retval = rv_enable_monitor(mdef); + else + retval = rv_disable_monitor(mdef); + + if (!retval) + retval = count; + + break; + } + + mutex_unlock(&rv_interface_lock); + return retval; +} + +static const struct file_operations enabled_monitors_ops = { + .open = enabled_monitors_open, + .read = seq_read, + .write = enabled_monitors_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * Monitoring on global switcher! + */ +static bool __read_mostly monitoring_on; + +/** + * rv_monitoring_on - checks if monitoring is on + * + * Returns 1 if on, 0 otherwise. + */ +bool rv_monitoring_on(void) +{ + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_rmb(); + return READ_ONCE(monitoring_on); +} + +/* + * monitoring_on general switcher. + */ +static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf, + size_t count, loff_t *ppos) +{ + const char *buff; + + buff = rv_monitoring_on() ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); +} + +static void turn_monitoring_off(void) +{ + WRITE_ONCE(monitoring_on, false); + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_wmb(); +} + +static void reset_all_monitors(void) +{ + struct rv_monitor_def *mdef; + + list_for_each_entry(mdef, &rv_monitors_list, list) { + if (mdef->monitor->enabled) + mdef->monitor->reset(); + } +} + +static void turn_monitoring_on(void) +{ + WRITE_ONCE(monitoring_on, true); + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_wmb(); +} + +static void turn_monitoring_on_with_reset(void) +{ + lockdep_assert_held(&rv_interface_lock); + + if (rv_monitoring_on()) + return; + + /* + * Monitors might be out of sync with the system if events were not + * processed because of !rv_monitoring_on(). + * + * Reset all monitors, forcing a re-sync. + */ + reset_all_monitors(); + turn_monitoring_on(); +} + +static ssize_t monitoring_on_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + mutex_lock(&rv_interface_lock); + + if (val) + turn_monitoring_on_with_reset(); + else + turn_monitoring_off(); + + /* + * Wait for the execution of all events to finish + * before returning to user-space. + */ + tracepoint_synchronize_unregister(); + + mutex_unlock(&rv_interface_lock); + + return count; +} + +static const struct file_operations monitoring_on_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = monitoring_on_write_data, + .read = monitoring_on_read_data, +}; + +static void destroy_monitor_dir(struct rv_monitor_def *mdef) +{ + reactor_cleanup_monitor(mdef); + rv_remove(mdef->root_d); +} + +/** + * rv_register_monitor - register a rv monitor. + * @monitor: The rv_monitor to be registered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_register_monitor(struct rv_monitor *monitor) +{ + struct rv_monitor_def *r; + int retval = 0; + + if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { + pr_info("Monitor %s has a name longer than %d\n", monitor->name, + MAX_RV_MONITOR_NAME_SIZE); + return -1; + } + + mutex_lock(&rv_interface_lock); + + list_for_each_entry(r, &rv_monitors_list, list) { + if (strcmp(monitor->name, r->monitor->name) == 0) { + pr_info("Monitor %s is already registered\n", monitor->name); + retval = -1; + goto out_unlock; + } + } + + r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); + if (!r) { + retval = -ENOMEM; + goto out_unlock; + } + + r->monitor = monitor; + + retval = create_monitor_dir(r); + if (retval) { + kfree(r); + goto out_unlock; + } + + list_add_tail(&r->list, &rv_monitors_list); + +out_unlock: + mutex_unlock(&rv_interface_lock); + return retval; +} + +/** + * rv_unregister_monitor - unregister a rv monitor. + * @monitor: The rv_monitor to be unregistered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_unregister_monitor(struct rv_monitor *monitor) +{ + struct rv_monitor_def *ptr, *next; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) { + if (strcmp(monitor->name, ptr->monitor->name) == 0) { + rv_disable_monitor(ptr); + list_del(&ptr->list); + destroy_monitor_dir(ptr); + } + } + + mutex_unlock(&rv_interface_lock); + return 0; +} + +int __init rv_init_interface(void) +{ + struct dentry *tmp; + int retval; + + rv_root.root_dir = rv_create_dir("rv", NULL); + if (!rv_root.root_dir) + goto out_err; + + rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir); + if (!rv_root.monitors_dir) + goto out_err; + + tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL, + &available_monitors_ops); + if (!tmp) + goto out_err; + + tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL, + &enabled_monitors_ops); + if (!tmp) + goto out_err; + + tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL, + &monitoring_on_fops); + if (!tmp) + goto out_err; + retval = init_rv_reactors(rv_root.root_dir); + if (retval) + goto out_err; + + turn_monitoring_on(); + + return 0; + +out_err: + rv_remove(rv_root.root_dir); + printk(KERN_ERR "RV: Error while creating the RV interface\n"); + return 1; +} diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h new file mode 100644 index 000000000000..db6cb0913dbd --- /dev/null +++ b/kernel/trace/rv/rv.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/mutex.h> + +struct rv_interface { + struct dentry *root_dir; + struct dentry *monitors_dir; +}; + +#include "../trace.h" +#include <linux/tracefs.h> +#include <linux/rv.h> + +#define RV_MODE_WRITE TRACE_MODE_WRITE +#define RV_MODE_READ TRACE_MODE_READ + +#define rv_create_dir tracefs_create_dir +#define rv_create_file tracefs_create_file +#define rv_remove tracefs_remove + +#define MAX_RV_MONITOR_NAME_SIZE 32 +#define MAX_RV_REACTOR_NAME_SIZE 32 + +extern struct mutex rv_interface_lock; + +#ifdef CONFIG_RV_REACTORS +struct rv_reactor_def { + struct list_head list; + struct rv_reactor *reactor; + /* protected by the monitor interface lock */ + int counter; +}; +#endif + +struct rv_monitor_def { + struct list_head list; + struct rv_monitor *monitor; + struct dentry *root_d; +#ifdef CONFIG_RV_REACTORS + struct rv_reactor_def *rdef; + bool reacting; +#endif + bool task_monitor; +}; + +struct dentry *get_monitors_root(void); +int rv_disable_monitor(struct rv_monitor_def *mdef); +int rv_enable_monitor(struct rv_monitor_def *mdef); + +#ifdef CONFIG_RV_REACTORS +int reactor_populate_monitor(struct rv_monitor_def *mdef); +void reactor_cleanup_monitor(struct rv_monitor_def *mdef); +int init_rv_reactors(struct dentry *root_dir); +#else +static inline int reactor_populate_monitor(struct rv_monitor_def *mdef) +{ + return 0; +} + +static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +{ + return; +} + +static inline int init_rv_reactors(struct dentry *root_dir) +{ + return 0; +} +#endif diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c new file mode 100644 index 000000000000..6aae106695b6 --- /dev/null +++ b/kernel/trace/rv/rv_reactors.c @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * Runtime reactor interface. + * + * A runtime monitor can cause a reaction to the detection of an + * exception on the model's execution. By default, the monitors have + * tracing reactions, printing the monitor output via tracepoints. + * But other reactions can be added (on-demand) via this interface. + * + * == Registering reactors == + * + * The struct rv_reactor defines a callback function to be executed + * in case of a model exception happens. The callback function + * receives a message to be (optionally) printed before executing + * the reaction. + * + * A RV reactor is registered via: + * int rv_register_reactor(struct rv_reactor *reactor) + * And unregistered via: + * int rv_unregister_reactor(struct rv_reactor *reactor) + * + * These functions are exported to modules, enabling reactors to be + * dynamically loaded. + * + * == User interface == + * + * The user interface resembles the kernel tracing interface and + * presents these files: + * + * "available_reactors" + * - List the available reactors, one per line. + * + * For example: + * # cat available_reactors + * nop + * panic + * printk + * + * "reacting_on" + * - It is an on/off general switch for reactors, disabling + * all reactions. + * + * "monitors/MONITOR/reactors" + * - List available reactors, with the select reaction for the given + * MONITOR inside []. The default one is the nop (no operation) + * reactor. + * - Writing the name of an reactor enables it to the given + * MONITOR. + * + * For example: + * # cat monitors/wip/reactors + * [nop] + * panic + * printk + * # echo panic > monitors/wip/reactors + * # cat monitors/wip/reactors + * nop + * [panic] + * printk + */ + +#include <linux/slab.h> + +#include "rv.h" + +/* + * Interface for the reactor register. + */ +static LIST_HEAD(rv_reactors_list); + +static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) +{ + struct rv_reactor_def *r; + + list_for_each_entry(r, &rv_reactors_list, list) { + if (strcmp(name, r->reactor->name) == 0) + return r; + } + return NULL; +} + +/* + * Available reactors seq functions. + */ +static int reactors_show(struct seq_file *m, void *p) +{ + struct rv_reactor_def *rea_def = p; + + seq_printf(m, "%s\n", rea_def->reactor->name); + return 0; +} + +static void reactors_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&rv_interface_lock); +} + +static void *reactors_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&rv_interface_lock); + return seq_list_start(&rv_reactors_list, *pos); +} + +static void *reactors_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &rv_reactors_list, pos); +} + +/* + * available_reactors seq definition. + */ +static const struct seq_operations available_reactors_seq_ops = { + .start = reactors_start, + .next = reactors_next, + .stop = reactors_stop, + .show = reactors_show +}; + +/* + * available_reactors interface. + */ +static int available_reactors_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &available_reactors_seq_ops); +}; + +static const struct file_operations available_reactors_ops = { + .open = available_reactors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * Monitor's reactor file. + */ +static int monitor_reactor_show(struct seq_file *m, void *p) +{ + struct rv_monitor_def *mdef = m->private; + struct rv_reactor_def *rdef = p; + + if (mdef->rdef == rdef) + seq_printf(m, "[%s]\n", rdef->reactor->name); + else + seq_printf(m, "%s\n", rdef->reactor->name); + return 0; +} + +/* + * available_reactors seq definition. + */ +static const struct seq_operations monitor_reactors_seq_ops = { + .start = reactors_start, + .next = reactors_next, + .stop = reactors_stop, + .show = monitor_reactor_show +}; + +static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef, + bool reacting) +{ + bool monitor_enabled; + + /* nothing to do */ + if (mdef->rdef == rdef) + return; + + monitor_enabled = mdef->monitor->enabled; + if (monitor_enabled) + rv_disable_monitor(mdef); + + /* swap reactor's usage */ + mdef->rdef->counter--; + rdef->counter++; + + mdef->rdef = rdef; + mdef->reacting = reacting; + mdef->monitor->react = rdef->reactor->react; + + if (monitor_enabled) + rv_enable_monitor(mdef); +} + +static ssize_t +monitor_reactors_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buff[MAX_RV_REACTOR_NAME_SIZE + 2]; + struct rv_monitor_def *mdef; + struct rv_reactor_def *rdef; + struct seq_file *seq_f; + int retval = -EINVAL; + bool enable; + char *ptr; + int len; + + if (count < 1 || count > MAX_RV_REACTOR_NAME_SIZE + 1) + return -EINVAL; + + memset(buff, 0, sizeof(buff)); + + retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count); + if (retval < 0) + return -EFAULT; + + ptr = strim(buff); + + len = strlen(ptr); + if (!len) + return count; + + /* + * See monitor_reactors_open() + */ + seq_f = file->private_data; + mdef = seq_f->private; + + mutex_lock(&rv_interface_lock); + + retval = -EINVAL; + + list_for_each_entry(rdef, &rv_reactors_list, list) { + if (strcmp(ptr, rdef->reactor->name) != 0) + continue; + + if (rdef == get_reactor_rdef_by_name("nop")) + enable = false; + else + enable = true; + + monitor_swap_reactors(mdef, rdef, enable); + + retval = count; + break; + } + + mutex_unlock(&rv_interface_lock); + + return retval; +} + +/* + * available_reactors interface. + */ +static int monitor_reactors_open(struct inode *inode, struct file *file) +{ + struct rv_monitor_def *mdef = inode->i_private; + struct seq_file *seq_f; + int ret; + + ret = seq_open(file, &monitor_reactors_seq_ops); + if (ret < 0) + return ret; + + /* + * seq_open stores the seq_file on the file->private data. + */ + seq_f = file->private_data; + + /* + * Copy the create file "private" data to the seq_file private data. + */ + seq_f->private = mdef; + + return 0; +}; + +static const struct file_operations monitor_reactors_ops = { + .open = monitor_reactors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = monitor_reactors_write +}; + +static int __rv_register_reactor(struct rv_reactor *reactor) +{ + struct rv_reactor_def *r; + + list_for_each_entry(r, &rv_reactors_list, list) { + if (strcmp(reactor->name, r->reactor->name) == 0) { + pr_info("Reactor %s is already registered\n", reactor->name); + return -EINVAL; + } + } + + r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->reactor = reactor; + r->counter = 0; + + list_add_tail(&r->list, &rv_reactors_list); + + return 0; +} + +/** + * rv_register_reactor - register a rv reactor. + * @reactor: The rv_reactor to be registered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_register_reactor(struct rv_reactor *reactor) +{ + int retval = 0; + + if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) { + pr_info("Reactor %s has a name longer than %d\n", + reactor->name, MAX_RV_MONITOR_NAME_SIZE); + return -EINVAL; + } + + mutex_lock(&rv_interface_lock); + retval = __rv_register_reactor(reactor); + mutex_unlock(&rv_interface_lock); + return retval; +} + +/** + * rv_unregister_reactor - unregister a rv reactor. + * @reactor: The rv_reactor to be unregistered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_unregister_reactor(struct rv_reactor *reactor) +{ + struct rv_reactor_def *ptr, *next; + int ret = 0; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) { + if (strcmp(reactor->name, ptr->reactor->name) == 0) { + + if (!ptr->counter) { + list_del(&ptr->list); + } else { + printk(KERN_WARNING + "rv: the rv_reactor %s is in use by %d monitor(s)\n", + ptr->reactor->name, ptr->counter); + printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", + ptr->reactor->name); + ret = -EBUSY; + break; + } + } + } + + mutex_unlock(&rv_interface_lock); + return ret; +} + +/* + * reacting_on interface. + */ +static bool __read_mostly reacting_on; + +/** + * rv_reacting_on - checks if reacting is on + * + * Returns 1 if on, 0 otherwise. + */ +bool rv_reacting_on(void) +{ + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_rmb(); + return READ_ONCE(reacting_on); +} + +static ssize_t reacting_on_read_data(struct file *filp, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + char *buff; + + buff = rv_reacting_on() ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); +} + +static void turn_reacting_off(void) +{ + WRITE_ONCE(reacting_on, false); + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_wmb(); +} + +static void turn_reacting_on(void) +{ + WRITE_ONCE(reacting_on, true); + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_wmb(); +} + +static ssize_t reacting_on_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + mutex_lock(&rv_interface_lock); + + if (val) + turn_reacting_on(); + else + turn_reacting_off(); + + /* + * Wait for the execution of all events to finish + * before returning to user-space. + */ + tracepoint_synchronize_unregister(); + + mutex_unlock(&rv_interface_lock); + + return count; +} + +static const struct file_operations reacting_on_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = reacting_on_write_data, + .read = reacting_on_read_data, +}; + +/** + * reactor_populate_monitor - creates per monitor reactors file + * @mdef: monitor's definition. + * + * Returns 0 if successful, error otherwise. + */ +int reactor_populate_monitor(struct rv_monitor_def *mdef) +{ + struct dentry *tmp; + + tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops); + if (!tmp) + return -ENOMEM; + + /* + * Configure as the rv_nop reactor. + */ + mdef->rdef = get_reactor_rdef_by_name("nop"); + mdef->rdef->counter++; + mdef->reacting = false; + + return 0; +} + +/** + * reactor_cleanup_monitor - cleanup a monitor reference + * @mdef: monitor's definition. + */ +void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +{ + lockdep_assert_held(&rv_interface_lock); + mdef->rdef->counter--; + WARN_ON_ONCE(mdef->rdef->counter < 0); +} + +/* + * Nop reactor register + */ +static void rv_nop_reaction(char *msg) +{ +} + +static struct rv_reactor rv_nop = { + .name = "nop", + .description = "no-operation reactor: do nothing.", + .react = rv_nop_reaction +}; + +int init_rv_reactors(struct dentry *root_dir) +{ + struct dentry *available, *reacting; + int retval; + + available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL, + &available_reactors_ops); + if (!available) + goto out_err; + + reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops); + if (!reacting) + goto rm_available; + + retval = __rv_register_reactor(&rv_nop); + if (retval) + goto rm_reacting; + + turn_reacting_on(); + + return 0; + +rm_reacting: + rv_remove(reacting); +rm_available: + rv_remove(available); +out_err: + return -ENOMEM; +} diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f4de111fa18f..d3005279165d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -721,13 +721,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, pos = 0; ret = trace_get_user(&parser, ubuf, cnt, &pos); - if (ret < 0 || !trace_parser_loaded(&parser)) + if (ret < 0) break; read += ret; ubuf += ret; cnt -= ret; + if (!trace_parser_loaded(&parser)) + break; + ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; @@ -753,7 +756,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, if (!nr_pids) { /* Cleared the list of pids */ trace_pid_list_free(pid_list); - read = ret; pid_list = NULL; } @@ -1174,7 +1176,7 @@ void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) EXPORT_SYMBOL_GPL(tracing_snapshot_cond); /** - * tracing_snapshot_cond_data - get the user data associated with a snapshot + * tracing_cond_snapshot_data - get the user data associated with a snapshot * @tr: The tracing instance * * When the user enables a conditional snapshot using @@ -1542,6 +1544,7 @@ static struct { { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, { ktime_get_boot_fast_ns, "boot", 1 }, + { ktime_get_tai_fast_ns, "tai", 1 }, ARCH_TRACE_CLOCKS }; @@ -2835,7 +2838,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); -static DEFINE_SPINLOCK(tracepoint_iter_lock); +static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock); static DEFINE_MUTEX(tracepoint_printk_mutex); static void output_printk(struct trace_event_buffer *fbuffer) @@ -2863,14 +2866,14 @@ static void output_printk(struct trace_event_buffer *fbuffer) event = &fbuffer->trace_file->event_call->event; - spin_lock_irqsave(&tracepoint_iter_lock, flags); + raw_spin_lock_irqsave(&tracepoint_iter_lock, flags); trace_seq_init(&iter->seq); iter->ent = fbuffer->entry; event_call->event.funcs->trace(iter, 0, event); trace_seq_putc(&iter->seq, 0); printk("%s", iter->seq.buffer); - spin_unlock_irqrestore(&tracepoint_iter_lock, flags); + raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } int tracepoint_printk_sysctl(struct ctl_table *table, int write, @@ -3102,17 +3105,17 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, } /* - * When an NMI triggers, RCU is enabled via rcu_nmi_enter(), + * When an NMI triggers, RCU is enabled via ct_nmi_enter(), * but if the above rcu_is_watching() failed, then the NMI - * triggered someplace critical, and rcu_irq_enter() should + * triggered someplace critical, and ct_irq_enter() should * not be called from NMI. */ if (unlikely(in_nmi())) return; - rcu_irq_enter_irqson(); + ct_irq_enter_irqson(); __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); - rcu_irq_exit_irqson(); + ct_irq_exit_irqson(); } /** @@ -4249,7 +4252,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char *space = " "; + static const char space[] = " "; int prec = tgid ? 12 : 2; print_event_info(buf, m); @@ -4273,9 +4276,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct tracer *type = iter->trace; unsigned long entries; unsigned long total; - const char *name = "preemption"; - - name = type->name; + const char *name = type->name; get_total_entries(buf, &total, &entries); @@ -4289,17 +4290,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) entries, total, buf->cpu, -#if defined(CONFIG_PREEMPT_NONE) - "server", -#elif defined(CONFIG_PREEMPT_VOLUNTARY) - "desktop", -#elif defined(CONFIG_PREEMPT) - "preempt", -#elif defined(CONFIG_PREEMPT_RT) - "preempt_rt", -#else + preempt_model_none() ? "server" : + preempt_model_voluntary() ? "desktop" : + preempt_model_full() ? "preempt" : + preempt_model_rt() ? "preempt_rt" : "unknown", -#endif /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP @@ -5475,7 +5470,7 @@ static const char readme_msg[] = " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" - " trace_clock\t\t-change the clock used to order events\n" + " trace_clock\t\t- change the clock used to order events\n" " local: Per cpu clock but may not be synced across CPUs\n" " global: Synced across CPUs but slows tracing down.\n" " counter: Not a clock, but just an increment\n" @@ -5484,7 +5479,7 @@ static const char readme_msg[] = #ifdef CONFIG_X86_64 " x86-tsc: TSC cycle counter\n" #endif - "\n timestamp_mode\t-view the mode used to timestamp events\n" + "\n timestamp_mode\t- view the mode used to timestamp events\n" " delta: Delta difference against a buffer-wide timestamp\n" " absolute: Absolute (standalone) timestamp\n" "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" @@ -5574,13 +5569,13 @@ static const char readme_msg[] = #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" - "\t Format: p[:[<group>/]<event>] <place> [<args>]\n" - "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n" + "\t Format: p[:[<group>/][<event>]] <place> [<args>]\n" + "\t r[maxactive][:[<group>/][<event>]] <place> [<args>]\n" #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/]<event> <field> [<field>]\n" #endif - "\t e[:[<group>/]<event>] <attached-group>.<attached-event> [<args>]\n" - "\t -:[<group>/]<event>\n" + "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>]\n" + "\t -:[<group>/][<event>]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n" @@ -6332,12 +6327,18 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace = &nop_trace; } +static bool tracer_options_updated; + static void add_tracer_options(struct trace_array *tr, struct tracer *t) { /* Only enable if the directory has been created already. */ if (!tr->dir) return; + /* Only create trace option files after update_tracer_options finish */ + if (!tracer_options_updated) + return; + create_trace_option_files(tr, t); } @@ -6423,9 +6424,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) synchronize_rcu(); free_snapshot(tr); } -#endif -#ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) @@ -6454,7 +6453,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; - int i; + char *name; size_t ret; int err; @@ -6468,11 +6467,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - /* strip ending whitespace. */ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; + name = strim(buf); - err = tracing_set_tracer(tr, buf); + err = tracing_set_tracer(tr, name); if (err) return err; @@ -9104,6 +9101,16 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size return 0; } +static void free_trace_buffer(struct array_buffer *buf) +{ + if (buf->buffer) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + free_percpu(buf->data); + buf->data = NULL; + } +} + static int allocate_trace_buffers(struct trace_array *tr, int size) { int ret; @@ -9116,10 +9123,7 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? size : 1); if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { - ring_buffer_free(tr->array_buffer.buffer); - tr->array_buffer.buffer = NULL; - free_percpu(tr->array_buffer.data); - tr->array_buffer.data = NULL; + free_trace_buffer(&tr->array_buffer); return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; @@ -9134,16 +9138,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return 0; } -static void free_trace_buffer(struct array_buffer *buf) -{ - if (buf->buffer) { - ring_buffer_free(buf->buffer); - buf->buffer = NULL; - free_percpu(buf->data); - buf->data = NULL; - } -} - static void free_trace_buffers(struct trace_array *tr) { if (!tr) @@ -9176,6 +9170,7 @@ static void __update_tracer_options(struct trace_array *tr) static void update_tracer_options(struct trace_array *tr) { mutex_lock(&trace_types_lock); + tracer_options_updated = true; __update_tracer_options(tr); mutex_unlock(&trace_types_lock); } @@ -9608,6 +9603,7 @@ extern struct trace_eval_map *__stop_ftrace_eval_maps[]; static struct workqueue_struct *eval_map_wq __initdata; static struct work_struct eval_map_work __initdata; +static struct work_struct tracerfs_init_work __initdata; static void __init eval_map_work_func(struct work_struct *work) { @@ -9633,6 +9629,8 @@ static int __init trace_eval_init(void) return 0; } +subsys_initcall(trace_eval_init); + static int __init trace_eval_sync(void) { /* Make sure the eval map updates are finished */ @@ -9715,15 +9713,8 @@ static struct notifier_block trace_module_nb = { }; #endif /* CONFIG_MODULES */ -static __init int tracer_init_tracefs(void) +static __init void tracer_init_tracefs_work_func(struct work_struct *work) { - int ret; - - trace_access_lock_init(); - - ret = tracing_init_dentry(); - if (ret) - return 0; event_trace_init(); @@ -9745,8 +9736,6 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, NULL, &tracing_saved_tgids_fops); - trace_eval_init(); - trace_create_eval_file(NULL); #ifdef CONFIG_MODULES @@ -9761,6 +9750,26 @@ static __init int tracer_init_tracefs(void) create_trace_instances(NULL); update_tracer_options(&global_trace); +} + +static __init int tracer_init_tracefs(void) +{ + int ret; + + trace_access_lock_init(); + + ret = tracing_init_dentry(); + if (ret) + return 0; + + if (eval_map_wq) { + INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func); + queue_work(eval_map_wq, &tracerfs_init_work); + } else { + tracer_init_tracefs_work_func(NULL); + } + + rv_init_interface(); return 0; } @@ -9854,6 +9863,12 @@ void trace_init_global_iter(struct trace_iterator *iter) /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[iter->tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + + /* Can not use kmalloc for iter.temp and iter.fmt */ + iter->temp = static_temp_buf; + iter->temp_size = STATIC_TEMP_BUF_SIZE; + iter->fmt = static_fmt_buf; + iter->fmt_size = STATIC_FMT_BUF_SIZE; } void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) @@ -9886,11 +9901,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* Simulate the iterator */ trace_init_global_iter(&iter); - /* Can not use kmalloc for iter.temp and iter.fmt */ - iter.temp = static_temp_buf; - iter.temp_size = STATIC_TEMP_BUF_SIZE; - iter.fmt = static_fmt_buf; - iter.fmt_size = STATIC_FMT_BUF_SIZE; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 07d990270e2a..900e75d96c84 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1573,13 +1573,12 @@ struct enable_trigger_data { }; extern int event_enable_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, - struct event_trigger_data *data); -extern void event_enable_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data); +extern void event_enable_trigger_free(struct event_trigger_data *data); extern int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param); + char *glob, char *cmd, + char *param_and_filter); extern int event_enable_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file); @@ -1587,8 +1586,7 @@ extern void event_enable_unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); -extern int event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data); +extern int event_trigger_init(struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, int trigger_enable); extern void update_cond_flag(struct trace_event_file *file); @@ -1629,10 +1627,11 @@ extern void event_trigger_reset_filter(struct event_command *cmd_ops, extern int event_trigger_register(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, - char *cmd, - char *trigger, - struct event_trigger_data *trigger_data, - int *n_registered); + struct event_trigger_data *trigger_data); +extern void event_trigger_unregister(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data); /** * struct event_trigger_ops - callbacks for trace event triggers @@ -1686,12 +1685,9 @@ struct event_trigger_ops { struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe); - int (*init)(struct event_trigger_ops *ops, - struct event_trigger_data *data); - void (*free)(struct event_trigger_ops *ops, - struct event_trigger_data *data); + int (*init)(struct event_trigger_data *data); + void (*free)(struct event_trigger_data *data); int (*print)(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data); }; @@ -2009,4 +2005,13 @@ struct trace_min_max_param { extern const struct file_operations trace_min_max_fops; +#ifdef CONFIG_RV +extern int rv_init_interface(void); +#else +static inline int rv_init_interface(void) +{ + return 0; +} +#endif + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 0580287d7a0d..778200dd8ede 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -300,7 +300,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, { struct xbc_node *node; const char *p, *handler; - int ret; + int ret = 0; handler = xbc_node_get_data(hnode); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index e34e8182ee4b..154996684fb5 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -101,7 +101,7 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (event[0] == '\0') { + if (!system && event[0] == '\0') { ret = -EINVAL; goto out; } @@ -255,19 +255,14 @@ static const struct file_operations dynamic_events_ops = { /* Make a tracefs interface for controlling dynamic events */ static __init int init_dynamic_event(void) { - struct dentry *entry; int ret; ret = tracing_init_dentry(); if (ret) return 0; - entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, - NULL, &dynamic_events_ops); - - /* Event list interface */ - if (!entry) - pr_warn("Could not create tracefs 'dynamic_events' entry\n"); + trace_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, + NULL, &dynamic_events_ops); return 0; } diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 541aa13581b9..1783e3478912 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -125,6 +125,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, * We match the following: * event only - match all eprobes with event name * system and event only - match all system/event probes + * system only - match all system probes * * The below has the above satisfied with more arguments: * @@ -143,7 +144,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, return false; /* Must match the event name */ - if (strcmp(trace_probe_name(&ep->tp), event) != 0) + if (event[0] != '\0' && strcmp(trace_probe_name(&ep->tp), event) != 0) return false; /* No arguments match all */ @@ -226,6 +227,7 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) struct probe_arg *parg = &ep->tp.args[i]; struct ftrace_event_field *field; struct list_head *head; + int ret = -ENOENT; head = trace_get_fields(ep->event); list_for_each_entry(field, head, link) { @@ -235,9 +237,20 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) return 0; } } + + /* + * Argument not found on event. But allow for comm and COMM + * to be used to get the current->comm. + */ + if (strcmp(parg->code->data, "COMM") == 0 || + strcmp(parg->code->data, "comm") == 0) { + parg->code->op = FETCH_OP_COMM; + ret = 0; + } + kfree(parg->code->data); parg->code->data = NULL; - return -ENOENT; + return ret; } static int eprobe_event_define_fields(struct trace_event_call *event_call) @@ -310,6 +323,27 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) addr = rec + field->offset; + if (is_string_field(field)) { + switch (field->filter_type) { + case FILTER_DYN_STRING: + val = (unsigned long)(rec + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_RDYN_STRING: + val = (unsigned long)(addr + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_STATIC_STRING: + val = (unsigned long)addr; + break; + case FILTER_PTR_STRING: + val = (unsigned long)(*(char *)addr); + break; + default: + WARN_ON_ONCE(1); + return 0; + } + return val; + } + switch (field->size) { case 1: if (field->is_signed) @@ -341,16 +375,38 @@ static unsigned long get_event_field(struct fetch_insn *code, void *rec) static int get_eprobe_size(struct trace_probe *tp, void *rec) { + struct fetch_insn *code; struct probe_arg *arg; int i, len, ret = 0; for (i = 0; i < tp->nr_args; i++) { arg = tp->args + i; - if (unlikely(arg->dynamic)) { + if (arg->dynamic) { unsigned long val; - val = get_event_field(arg->code, rec); - len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL); + code = arg->code; + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + continue; + } + code++; + len = process_fetch_insn_bottom(code, val, NULL, NULL); if (len > 0) ret += len; } @@ -368,8 +424,28 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, { unsigned long val; - val = get_event_field(code, rec); - return process_fetch_insn_bottom(code + 1, val, dest, base); + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_OP_IMM: + val = code->immediate; + break; + case FETCH_OP_COMM: + val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + val = (unsigned long)code->data; + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + return -EILSEQ; + } + code++; + return process_fetch_insn_bottom(code, val, dest, base); } NOKPROBE_SYMBOL(process_fetch_insn) @@ -511,20 +587,17 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) * functions are just stubs to fulfill what is needed to use the trigger * infrastructure. */ -static int eprobe_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int eprobe_trigger_init(struct event_trigger_data *data) { return 0; } -static void eprobe_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void eprobe_trigger_free(struct event_trigger_data *data) { } static int eprobe_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { /* Do not print eprobe event triggers */ @@ -549,7 +622,8 @@ static struct event_trigger_ops eprobe_trigger_ops = { static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, + char *param_and_filter) { return -1; } @@ -650,7 +724,7 @@ static struct trace_event_functions eprobe_funcs = { static int disable_eprobe(struct trace_eprobe *ep, struct trace_array *tr) { - struct event_trigger_data *trigger; + struct event_trigger_data *trigger = NULL, *iter; struct trace_event_file *file; struct eprobe_data *edata; @@ -658,14 +732,16 @@ static int disable_eprobe(struct trace_eprobe *ep, if (!file) return -ENOENT; - list_for_each_entry(trigger, &file->triggers, list) { - if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE)) + list_for_each_entry(iter, &file->triggers, list) { + if (!(iter->flags & EVENT_TRIGGER_FL_PROBE)) continue; - edata = trigger->private_data; - if (edata->ep == ep) + edata = iter->private_data; + if (edata->ep == ep) { + trigger = iter; break; + } } - if (list_entry_is_head(trigger, &file->triggers, list)) + if (!trigger) return -ENODEV; list_del_rcu(&trigger->list); @@ -838,8 +914,15 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ if (ret) return ret; - if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) + if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) { ret = trace_eprobe_tp_arg_update(ep, i); + if (ret) + trace_probe_log_err(0, BAD_ATTACH_ARG); + } + + /* Handle symbols "@" */ + if (!ret) + ret = traceprobe_update_arg(&ep->tp.args[i]); return ret; } @@ -848,7 +931,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) { /* * Argument syntax: - * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS] + * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] * Fetch args: * <name>=$<field>[:TYPE] */ @@ -858,6 +941,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) struct trace_eprobe *ep = NULL; char buf1[MAX_EVENT_NAME_LEN]; char buf2[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; int ret = 0; int i; @@ -869,25 +953,25 @@ static int __trace_eprobe_create(int argc, const char *argv[]) event = strchr(&argv[0][1], ':'); if (event) { event++; - ret = traceprobe_parse_event_name(&event, &group, buf1, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto parse_error; - } else { - strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); - sanitize_event_name(buf1); - event = buf1; } - if (!is_good_name(event) || !is_good_name(group)) - goto parse_error; + trace_probe_log_set_index(1); sys_event = argv[1]; - ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, - sys_event - argv[1]); - if (ret || !sys_name) - goto parse_error; - if (!is_good_name(sys_event) || !is_good_name(sys_name)) + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); + if (ret || !sys_event || !sys_name) { + trace_probe_log_err(0, NO_EVENT_INFO); goto parse_error; + } + + if (!event) { + strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); + sanitize_event_name(buf1); + event = buf1; + } mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); @@ -896,6 +980,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (IS_ERR(ep)) { ret = PTR_ERR(ep); + if (ret == -ENODEV) + trace_probe_log_err(0, BAD_ATTACH_EVENT); /* This must return -ENOMEM or missing event, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV); ep = NULL; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a114549720d6..61e3a2620fa3 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -157,7 +157,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event) int i; if (--tp_event->perf_refcount > 0) - goto out; + return; tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); @@ -176,8 +176,6 @@ static void perf_trace_event_unreg(struct perf_event *p_event) perf_trace_buf[i] = NULL; } } -out: - trace_event_put_ref(tp_event); } static int perf_trace_event_open(struct perf_event *p_event) @@ -241,6 +239,7 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); } @@ -292,6 +291,7 @@ void perf_kprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); @@ -347,6 +347,7 @@ void perf_uprobe_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); destroy_local_trace_uprobe(p_event->tp_event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e11e167b7809..0356cae0cf74 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -176,6 +176,7 @@ static int trace_define_generic_fields(void) __generic_field(int, CPU, FILTER_CPU); __generic_field(int, cpu, FILTER_CPU); + __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); @@ -392,12 +393,6 @@ static void test_event_printk(struct trace_event_call *call) if (!(dereference_flags & (1ULL << arg))) goto next_arg; - /* Check for __get_sockaddr */; - if (str_has_prefix(fmt + i, "__get_sockaddr(")) { - dereference_flags &= ~(1ULL << arg); - goto next_arg; - } - /* Find the REC-> in the argument */ c = strchr(fmt + i, ','); r = strstr(fmt + i, "REC->"); @@ -413,7 +408,14 @@ static void test_event_printk(struct trace_event_call *call) a = strchr(fmt + i, '&'); if ((a && (a < r)) || test_field(r, call)) dereference_flags &= ~(1ULL << arg); + } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && + (!c || r < c)) { + dereference_flags &= ~(1ULL << arg); + } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && + (!c || r < c)) { + dereference_flags &= ~(1ULL << arg); } + next_arg: i--; arg++; @@ -773,9 +775,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable) static void event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; @@ -799,9 +801,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, static void event_filter_pid_sched_switch_probe_post(void *data, bool preempt, - unsigned int prev_state, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, + unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; @@ -1723,9 +1725,9 @@ static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { + struct trace_subsystem_dir *dir = NULL, *iter_dir; + struct trace_array *tr = NULL, *iter_tr; struct event_subsystem *system = NULL; - struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */ - struct trace_array *tr; int ret; if (tracing_is_disabled()) @@ -1734,10 +1736,12 @@ static int subsystem_open(struct inode *inode, struct file *filp) /* Make sure the system still exists */ mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - list_for_each_entry(dir, &tr->systems, list) { - if (dir == inode->i_private) { + list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) { + list_for_each_entry(iter_dir, &iter_tr->systems, list) { + if (iter_dir == inode->i_private) { /* Don't open systems with no events */ + tr = iter_tr; + dir = iter_dir; if (dir->nr_events) { __get_system_dir(dir); system = dir->subsystem; @@ -1753,9 +1757,6 @@ static int subsystem_open(struct inode *inode, struct file *filp) if (!system) return -ENODEV; - /* Some versions of gcc think dir can be uninitialized here */ - WARN_ON(!dir); - /* Still need to increment the ref count of the system */ if (trace_array_get(tr) < 0) { put_system(dir); @@ -2280,8 +2281,8 @@ static struct dentry * event_subsystem_dir(struct trace_array *tr, const char *name, struct trace_event_file *file, struct dentry *parent) { + struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; - struct event_subsystem *system; struct dentry *entry; /* First see if we did not already create this dir */ @@ -2295,13 +2296,13 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } /* Now see if the system itself exists. */ - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) + system = NULL; + list_for_each_entry(iter, &event_subsystems, list) { + if (strcmp(iter->name, name) == 0) { + system = iter; break; + } } - /* Reset system variable when not found */ - if (&system->list == &event_subsystems) - system = NULL; dir = kmalloc(sizeof(*dir), GFP_KERNEL); if (!dir) @@ -3546,12 +3547,10 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; - entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_fops); - if (!entry) { - pr_warn("Could not create tracefs 'set_event' entry\n"); + entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_fops); + if (!entry) return -ENOMEM; - } d_events = tracefs_create_dir("events", parent); if (!d_events) { @@ -3566,16 +3565,12 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) /* There are not as crucial, just warn if they are not created */ - entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_pid_fops); - if (!entry) - pr_warn("Could not create tracefs 'set_event_pid' entry\n"); + trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_pid_fops); - entry = tracefs_create_file("set_event_notrace_pid", - TRACE_MODE_WRITE, parent, tr, - &ftrace_set_event_notrace_pid_fops); - if (!entry) - pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); + trace_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); /* ring buffer internal formats */ trace_create_file("header_page", TRACE_MODE_READ, d_events, @@ -3790,17 +3785,14 @@ static __init int event_trace_init_fields(void) __init int event_trace_init(void) { struct trace_array *tr; - struct dentry *entry; int ret; tr = top_trace_array(); if (!tr) return -ENODEV; - entry = tracefs_create_file("available_events", TRACE_MODE_READ, - NULL, tr, &ftrace_avail_fops); - if (!entry) - pr_warn("Could not create tracefs 'available_events' entry\n"); + trace_create_file("available_events", TRACE_MODE_READ, + NULL, tr, &ftrace_avail_fops); ret = early_event_add_tracer(NULL, tr); if (ret) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b458a9afa2c0..4b1057ab9d96 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1816,7 +1816,7 @@ static void create_filter_finish(struct filter_parse_error *pe) * create_filter - create a filter for a trace_event_call * @tr: the trace array associated with these events * @call: trace_event_call to create a filter for - * @filter_str: filter string + * @filter_string: filter string * @set_str: remember @filter_str and enable detailed error in filter * @filterp: out param for created filter (always updated on return) * Must be a pointer that references a NULL pointer. diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 44db5ba9cabb..fdf784620c28 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2093,8 +2093,11 @@ static int init_var_ref(struct hist_field *ref_field, return err; free: kfree(ref_field->system); + ref_field->system = NULL; kfree(ref_field->event_name); + ref_field->event_name = NULL; kfree(ref_field->name); + ref_field->name = NULL; goto out; } @@ -2785,7 +2788,8 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, static struct event_command trigger_hist_cmd; static int event_hist_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param); + char *glob, char *cmd, + char *param_and_filter); static bool compatible_keys(struct hist_trigger_data *target_hist_data, struct hist_trigger_data *hist_data, @@ -4161,7 +4165,7 @@ static int create_val_field(struct hist_trigger_data *hist_data, return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); } -static const char *no_comm = "(no comm)"; +static const char no_comm[] = "(no comm)"; static u64 hist_field_execname(struct hist_field *hist_field, struct tracing_map_elt *elt, @@ -4426,6 +4430,8 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) s = kstrdup(field_str, GFP_KERNEL); if (!s) { + kfree(hist_data->attrs->var_defs.name[n_vars]); + hist_data->attrs->var_defs.name[n_vars] = NULL; ret = -ENOMEM; goto free; } @@ -4449,7 +4455,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data, ret = parse_var_defs(hist_data); if (ret) - goto out; + return ret; ret = create_val_fields(hist_data, file); if (ret) @@ -4460,8 +4466,7 @@ static int create_hist_fields(struct hist_trigger_data *hist_data, goto out; ret = create_key_fields(hist_data, file); - if (ret) - goto out; + out: free_var_defs(hist_data); @@ -5252,7 +5257,7 @@ static void hist_trigger_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -5484,7 +5489,7 @@ static void hist_trigger_debug_show(struct seq_file *m, seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); seq_puts(m, "#\n\n"); hist_data = data->private_data; @@ -5621,7 +5626,6 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) } static int event_hist_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5729,8 +5733,7 @@ static int event_hist_trigger_print(struct seq_file *m, return 0; } -static int event_hist_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int event_hist_trigger_init(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5758,8 +5761,7 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data) } } -static void event_hist_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void event_hist_trigger_free(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; @@ -5788,25 +5790,23 @@ static struct event_trigger_ops event_hist_trigger_ops = { .free = event_hist_trigger_free, }; -static int event_hist_trigger_named_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static int event_hist_trigger_named_init(struct event_trigger_data *data) { data->ref++; save_named_trigger(data->named_data->name, data); - event_hist_trigger_init(ops, data->named_data); + event_hist_trigger_init(data->named_data); return 0; } -static void event_hist_trigger_named_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +static void event_hist_trigger_named_free(struct event_trigger_data *data) { if (WARN_ON_ONCE(data->ref <= 0)) return; - event_hist_trigger_free(ops, data->named_data); + event_hist_trigger_free(data->named_data); data->ref--; if (!data->ref) { @@ -5933,6 +5933,48 @@ static bool hist_trigger_match(struct event_trigger_data *data, return true; } +static bool existing_hist_update_only(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + bool updated = false; + + if (!hist_data->attrs->pause && !hist_data->attrs->cont && + !hist_data->attrs->clear) + goto out; + + if (hist_data->attrs->name) { + named_data = find_named_trigger(hist_data->attrs->name); + if (named_data) { + if (!hist_trigger_match(data, named_data, named_data, + true)) + goto out; + } + } + + if (hist_data->attrs->name && !named_data) + goto out; + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + if (hist_data->attrs->pause) + test->paused = true; + else if (hist_data->attrs->cont) + test->paused = false; + else if (hist_data->attrs->clear) + hist_clear(test); + updated = true; + goto out; + } + } + out: + return updated; +} + static int hist_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) @@ -5961,19 +6003,11 @@ static int hist_register_trigger(char *glob, list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test, named_data, false)) - continue; - if (hist_data->attrs->pause) - test->paused = true; - else if (hist_data->attrs->cont) - test->paused = false; - else if (hist_data->attrs->clear) - hist_clear(test); - else { + if (hist_trigger_match(data, test, named_data, false)) { hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; + goto out; } - goto out; } } new: @@ -5993,7 +6027,7 @@ static int hist_register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } @@ -6012,8 +6046,6 @@ static int hist_register_trigger(char *glob, if (named_data) destroy_hist_data(hist_data); - - ret++; out: return ret; } @@ -6089,20 +6121,19 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { + struct event_trigger_data *test = NULL, *iter, *named_data = NULL; struct hist_trigger_data *hist_data = data->private_data; - struct event_trigger_data *test, *named_data = NULL; - bool unregistered = false; lockdep_assert_held(&event_mutex); if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry(test, &file->triggers, list) { - if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test, named_data, false)) + list_for_each_entry(iter, &file->triggers, list) { + if (iter->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, iter, named_data, false)) continue; - unregistered = true; + test = iter; list_del_rcu(&test->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -6110,11 +6141,11 @@ static void hist_unregister_trigger(char *glob, } } - if (unregistered && test->ops->free) - test->ops->free(test->ops, test); + if (test && test->ops->free) + test->ops->free(test); if (hist_data->enable_timestamps) { - if (!hist_data->remove || unregistered) + if (!hist_data->remove || test) tracing_set_filter_buffering(file->tr, false); } } @@ -6164,57 +6195,57 @@ static void hist_unreg_all(struct trace_event_file *file) if (hist_data->enable_timestamps) tracing_set_filter_buffering(file->tr, false); if (test->ops->free) - test->ops->free(test->ops, test); + test->ops->free(test); } } } static int event_hist_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, + char *param_and_filter) { unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; struct event_trigger_data *trigger_data; struct hist_trigger_attrs *attrs; - struct event_trigger_ops *trigger_ops; struct hist_trigger_data *hist_data; + char *param, *filter, *p, *start; struct synth_event *se; const char *se_name; - bool remove = false; - char *trigger, *p, *start; + bool remove; int ret = 0; lockdep_assert_held(&event_mutex); - WARN_ON(!glob); + if (WARN_ON(!glob)) + return -EINVAL; - if (strlen(glob)) { + if (glob[0]) { hist_err_clear(); - last_cmd_set(file, param); + last_cmd_set(file, param_and_filter); } - if (!param) - return -EINVAL; + remove = event_trigger_check_remove(glob); - if (glob[0] == '!') - remove = true; + if (event_trigger_empty_param(param_and_filter)) + return -EINVAL; /* * separate the trigger from the filter (k:v [if filter]) * allowing for whitespace in the trigger */ - p = trigger = param; + p = param = param_and_filter; do { p = strstr(p, "if"); if (!p) break; - if (p == param) + if (p == param_and_filter) return -EINVAL; if (*(p - 1) != ' ' && *(p - 1) != '\t') { p++; continue; } - if (p >= param + strlen(param) - (sizeof("if") - 1) - 1) + if (p >= param_and_filter + strlen(param_and_filter) - (sizeof("if") - 1) - 1) return -EINVAL; if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') { p++; @@ -6224,24 +6255,24 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, } while (1); if (!p) - param = NULL; + filter = NULL; else { *(p - 1) = '\0'; - param = strstrip(p); - trigger = strstrip(trigger); + filter = strstrip(p); + param = strstrip(param); } /* * To simplify arithmetic expression parsing, replace occurrences of * '.sym-offset' modifier with '.symXoffset' */ - start = strstr(trigger, ".sym-offset"); + start = strstr(param, ".sym-offset"); while (start) { *(start + 4) = 'X'; start = strstr(start + 11, ".sym-offset"); } - attrs = parse_hist_trigger_attrs(file->tr, trigger); + attrs = parse_hist_trigger_attrs(file->tr, param); if (IS_ERR(attrs)) return PTR_ERR(attrs); @@ -6254,29 +6285,15 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, return PTR_ERR(hist_data); } - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); - - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data); if (!trigger_data) { ret = -ENOMEM; goto out_free; } - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - - INIT_LIST_HEAD(&trigger_data->list); - RCU_INIT_POINTER(trigger_data->filter, NULL); - - trigger_data->private_data = hist_data; - - /* if param is non-empty, it's supposed to be a filter */ - if (param && cmd_ops->set_filter) { - ret = cmd_ops->set_filter(param, trigger_data, file); - if (ret < 0) - goto out_free; - } + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); + if (ret < 0) + goto out_free; if (remove) { if (!have_hist_trigger_match(trigger_data, file)) @@ -6287,7 +6304,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) @@ -6296,17 +6313,11 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of triggers registered, - * but if it didn't register any it returns zero. Consider no - * triggers registered a failure too. - */ - if (!ret) { - if (!(attrs->pause || attrs->cont || attrs->clear)) - ret = -ENOENT; + if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; - } else if (ret < 0) + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret < 0) goto out_free; if (get_named_trigger_data(trigger_data)) @@ -6331,18 +6342,15 @@ enable: se = find_synth_event(se_name); if (se) se->ref++; - /* Just return zero, not the number of registered triggers */ - ret = 0; out: if (ret == 0) hist_err_clear(); return ret; out_unreg: - cmd_ops->unreg(glob+1, trigger_data, file); + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); + event_trigger_reset_filter(cmd_ops, trigger_data); remove_hist_vars(hist_data); @@ -6463,7 +6471,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file) update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); if (test->ops->free) - test->ops->free(test->ops, test); + test->ops->free(test); } } } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 7eb9d04f1c2e..918730d74932 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -142,7 +142,8 @@ static bool check_user_trigger(struct trace_event_file *file) { struct event_trigger_data *data; - list_for_each_entry_rcu(data, &file->triggers, list) { + list_for_each_entry_rcu(data, &file->triggers, list, + lockdep_is_held(&event_mutex)) { if (data->flags & EVENT_TRIGGER_FL_PROBE) continue; return true; @@ -188,7 +189,7 @@ static int trigger_show(struct seq_file *m, void *v) } data = list_entry(v, struct event_trigger_data, list); - data->ops->print(m, data->ops, data); + data->ops->print(m, data); return 0; } @@ -432,7 +433,6 @@ event_trigger_print(const char *name, struct seq_file *m, /** * event_trigger_init - Generic event_trigger_ops @init implementation - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data * * Common implementation of event trigger initialization. @@ -442,8 +442,7 @@ event_trigger_print(const char *name, struct seq_file *m, * * Return: 0 on success, errno otherwise */ -int event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_trigger_init(struct event_trigger_data *data) { data->ref++; return 0; @@ -451,7 +450,6 @@ int event_trigger_init(struct event_trigger_ops *ops, /** * event_trigger_free - Generic event_trigger_ops @free implementation - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data * * Common implementation of event trigger de-initialization. @@ -460,8 +458,7 @@ int event_trigger_init(struct event_trigger_ops *ops, * implementations. */ static void -event_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +event_trigger_free(struct event_trigger_data *data) { if (WARN_ON_ONCE(data->ref <= 0)) return; @@ -515,7 +512,7 @@ clear_event_triggers(struct trace_array *tr) trace_event_trigger_enable_disable(file, 0); list_del_rcu(&data->list); if (data->ops->free) - data->ops->free(data->ops, data); + data->ops->free(data); } } } @@ -581,19 +578,18 @@ static int register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } list_add_rcu(&data->list, &file->triggers); - ret++; update_cond_flag(file); - if (trace_event_trigger_enable_disable(file, 1) < 0) { + ret = trace_event_trigger_enable_disable(file, 1); + if (ret < 0) { list_del_rcu(&data->list); update_cond_flag(file); - ret--; } out: return ret; @@ -614,14 +610,13 @@ static void unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file) { - struct event_trigger_data *data; - bool unregistered = false; + struct event_trigger_data *data = NULL, *iter; lockdep_assert_held(&event_mutex); - list_for_each_entry(data, &file->triggers, list) { - if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { - unregistered = true; + list_for_each_entry(iter, &file->triggers, list) { + if (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { + data = iter; list_del_rcu(&data->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -629,8 +624,8 @@ static void unregister_trigger(char *glob, } } - if (unregistered && data->ops->free) - data->ops->free(data->ops, data); + if (data && data->ops->free) + data->ops->free(data); } /* @@ -744,15 +739,15 @@ bool event_trigger_empty_param(const char *param) /** * event_trigger_separate_filter - separate an event trigger from a filter - * @param: The param string containing trigger and possibly filter - * @trigger: outparam, will be filled with a pointer to the trigger + * @param_and_filter: String containing trigger and possibly filter + * @param: outparam, will be filled with a pointer to the trigger * @filter: outparam, will be filled with a pointer to the filter * @param_required: Specifies whether or not the param string is required * * Given a param string of the form '[trigger] [if filter]', this * function separates the filter from the trigger and returns the - * trigger in *trigger and the filter in *filter. Either the *trigger - * or the *filter may be set to NULL by this function - if not set to + * trigger in @param and the filter in @filter. Either the @param + * or the @filter may be set to NULL by this function - if not set to * NULL, they will contain strings corresponding to the trigger and * filter. * @@ -927,48 +922,37 @@ void event_trigger_reset_filter(struct event_command *cmd_ops, * @cmd_ops: The event_command operations for the trigger * @file: The event file for the trigger's event * @glob: The trigger command string, with optional remove(!) operator - * @cmd: The cmd string - * @param: The param string * @trigger_data: The trigger_data for the trigger - * @n_registered: optional outparam, the number of triggers registered * * Register an event trigger. The @cmd_ops are used to call the - * cmd_ops->reg() function which actually does the registration. The - * cmd_ops->reg() function returns the number of triggers registered, - * which is assigned to n_registered, if n_registered is non-NULL. + * cmd_ops->reg() function which actually does the registration. * * Return: 0 on success, errno otherwise */ int event_trigger_register(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, - char *cmd, - char *param, - struct event_trigger_data *trigger_data, - int *n_registered) + struct event_trigger_data *trigger_data) { - int ret; - - if (n_registered) - *n_registered = 0; - - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - cmd_ops->unreg(glob, trigger_data, file); - ret = -ENOENT; - } else if (ret > 0) { - if (n_registered) - *n_registered = ret; - /* Just return zero, not the number of enabled functions */ - ret = 0; - } + return cmd_ops->reg(glob, trigger_data, file); +} - return ret; +/** + * event_trigger_unregister - unregister an event trigger + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @glob: The trigger command string, with optional remove(!) operator + * @trigger_data: The trigger_data for the trigger + * + * Unregister an event trigger. The @cmd_ops are used to call the + * cmd_ops->unreg() function which actually does the unregistration. + */ +void event_trigger_unregister(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data) +{ + cmd_ops->unreg(glob, trigger_data, file); } /* @@ -981,7 +965,7 @@ int event_trigger_register(struct event_command *cmd_ops, * @file: The trace_event_file associated with the event * @glob: The raw string used to register the trigger * @cmd: The cmd portion of the string used to register the trigger - * @param: The params portion of the string used to register the trigger + * @param_and_filter: The param and filter portion of the string used to register the trigger * * Common implementation for event command parsing and trigger * instantiation. @@ -994,94 +978,53 @@ int event_trigger_register(struct event_command *cmd_ops, static int event_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, char *param_and_filter) { struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; - char *trigger = NULL; - char *number; + char *param, *filter; + bool remove; int ret; - /* separate the trigger from the filter (t:n [if filter]) */ - if (param && isdigit(param[0])) { - trigger = strsep(¶m, " \t"); - if (param) { - param = skip_spaces(param); - if (!*param) - param = NULL; - } - } + remove = event_trigger_check_remove(glob); - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + ret = event_trigger_separate_filter(param_and_filter, ¶m, &filter, false); + if (ret) + return ret; ret = -ENOMEM; - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file); if (!trigger_data) goto out; - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - trigger_data->private_data = file; - INIT_LIST_HEAD(&trigger_data->list); - INIT_LIST_HEAD(&trigger_data->named_list); - - if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_data, file); + if (remove) { + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); ret = 0; goto out; } - if (trigger) { - number = strsep(&trigger, ":"); - - ret = -EINVAL; - if (!strlen(number)) - goto out_free; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &trigger_data->count); - if (ret) - goto out_free; - } - - if (!param) /* if param is non-empty, it's supposed to be a filter */ - goto out_reg; - - if (!cmd_ops->set_filter) - goto out_reg; + ret = event_trigger_parse_num(param, trigger_data); + if (ret) + goto out_free; - ret = cmd_ops->set_filter(param, trigger_data, file); + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); if (ret < 0) goto out_free; - out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ - event_trigger_init(trigger_ops, trigger_data); - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - cmd_ops->unreg(glob, trigger_data, file); - ret = -ENOENT; - } else if (ret > 0) - ret = 0; + event_trigger_init(trigger_data); + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) + goto out_free; /* Down the counter of trigger_data or free it if not used anymore */ - event_trigger_free(trigger_ops, trigger_data); + event_trigger_free(trigger_data); out: return ret; out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); + event_trigger_reset_filter(cmd_ops, trigger_data); kfree(trigger_data); goto out; } @@ -1401,16 +1344,14 @@ traceoff_count_trigger(struct event_trigger_data *data, } static int -traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +traceon_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("traceon", m, (void *)data->count, data->filter_str); } static int -traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("traceoff", m, (void *)data->count, data->filter_str); @@ -1521,8 +1462,7 @@ register_snapshot_trigger(char *glob, } static int -snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("snapshot", m, (void *)data->count, data->filter_str); @@ -1617,8 +1557,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, } static int -stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) { return event_trigger_print("stacktrace", m, (void *)data->count, data->filter_str); @@ -1708,7 +1647,6 @@ event_enable_count_trigger(struct event_trigger_data *data, } int event_enable_trigger_print(struct seq_file *m, - struct event_trigger_ops *ops, struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1733,8 +1671,7 @@ int event_enable_trigger_print(struct seq_file *m, return 0; } -void event_enable_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +void event_enable_trigger_free(struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1781,39 +1718,33 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, - char *glob, char *cmd, char *param) + char *glob, char *cmd, char *param_and_filter) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; struct trace_array *tr = file->tr; + char *param, *filter; + bool enable, remove; const char *system; const char *event; bool hist = false; - char *trigger; - char *number; - bool enable; int ret; - if (!param) - return -EINVAL; + remove = event_trigger_check_remove(glob); - /* separate the trigger from the filter (s:e:n [if filter]) */ - trigger = strsep(¶m, " \t"); - if (!trigger) + if (event_trigger_empty_param(param_and_filter)) return -EINVAL; - if (param) { - param = skip_spaces(param); - if (!*param) - param = NULL; - } - system = strsep(&trigger, ":"); - if (!trigger) + ret = event_trigger_separate_filter(param_and_filter, ¶m, &filter, true); + if (ret) + return ret; + + system = strsep(¶m, ":"); + if (!param) return -EINVAL; - event = strsep(&trigger, ":"); + event = strsep(¶m, ":"); ret = -EINVAL; event_enable_file = find_event_file(tr, system, event); @@ -1829,32 +1760,24 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, #else enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; #endif - trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); - ret = -ENOMEM; - trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); - if (!trigger_data) - goto out; enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); - if (!enable_data) { - kfree(trigger_data); + if (!enable_data) goto out; - } - - trigger_data->count = -1; - trigger_data->ops = trigger_ops; - trigger_data->cmd_ops = cmd_ops; - INIT_LIST_HEAD(&trigger_data->list); - RCU_INIT_POINTER(trigger_data->filter, NULL); enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; - trigger_data->private_data = enable_data; - if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_data, file); + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data); + if (!trigger_data) { + kfree(enable_data); + goto out; + } + + if (remove) { + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); kfree(trigger_data); kfree(enable_data); ret = 0; @@ -1862,35 +1785,16 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, } /* Up the trigger_data count to make sure nothing frees it on failure */ - event_trigger_init(trigger_ops, trigger_data); - - if (trigger) { - number = strsep(&trigger, ":"); - - ret = -EINVAL; - if (!strlen(number)) - goto out_free; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &trigger_data->count); - if (ret) - goto out_free; - } - - if (!param) /* if param is non-empty, it's supposed to be a filter */ - goto out_reg; + event_trigger_init(trigger_data); - if (!cmd_ops->set_filter) - goto out_reg; + ret = event_trigger_parse_num(param, trigger_data); + if (ret) + goto out_free; - ret = cmd_ops->set_filter(param, trigger_data, file); + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); if (ret < 0) goto out_free; - out_reg: /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(event_enable_file->event_call); if (!ret) { @@ -1901,32 +1805,23 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, ret = trace_event_enable_disable(event_enable_file, 1, 1); if (ret < 0) goto out_put; - ret = cmd_ops->reg(glob, trigger_data, file); - /* - * The above returns on success the # of functions enabled, - * but if it didn't find any functions it returns zero. - * Consider no functions a failure too. - */ - if (!ret) { - ret = -ENOENT; - goto out_disable; - } else if (ret < 0) + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) goto out_disable; - /* Just return zero, not the number of enabled functions */ - ret = 0; - event_trigger_free(trigger_ops, trigger_data); + + event_trigger_free(trigger_data); out: return ret; - out_disable: trace_event_enable_disable(event_enable_file, 0, 1); out_put: trace_event_put_ref(event_enable_file->event_call); out_free: - if (cmd_ops->set_filter) - cmd_ops->set_filter(NULL, trigger_data, NULL); - event_trigger_free(trigger_ops, trigger_data); + event_trigger_reset_filter(cmd_ops, trigger_data); + event_trigger_free(trigger_data); kfree(enable_data); + goto out; } @@ -1953,19 +1848,18 @@ int event_enable_register_trigger(char *glob, } if (data->ops->init) { - ret = data->ops->init(data->ops, data); + ret = data->ops->init(data); if (ret < 0) goto out; } list_add_rcu(&data->list, &file->triggers); - ret++; update_cond_flag(file); - if (trace_event_trigger_enable_disable(file, 1) < 0) { + ret = trace_event_trigger_enable_disable(file, 1); + if (ret < 0) { list_del_rcu(&data->list); update_cond_flag(file); - ret--; } out: return ret; @@ -1976,19 +1870,18 @@ void event_enable_unregister_trigger(char *glob, struct trace_event_file *file) { struct enable_trigger_data *test_enable_data = test->private_data; + struct event_trigger_data *data = NULL, *iter; struct enable_trigger_data *enable_data; - struct event_trigger_data *data; - bool unregistered = false; lockdep_assert_held(&event_mutex); - list_for_each_entry(data, &file->triggers, list) { - enable_data = data->private_data; + list_for_each_entry(iter, &file->triggers, list) { + enable_data = iter->private_data; if (enable_data && - (data->cmd_ops->trigger_type == + (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) && (enable_data->file == test_enable_data->file)) { - unregistered = true; + data = iter; list_del_rcu(&data->list); trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); @@ -1996,8 +1889,8 @@ void event_enable_unregister_trigger(char *glob, } } - if (unregistered && data->ops->free) - data->ops->free(data->ops, data); + if (data && data->ops->free) + data->ops->free(data); } static struct event_trigger_ops * diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 706e1686b5eb..a6621c52ce45 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -567,7 +567,7 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) * to allow user_event files to be less locked down. The extreme case * being "other" has read/write access to user_events_data/status. * - * When not locked down, processes may not have have permissions to + * When not locked down, processes may not have permissions to * add/remove calls themselves to tracefs. We need to temporarily * switch to root file permission to allow for this scenario. */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 47cebef78532..23f7f0ec4f4c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -163,7 +163,8 @@ static bool trace_kprobe_match(const char *system, const char *event, { struct trace_kprobe *tk = to_trace_kprobe(ev); - return strcmp(trace_probe_name(&tk->tp), event) == 0 && + return (event[0] == '\0' || + strcmp(trace_probe_name(&tk->tp), event) == 0) && (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) && trace_kprobe_match_command_head(tk, argc, argv); } @@ -708,11 +709,11 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* * Argument syntax: * - Add kprobe: - * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * p[:[GRP/][EVENT]] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: - * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * r[MAXACTIVE][:[GRP/][EVENT]] [MOD:]KSYM[+0] [FETCHARGS] * Or - * p:[GRP/]EVENT] [MOD:]KSYM[+0]%return [FETCHARGS] + * p[:[GRP/][EVENT]] [MOD:]KSYM[+0]%return [FETCHARGS] * * Fetch args: * $retval : fetch return value @@ -739,6 +740,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; unsigned int flags = TPARG_FL_KERNEL; switch (argv[0][0]) { @@ -833,11 +835,13 @@ static int __trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto parse_error; - } else { + } + + if (!event) { /* Make a new event name */ if (symbol) snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", @@ -1718,8 +1722,17 @@ static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct kretprobe *rp = get_kretprobe(ri); - struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp); + struct trace_kprobe *tk; + + /* + * There is a small chance that get_kretprobe(ri) returns NULL when + * the kretprobe is unregister on another CPU between kretprobe's + * trampoline_handler and this function. + */ + if (unlikely(!rp)) + return 0; + tk = container_of(rp, struct trace_kprobe, rp); raw_cpu_inc(*tk->nhit); if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) @@ -1907,25 +1920,18 @@ core_initcall(init_kprobe_trace_early); static __init int init_kprobe_trace(void) { int ret; - struct dentry *entry; ret = tracing_init_dentry(); if (ret) return 0; - entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE, - NULL, NULL, &kprobe_events_ops); - /* Event list interface */ - if (!entry) - pr_warn("Could not create tracefs 'kprobe_events' entry\n"); + trace_create_file("kprobe_events", TRACE_MODE_WRITE, + NULL, NULL, &kprobe_events_ops); /* Profile interface */ - entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ, - NULL, NULL, &kprobe_profile_ops); - - if (!entry) - pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); + trace_create_file("kprobe_profile", TRACE_MODE_READ, + NULL, NULL, &kprobe_profile_ops); setup_boot_kprobe_events(); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e9ae1f33a7f0..313439920a8c 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1168,9 +1168,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) */ static void trace_sched_switch_callback(void *data, bool preempt, - unsigned int prev_state, struct task_struct *p, - struct task_struct *n) + struct task_struct *n, + unsigned int prev_state) { struct osnoise_variables *osn_var = this_cpu_osn_var(); @@ -1578,11 +1578,27 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) trace_timerlat_sample(&s); - notify_new_max_latency(diff); + if (osnoise_data.stop_tracing) { + if (time_to_us(diff) >= osnoise_data.stop_tracing) { + + /* + * At this point, if stop_tracing is set and <= print_stack, + * print_stack is set and would be printed in the thread handler. + * + * Thus, print the stack trace as it is helpful to define the + * root cause of an IRQ latency. + */ + if (osnoise_data.stop_tracing <= osnoise_data.print_stack) { + timerlat_save_stack(0); + timerlat_dump_stack(time_to_us(diff)); + } - if (osnoise_data.stop_tracing) - if (time_to_us(diff) >= osnoise_data.stop_tracing) osnoise_stop_tracing(); + notify_new_max_latency(diff); + + return HRTIMER_NORESTART; + } + } wake_up_process(tlat->kthread); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8aa493d25c73..67f47ea27921 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -692,7 +692,7 @@ static LIST_HEAD(ftrace_event_list); static int trace_search_list(struct list_head **list) { - struct trace_event *e; + struct trace_event *e = NULL, *iter; int next = __TRACE_LAST_TYPE; if (list_empty(&ftrace_event_list)) { @@ -704,9 +704,11 @@ static int trace_search_list(struct list_head **list) * We used up all possible max events, * lets see if somebody freed one. */ - list_for_each_entry(e, &ftrace_event_list, list) { - if (e->type != next) + list_for_each_entry(iter, &ftrace_event_list, list) { + if (iter->type != next) { + e = iter; break; + } next++; } @@ -714,7 +716,10 @@ static int trace_search_list(struct list_head **list) if (next > TRACE_EVENT_TYPE_MAX) return 0; - *list = &e->list; + if (e) + *list = &e->list; + else + *list = &ftrace_event_list; return next; } @@ -778,9 +783,8 @@ int register_trace_event(struct trace_event *event) list_add_tail(&event->list, list); - } else if (event->type > __TRACE_LAST_TYPE) { - printk(KERN_WARNING "Need to add type to trace.h\n"); - WARN_ON(1); + } else if (WARN(event->type > __TRACE_LAST_TYPE, + "Need to add type to trace.h")) { goto out; } else { /* Is this event already used */ @@ -1571,13 +1575,8 @@ __init static int init_events(void) for (i = 0; events[i]; i++) { event = events[i]; - ret = register_trace_event(event); - if (!ret) { - printk(KERN_WARNING "event %d failed to register\n", - event->type); - WARN_ON_ONCE(1); - } + WARN_ONCE(!ret, "event %d failed to register", event->type); } return 0; diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index f4938040c228..1e130da1b742 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -46,7 +46,7 @@ void trace_hardirqs_on(void) this_cpu_write(tracing_irq_cpu, 0); } - lockdep_hardirqs_on_prepare(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); @@ -94,15 +94,15 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) this_cpu_write(tracing_irq_cpu, 0); } - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - lockdep_hardirqs_on(CALLER_ADDR0); + lockdep_hardirqs_on_prepare(); + lockdep_hardirqs_on(caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { - lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirqs_off(caller_addr); if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 80863c6508e5..36dff277de46 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -257,6 +257,10 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, } len = strlen(event); if (len == 0) { + if (slash) { + *pevent = NULL; + return 0; + } trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; } else if (len > MAX_EVENT_NAME_LEN) { @@ -279,7 +283,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, int ret = 0; int len; - if (strcmp(arg, "retval") == 0) { + if (flags & TPARG_FL_TPOINT) { + if (code->data) + return -EFAULT; + code->data = kstrdup(arg, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + code->op = FETCH_OP_TP_ARG; + } else if (strcmp(arg, "retval") == 0) { if (flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; } else { @@ -303,7 +314,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, } } else goto inval_var; - } else if (strcmp(arg, "comm") == 0) { + } else if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == @@ -319,13 +330,6 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif - } else if (flags & TPARG_FL_TPOINT) { - if (code->data) - return -EFAULT; - code->data = kstrdup(arg, GFP_KERNEL); - if (!code->data) - return -ENOMEM; - code->op = FETCH_OP_TP_ARG; } else goto inval_var; @@ -380,6 +384,11 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; case '%': /* named register */ + if (flags & TPARG_FL_TPOINT) { + /* eprobes do not handle registers */ + trace_probe_log_err(offs, BAD_VAR); + break; + } ret = regs_query_register_offset(arg + 1); if (ret >= 0) { code->op = FETCH_OP_REG; @@ -613,9 +622,11 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, /* * Since $comm and immediate string can not be dereferenced, - * we can find those by strcmp. + * we can find those by strcmp. But ignore for eprobes. */ - if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { + if (!(flags & TPARG_FL_TPOINT) && + (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || + strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) goto out; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 92cc149af0fd..3b3869ae8cfd 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -442,7 +442,10 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(FAIL_REG_PROBE, "Failed to register probe event"),\ C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\ C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\ - C(SAME_PROBE, "There is already the exact same probe event"), + C(SAME_PROBE, "There is already the exact same probe event"),\ + C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ + C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ + C(BAD_ATTACH_ARG, "Attached event does not have this field"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c index 4d4b78c8ca25..a520b11afb0d 100644 --- a/kernel/trace/trace_recursion_record.c +++ b/kernel/trace/trace_recursion_record.c @@ -224,12 +224,9 @@ static const struct file_operations recursed_functions_fops = { __init static int create_recursed_functions(void) { - struct dentry *dentry; - dentry = trace_create_file("recursed_functions", TRACE_MODE_WRITE, - NULL, NULL, &recursed_functions_fops); - if (!dentry) - pr_warn("WARNING: Failed to create recursed_functions\n"); + trace_create_file("recursed_functions", TRACE_MODE_WRITE, + NULL, NULL, &recursed_functions_fops); return 0; } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 45796d8bd4b2..c9ffdcfe622e 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -22,8 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, - unsigned int prev_state, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, struct task_struct *next, + unsigned int prev_state) { int flags; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 46429f9a96fa..330aee1c1a49 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -426,8 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, static void notrace probe_wakeup_sched_switch(void *ignore, bool preempt, - unsigned int prev_state, - struct task_struct *prev, struct task_struct *next) + struct task_struct *prev, struct task_struct *next, + unsigned int prev_state) { struct trace_array_cpu *data; u64 T0, T1, delta; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index abcadbe933bb..a2d301f58ced 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -895,6 +895,9 @@ trace_selftest_startup_function_graph(struct tracer *trace, ret = -1; goto out; } + + /* Enable tracing on all functions again */ + ftrace_set_global_filter(NULL, 0, 1); #endif /* Don't test dynamic tracing, the function tracer already did */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f755bde42fd0..b69e207012c9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -154,7 +154,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; /* parameter types */ - if (tr->trace_flags & TRACE_ITER_VERBOSE) + if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) trace_seq_printf(s, "%s ", entry->types[i]); /* parameter values */ @@ -296,9 +296,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct trace_event_file *trace_file; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - unsigned int trace_ctx; + struct trace_event_buffer fbuffer; unsigned long args[6]; int syscall_nr; int size; @@ -321,20 +319,16 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - trace_ctx = tracing_gen_ctx(); - - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - sys_data->enter_event->event.type, size, trace_ctx); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); + if (!entry) return; - entry = ring_buffer_event_data(event); + entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; syscall_get_arguments(current, regs, args); memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); - event_trigger_unlock_commit(trace_file, buffer, event, entry, - trace_ctx); + trace_event_buffer_commit(&fbuffer); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -343,9 +337,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct trace_event_file *trace_file; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - unsigned int trace_ctx; + struct trace_event_buffer fbuffer; int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); @@ -364,20 +356,15 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!sys_data) return; - trace_ctx = tracing_gen_ctx(); - - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - sys_data->exit_event->event.type, sizeof(*entry), - trace_ctx); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry)); + if (!entry) return; - entry = ring_buffer_event_data(event); + entry = ring_buffer_event_data(fbuffer.event); entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - event_trigger_unlock_commit(trace_file, buffer, event, entry, - trace_ctx); + trace_event_buffer_commit(&fbuffer); } static int reg_event_syscall_enter(struct trace_event_file *file, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9711589273cd..fb58e86dd117 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -16,6 +16,7 @@ #include <linux/namei.h> #include <linux/string.h> #include <linux/rculist.h> +#include <linux/filter.h> #include "trace_dynevent.h" #include "trace_probe.h" @@ -312,7 +313,8 @@ static bool trace_uprobe_match(const char *system, const char *event, { struct trace_uprobe *tu = to_trace_uprobe(ev); - return strcmp(trace_probe_name(&tu->tp), event) == 0 && + return (event[0] == '\0' || + strcmp(trace_probe_name(&tu->tp), event) == 0) && (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) && trace_uprobe_match_command_head(tu, argc, argv); } @@ -532,7 +534,7 @@ end: /* * Argument syntax: - * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS] + * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int __trace_uprobe_create(int argc, const char **argv) { @@ -540,13 +542,13 @@ static int __trace_uprobe_create(int argc, const char **argv) const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; enum probe_print_type ptype; struct path path; unsigned long offset, ref_ctr_offset; bool is_return = false; int i, ret; - ret = 0; ref_ctr_offset = 0; switch (argv[0][0]) { @@ -645,11 +647,13 @@ static int __trace_uprobe_create(int argc, const char **argv) /* setup a probe */ trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto fail_address_parse; - } else { + } + + if (!event) { char *tail; char *ptr; @@ -1343,15 +1347,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, int size, esize; int rctx; +#ifdef CONFIG_BPF_EVENTS if (bpf_prog_array_valid(call)) { u32 ret; - preempt_disable(); - ret = trace_call_bpf(call, regs); - preempt_enable(); + ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run); if (!ret) return; } +#endif /* CONFIG_BPF_EVENTS */ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 9628b5571846..9901708ce6b8 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -1045,7 +1045,8 @@ static void sort_secondary(struct tracing_map *map, /** * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map * @map: The tracing_map - * @sort_key: The sort key to use for sorting + * @sort_keys: The sort key to use for sorting + * @n_sort_keys: hitcount, always have at least one * @sort_entries: outval: pointer to allocated and sorted array of entries * * tracing_map_sort_entries() sorts the current set of entries in the diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 64ea283f2f86..ef42c1a11920 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -571,7 +571,8 @@ static void for_each_tracepoint_range( bool trace_module_has_bad_taint(struct module *mod) { return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | - (1 << TAINT_UNSIGNED_MODULE)); + (1 << TAINT_UNSIGNED_MODULE) | + (1 << TAINT_TEST)); } static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); @@ -647,7 +648,7 @@ static int tracepoint_module_coming(struct module *mod) /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. - * Staging, out-of-tree, and unsigned GPL modules are fine. + * Staging, out-of-tree, unsigned GPL, and test modules are fine. */ if (trace_module_has_bad_taint(mod)) return 0; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 1d261fbe367b..4252f0645b9e 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -23,15 +23,20 @@ void bacct_add_tsk(struct user_namespace *user_ns, { const struct cred *tcred; u64 utime, stime, utimescaled, stimescaled; - u64 delta; + u64 now_ns, delta; time64_t btime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in nsec */ - delta = ktime_get_ns() - tsk->start_time; + now_ns = ktime_get_ns(); + /* store whole group time first */ + delta = now_ns - tsk->group_leader->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); + stats->ac_tgetime = delta; + delta = now_ns - tsk->start_time; + do_div(delta, NSEC_PER_USEC); stats->ac_etime = delta; /* Convert to seconds for btime (note y2106 limit) */ btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC); @@ -51,6 +56,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); + stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); diff --git a/kernel/umh.c b/kernel/umh.c index 36c123360ab8..b989736e8707 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -132,7 +132,7 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) /* If SIGCLD is ignored do_wait won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); + pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); if (pid < 0) sub_info->retval = pid; else @@ -171,8 +171,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work) * want to pollute current->children, and we need a parent * that always ignores SIGCHLD to ensure auto-reaping. */ - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, - CLONE_PARENT | SIGCHLD); + pid = user_mode_thread(call_usermodehelper_exec_async, sub_info, + CLONE_PARENT | SIGCHLD); if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 981bb2d10d83..54211dbd516c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> +#include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> @@ -113,6 +114,10 @@ int create_user_ns(struct cred *new) !kgid_has_mapping(parent_ns, group)) goto fail_dec; + ret = security_create_user_ns(new); + if (ret < 0) + goto fail_dec; + ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c index 9dae1f648713..8303f4c7ca71 100644 --- a/kernel/usermode_driver.c +++ b/kernel/usermode_driver.c @@ -28,7 +28,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700); if (IS_ERR(file)) { - mntput(mnt); + kern_unmount(mnt); return ERR_CAST(file); } @@ -38,7 +38,7 @@ static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *na if (err >= 0) err = -ENOMEM; filp_close(file, NULL); - mntput(mnt); + kern_unmount(mnt); return ERR_PTR(err); } diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 230038d4f908..a6f9bdd956c3 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -4,7 +4,7 @@ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/watch_queue.rst + * See Documentation/core-api/watch_queue.rst */ #define pr_fmt(fmt) "watchq: " fmt @@ -34,6 +34,27 @@ MODULE_LICENSE("GPL"); #define WATCH_QUEUE_NOTE_SIZE 128 #define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) +/* + * This must be called under the RCU read-lock, which makes + * sure that the wqueue still exists. It can then take the lock, + * and check that the wqueue hasn't been destroyed, which in + * turn makes sure that the notification pipe still exists. + */ +static inline bool lock_wqueue(struct watch_queue *wqueue) +{ + spin_lock_bh(&wqueue->lock); + if (unlikely(wqueue->defunct)) { + spin_unlock_bh(&wqueue->lock); + return false; + } + return true; +} + +static inline void unlock_wqueue(struct watch_queue *wqueue) +{ + spin_unlock_bh(&wqueue->lock); +} + static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -69,6 +90,10 @@ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { /* * Post a notification to a watch queue. + * + * Must be called with the RCU lock for reading, and the + * watch_queue lock held, which guarantees that the pipe + * hasn't been released. */ static bool post_one_notification(struct watch_queue *wqueue, struct watch_notification *n) @@ -85,9 +110,6 @@ static bool post_one_notification(struct watch_queue *wqueue, spin_lock_irq(&pipe->rd_wait.lock); - if (wqueue->defunct) - goto out; - mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; @@ -203,7 +225,10 @@ void __post_watch_notification(struct watch_list *wlist, if (security_post_notification(watch->cred, cred, n) < 0) continue; - post_one_notification(wqueue, n); + if (lock_wqueue(wqueue)) { + post_one_notification(wqueue, n); + unlock_wqueue(wqueue); + } } rcu_read_unlock(); @@ -429,6 +454,33 @@ void init_watch(struct watch *watch, struct watch_queue *wqueue) rcu_assign_pointer(watch->queue, wqueue); } +static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue) +{ + const struct cred *cred; + struct watch *w; + + hlist_for_each_entry(w, &wlist->watchers, list_node) { + struct watch_queue *wq = rcu_access_pointer(w->queue); + if (wqueue == wq && watch->id == w->id) + return -EBUSY; + } + + cred = current_cred(); + if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) { + atomic_dec(&cred->user->nr_watches); + return -EAGAIN; + } + + watch->cred = get_cred(cred); + rcu_assign_pointer(watch->watch_list, wlist); + + kref_get(&wqueue->usage); + kref_get(&watch->usage); + hlist_add_head(&watch->queue_node, &wqueue->watches); + hlist_add_head_rcu(&watch->list_node, &wlist->watchers); + return 0; +} + /** * add_watch_to_object - Add a watch on an object to a watch list * @watch: The watch to add @@ -443,33 +495,21 @@ void init_watch(struct watch *watch, struct watch_queue *wqueue) */ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) { - struct watch_queue *wqueue = rcu_access_pointer(watch->queue); - struct watch *w; - - hlist_for_each_entry(w, &wlist->watchers, list_node) { - struct watch_queue *wq = rcu_access_pointer(w->queue); - if (wqueue == wq && watch->id == w->id) - return -EBUSY; - } + struct watch_queue *wqueue; + int ret = -ENOENT; - watch->cred = get_current_cred(); - rcu_assign_pointer(watch->watch_list, wlist); + rcu_read_lock(); - if (atomic_inc_return(&watch->cred->user->nr_watches) > - task_rlimit(current, RLIMIT_NOFILE)) { - atomic_dec(&watch->cred->user->nr_watches); - put_cred(watch->cred); - return -EAGAIN; + wqueue = rcu_access_pointer(watch->queue); + if (lock_wqueue(wqueue)) { + spin_lock(&wlist->lock); + ret = add_one_watch(watch, wlist, wqueue); + spin_unlock(&wlist->lock); + unlock_wqueue(wqueue); } - spin_lock_bh(&wqueue->lock); - kref_get(&wqueue->usage); - kref_get(&watch->usage); - hlist_add_head(&watch->queue_node, &wqueue->watches); - spin_unlock_bh(&wqueue->lock); - - hlist_add_head(&watch->list_node, &wlist->watchers); - return 0; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(add_watch_to_object); @@ -520,20 +560,15 @@ found: wqueue = rcu_dereference(watch->queue); - /* We don't need the watch list lock for the next bit as RCU is - * protecting *wqueue from deallocation. - */ - if (wqueue) { + if (lock_wqueue(wqueue)) { post_one_notification(wqueue, &n.watch); - spin_lock_bh(&wqueue->lock); - if (!hlist_unhashed(&watch->queue_node)) { hlist_del_init_rcu(&watch->queue_node); put_watch(watch); } - spin_unlock_bh(&wqueue->lock); + unlock_wqueue(wqueue); } if (wlist->release_watch) { diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9166220457bc..8e61f21e7e33 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -57,7 +57,7 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; * Should we panic when a soft-lockup or hard-lockup occurs: */ unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -168,7 +168,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; @@ -537,7 +537,7 @@ int lockup_detector_offline_cpu(unsigned int cpu) return 0; } -static void lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -557,6 +557,13 @@ static void lockup_detector_reconfigure(void) __lockup_detector_cleanup(); } +void lockup_detector_reconfigure(void) +{ + mutex_lock(&watchdog_mutex); + __lockup_detector_reconfigure(); + mutex_unlock(&watchdog_mutex); +} + /* * Create the watchdog infrastructure and configure the detector(s). */ @@ -573,13 +580,13 @@ static __init void lockup_detector_setup(void) return; mutex_lock(&watchdog_mutex); - lockup_detector_reconfigure(); + __lockup_detector_reconfigure(); softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static void lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_nmi_stop(); @@ -587,9 +594,13 @@ static void lockup_detector_reconfigure(void) watchdog_nmi_start(); cpus_read_unlock(); } +void lockup_detector_reconfigure(void) +{ + __lockup_detector_reconfigure(); +} static inline void lockup_detector_setup(void) { - lockup_detector_reconfigure(); + __lockup_detector_reconfigure(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -629,7 +640,7 @@ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - lockup_detector_reconfigure(); + __lockup_detector_reconfigure(); } /* diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0d2514b4ff0d..7cd5f5e7e0a1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1651,7 +1651,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_FUNCTION_MISMATCH(timer->function, delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); @@ -2788,13 +2788,13 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, } /** - * flush_workqueue - ensure that any scheduled work has run to completion. + * __flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * * This function sleeps until all work items which were queued on entry * have finished execution, but it is not livelocked by new incoming ones. */ -void flush_workqueue(struct workqueue_struct *wq) +void __flush_workqueue(struct workqueue_struct *wq) { struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), @@ -2943,7 +2943,7 @@ void flush_workqueue(struct workqueue_struct *wq) out_unlock: mutex_unlock(&wq->mutex); } -EXPORT_SYMBOL(flush_workqueue); +EXPORT_SYMBOL(__flush_workqueue); /** * drain_workqueue - drain a workqueue @@ -2971,7 +2971,7 @@ void drain_workqueue(struct workqueue_struct *wq) wq->flags |= __WQ_DRAINING; mutex_unlock(&wq->mutex); reflush: - flush_workqueue(wq); + __flush_workqueue(wq); mutex_lock(&wq->mutex); @@ -3066,10 +3066,8 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!work->func)) return false; - if (!from_cancel) { - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - } + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); if (start_flush_work(work, &barr, from_cancel)) { wait_for_completion(&barr.done); @@ -3258,6 +3256,15 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork) return ret; } +/* + * See cancel_delayed_work() + */ +bool cancel_work(struct work_struct *work) +{ + return __cancel_work(work, false); +} +EXPORT_SYMBOL(cancel_work); + /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -5001,7 +5008,10 @@ static void unbind_workers(int cpu) for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, -1); - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); + if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); + else + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); } mutex_unlock(&wq_pool_attach_mutex); @@ -6111,3 +6121,11 @@ void __init workqueue_init(void) wq_online = true; wq_watchdog_init(); } + +/* + * Despite the naming, this is a no-op function which is here only for avoiding + * link error. Since compile-time warning may fail to catch, we will need to + * emit run-time warning from __flush_workqueue(). + */ +void __warn_flushing_systemwide_wq(void) { } +EXPORT_SYMBOL(__warn_flushing_systemwide_wq); |
