diff options
| author | Ingo Molnar <mingo@kernel.org> | 2024-02-14 10:45:07 +0100 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2024-02-14 10:45:07 +0100 |
| commit | 03c11eb3b16dc0058589751dfd91f254be2be613 (patch) | |
| tree | e5f2889212fec0bb0babdce9abd781ab487e246a /kernel | |
| parent | de8c6a352131f642b82474abe0cbb5dd26a7e081 (diff) | |
| parent | 841c35169323cd833294798e58b9bf63fa4fa1de (diff) | |
Merge tag 'v6.8-rc4' into x86/percpu, to resolve conflicts and refresh the branch
Conflicts:
arch/x86/include/asm/percpu.h
arch/x86/include/asm/text-patching.h
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
227 files changed, 14959 insertions, 7835 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 9bfe68fe9676..946dffa048b7 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -36,6 +36,8 @@ config KEXEC config KEXEC_FILE bool "Enable kexec file based system call" depends on ARCH_SUPPORTS_KEXEC_FILE + select CRYPTO + select CRYPTO_SHA256 select KEXEC_CORE help This is new version of kexec system call. This system call is @@ -94,10 +96,8 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" depends on ARCH_SUPPORTS_CRASH_DUMP - depends on ARCH_SUPPORTS_KEXEC select CRASH_CORE select KEXEC_CORE - select KEXEC help Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels @@ -110,7 +110,7 @@ config CRASH_DUMP For more details see Documentation/admin-guide/kdump/kdump.rst For s390, this option also enables zfcpdump. - See also <file:Documentation/s390/zfcpdump.rst> + See also <file:Documentation/arch/s390/zfcpdump.rst> config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" diff --git a/kernel/Makefile b/kernel/Makefile index 3947122d618b..ce105a5558fc 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -114,6 +114,7 @@ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/acct.c b/kernel/acct.c index 1a9f929fe629..986c8214dabf 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -246,7 +246,7 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return PTR_ERR(internal); } - err = __mnt_want_write(internal); + err = mnt_get_write_access(internal); if (err) { mntput(internal); kfree(acct); @@ -271,7 +271,7 @@ static int acct_on(struct filename *pathname) old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); mntput(mnt); return 0; } diff --git a/kernel/async.c b/kernel/async.c index b2c4ba5686ee..97f224a5257b 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -46,11 +46,12 @@ asynchronous and synchronous parts of the kernel. #include <linux/async.h> #include <linux/atomic.h> -#include <linux/ktime.h> #include <linux/export.h> -#include <linux/wait.h> +#include <linux/ktime.h> +#include <linux/pid.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/wait.h> #include <linux/workqueue.h> #include "workqueue_internal.h" @@ -145,6 +146,39 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } +static async_cookie_t __async_schedule_node_domain(async_func_t func, + void *data, int node, + struct async_domain *domain, + struct async_entry *entry) +{ + async_cookie_t newcookie; + unsigned long flags; + + INIT_LIST_HEAD(&entry->domain_list); + INIT_LIST_HEAD(&entry->global_list); + INIT_WORK(&entry->work, async_run_entry_fn); + entry->func = func; + entry->data = data; + entry->domain = domain; + + spin_lock_irqsave(&async_lock, flags); + + /* allocate cookie and queue */ + newcookie = entry->cookie = next_cookie++; + + list_add_tail(&entry->domain_list, &domain->pending); + if (domain->registered) + list_add_tail(&entry->global_list, &async_global_pending); + + atomic_inc(&entry_count); + spin_unlock_irqrestore(&async_lock, flags); + + /* schedule for execution */ + queue_work_node(node, system_unbound_wq, &entry->work); + + return newcookie; +} + /** * async_schedule_node_domain - NUMA specific version of async_schedule_domain * @func: function to execute asynchronously @@ -186,29 +220,8 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, func(data, newcookie); return newcookie; } - INIT_LIST_HEAD(&entry->domain_list); - INIT_LIST_HEAD(&entry->global_list); - INIT_WORK(&entry->work, async_run_entry_fn); - entry->func = func; - entry->data = data; - entry->domain = domain; - - spin_lock_irqsave(&async_lock, flags); - - /* allocate cookie and queue */ - newcookie = entry->cookie = next_cookie++; - list_add_tail(&entry->domain_list, &domain->pending); - if (domain->registered) - list_add_tail(&entry->global_list, &async_global_pending); - - atomic_inc(&entry_count); - spin_unlock_irqrestore(&async_lock, flags); - - /* schedule for execution */ - queue_work_node(node, system_unbound_wq, &entry->work); - - return newcookie; + return __async_schedule_node_domain(func, data, node, domain, entry); } EXPORT_SYMBOL_GPL(async_schedule_node_domain); @@ -232,6 +245,35 @@ async_cookie_t async_schedule_node(async_func_t func, void *data, int node) EXPORT_SYMBOL_GPL(async_schedule_node); /** + * async_schedule_dev_nocall - A simplified variant of async_schedule_dev() + * @func: function to execute asynchronously + * @dev: device argument to be passed to function + * + * @dev is used as both the argument for the function and to provide NUMA + * context for where to run the function. + * + * If the asynchronous execution of @func is scheduled successfully, return + * true. Otherwise, do nothing and return false, unlike async_schedule_dev() + * that will run the function synchronously then. + */ +bool async_schedule_dev_nocall(async_func_t func, struct device *dev) +{ + struct async_entry *entry; + + entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL); + + /* Give up if there is no memory or too much work. */ + if (!entry || atomic_read(&entry_count) > MAX_WORK) { + kfree(entry); + return false; + } + + __async_schedule_node_domain(func, dev, dev_to_node(dev), + &async_dfl_domain, entry); + return true; +} + +/** * async_synchronize_full - synchronize all asynchronous function calls * * This function waits until all asynchronous function calls have been done. diff --git a/kernel/audit.c b/kernel/audit.c index 16205dd29843..9c8e5f732c4c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -487,15 +487,19 @@ static void auditd_conn_free(struct rcu_head *rcu) * @pid: auditd PID * @portid: auditd netlink portid * @net: auditd network namespace pointer + * @skb: the netlink command from the audit daemon + * @ack: netlink ack flag, cleared if ack'd here * * Description: * This function will obtain and drop network namespace references as * necessary. Returns zero on success, negative values on failure. */ -static int auditd_set(struct pid *pid, u32 portid, struct net *net) +static int auditd_set(struct pid *pid, u32 portid, struct net *net, + struct sk_buff *skb, bool *ack) { unsigned long flags; struct auditd_connection *ac_old, *ac_new; + struct nlmsghdr *nlh; if (!pid || !net) return -EINVAL; @@ -507,6 +511,13 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net) ac_new->portid = portid; ac_new->net = get_net(net); + /* send the ack now to avoid a race with the queue backlog */ + if (*ack) { + nlh = nlmsg_hdr(skb); + netlink_ack(skb, nlh, 0, NULL); + *ack = false; + } + spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); @@ -1200,7 +1211,8 @@ static int audit_replace(struct pid *pid) return auditd_send_unicast_skb(skb); } -static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + bool *ack) { u32 seq; void *data; @@ -1293,7 +1305,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* register a new auditd connection */ err = auditd_set(req_pid, NETLINK_CB(skb).portid, - sock_net(NETLINK_CB(skb).sk)); + sock_net(NETLINK_CB(skb).sk), + skb, ack); if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, @@ -1538,9 +1551,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) * Parse the provided skb and deal with any messages that may be present, * malformed skbs are discarded. */ -static void audit_receive(struct sk_buff *skb) +static void audit_receive(struct sk_buff *skb) { struct nlmsghdr *nlh; + bool ack; /* * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned @@ -1553,9 +1567,12 @@ static void audit_receive(struct sk_buff *skb) audit_ctl_lock(); while (nlmsg_ok(nlh, len)) { - err = audit_receive_msg(skb, nlh); - /* if err or if this message says it wants a response */ - if (err || (nlh->nlmsg_flags & NLM_F_ACK)) + ack = nlh->nlmsg_flags & NLM_F_ACK; + err = audit_receive_msg(skb, nlh, &ack); + + /* send an ack if the user asked for one and audit_receive_msg + * didn't already do it, or if there was an error. */ + if (ack || err) netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e867c17d3f84..1b07e6f12a07 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -34,7 +34,7 @@ struct audit_chunk { struct list_head list; struct audit_tree *owner; unsigned index; /* index; upper bit indicates 'will prune' */ - } owners[]; + } owners[] __counted_by(count); }; struct audit_tree_mark { @@ -87,8 +87,8 @@ static struct task_struct *prune_thread; * that makes a difference. Some. */ -static struct fsnotify_group *audit_tree_group; -static struct kmem_cache *audit_tree_mark_cachep __read_mostly; +static struct fsnotify_group *audit_tree_group __ro_after_init; +static struct kmem_cache *audit_tree_mark_cachep __ro_after_init; static struct audit_tree *alloc_tree(const char *s) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 65075f1e4ac8..7a98cd176a12 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -527,11 +527,18 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) unsigned long ino; dev_t dev; - exe_file = get_task_exe_file(tsk); + /* only do exe filtering if we are recording @current events/records */ + if (tsk != current) + return 0; + + if (!current->mm) + return 0; + exe_file = get_mm_exe_file(current->mm); if (!exe_file) return 0; ino = file_inode(exe_file)->i_ino; dev = file_inode(exe_file)->i_sb->s_dev; fput(exe_file); + return audit_mark_compare(mark, ino, dev); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b0cb7631e48b..6f0d6fb6523f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -143,6 +143,8 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = { { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" }, { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" }, { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" }, + { AUDIT_NFT_OP_SETELEM_RESET, "nft_reset_setelem" }, + { AUDIT_NFT_OP_RULE_RESET, "nft_reset_rule" }, { AUDIT_NFT_OP_INVALID, "nft_invalid" }, }; @@ -2210,7 +2212,7 @@ __audit_reusename(const __user char *uptr) if (!n->name) continue; if (n->name->uptr == uptr) { - n->name->refcnt++; + atomic_inc(&n->name->refcnt); return n->name; } } @@ -2239,7 +2241,7 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; - name->refcnt++; + atomic_inc(&name->refcnt); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2371,7 +2373,7 @@ out_alloc: return; if (name) { n->name = name; - name->refcnt++; + atomic_inc(&name->refcnt); } out: @@ -2498,7 +2500,7 @@ void __audit_inode_child(struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - found_child->name->refcnt++; + atomic_inc(&found_child->name->refcnt); } } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2058e89b5ddd..0bdbbbeab155 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -867,11 +867,11 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, } if (old_ptr) - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } -static long fd_array_map_delete_elem(struct bpf_map *map, void *key) +static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; @@ -890,13 +890,18 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key) } if (old_ptr) { - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, need_defer); return 0; } else { return -ENOENT; } } +static long fd_array_map_delete_elem(struct bpf_map *map, void *key) +{ + return __fd_array_map_delete_elem(map, key, true); +} + static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { @@ -913,8 +918,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, return prog; } -static void prog_fd_array_put_ptr(void *ptr) +static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + /* bpf_prog is freed after one RCU or tasks trace grace period */ bpf_prog_put(ptr); } @@ -924,13 +930,13 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr) } /* decrement refcnt of all bpf_progs that are stored in this map */ -static void bpf_fd_array_map_clear(struct bpf_map *map) +static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) - fd_array_map_delete_elem(map, &i); + __fd_array_map_delete_elem(map, &i, need_defer); } static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, @@ -1012,11 +1018,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map, mutex_unlock(&aux->poke_mutex); } +void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + WARN_ON_ONCE(1); +} + static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { - u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -1025,7 +1036,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, list_for_each_entry(elem, &aux->poke_progs, list) { struct bpf_jit_poke_descriptor *poke; - int i, ret; + int i; for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; @@ -1044,21 +1055,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * activated, so tail call updates can arrive from here * while JIT is still finishing its final fixup for * non-activated poke entries. - * 3) On program teardown, the program's kallsym entry gets - * removed out of RCU callback, but we can only untrack - * from sleepable context, therefore bpf_arch_text_poke() - * might not see that this is in BPF text section and - * bails out with -EINVAL. As these are unreachable since - * RCU grace period already passed, we simply skip them. - * 4) Also programs reaching refcount of zero while patching + * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT - * buffer is freed. When we're still in the middle of - * patching and suddenly kallsyms entry of the program - * gets evicted, we just skip the rest which is fine due - * to point 3). - * 5) Any other error happening below from bpf_arch_text_poke() - * is a unexpected bug. + * buffer is freed. */ if (!READ_ONCE(poke->tailcall_target_stable)) continue; @@ -1068,39 +1068,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - old_bypass_addr = old ? NULL : poke->bypass_addr; - old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; - new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; - - if (new) { - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, new_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - if (!old) { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - poke->bypass_addr, - NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } - } else { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - old_bypass_addr, - poke->bypass_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - /* let other CPUs finish the execution of program - * so that it will not possible to expose them - * to invalid nop, stack unwind, nop state - */ - if (!ret) - synchronize_rcu(); - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } + bpf_arch_poke_desc_update(poke, new, old); } } } @@ -1109,7 +1077,7 @@ static void prog_array_map_clear_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_array_aux, work)->map; - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, true); bpf_map_put(map); } @@ -1189,7 +1157,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, { struct bpf_event_entry *ee; - ee = kzalloc(sizeof(*ee), GFP_ATOMIC); + ee = kzalloc(sizeof(*ee), GFP_KERNEL); if (ee) { ee->event = perf_file->private_data; ee->perf_file = perf_file; @@ -1239,8 +1207,9 @@ err_out: return ee; } -static void perf_event_fd_array_put_ptr(void *ptr) +static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + /* bpf_perf_event is freed after one RCU grace period */ bpf_event_entry_free_rcu(ptr); } @@ -1258,7 +1227,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); if (ee && ee->map_file == map_file) - fd_array_map_delete_elem(map, &i); + __fd_array_map_delete_elem(map, &i, true); } rcu_read_unlock(); } @@ -1266,7 +1235,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, static void perf_event_fd_array_map_free(struct bpf_map *map) { if (map->map_flags & BPF_F_PRESERVE_ELEMS) - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } @@ -1294,7 +1263,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map, return cgroup_get_from_fd(fd); } -static void cgroup_fd_array_put_ptr(void *ptr) +static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); @@ -1302,7 +1271,7 @@ static void cgroup_fd_array_put_ptr(void *ptr) static void cgroup_fd_array_free(struct bpf_map *map) { - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } @@ -1347,7 +1316,7 @@ static void array_of_map_free(struct bpf_map *map) * is protected by fdget/fdput. */ bpf_map_meta_free(map->inner_map_meta); - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index d44fe8dd9732..28efd0a3f220 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -82,7 +82,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) int fd; fd = *(int *)key; - cgroup = cgroup_get_from_fd(fd); + cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return ERR_CAST(cgroup); @@ -101,7 +101,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, int fd; fd = *(int *)key; - cgroup = cgroup_get_from_fd(fd); + cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); @@ -131,7 +131,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) int err, fd; fd = *(int *)key; - cgroup = cgroup_get_from_fd(fd); + cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 96856f130cbf..0fae79164187 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -782,9 +782,7 @@ struct bpf_iter_num_kern { int end; /* final value, exclusive */ } __aligned(8); -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) { @@ -793,8 +791,6 @@ __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num)); BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num)); - BTF_TYPE_EMIT(struct btf_iter_num); - /* start == end is legit, it's an empty range and we'll just get NULL * on first (and any subsequent) bpf_iter_num_next() call */ @@ -845,4 +841,4 @@ __bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it) s->cur = s->end = 0; } -__diag_pop(); +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b5149cfce7d4..146824cc9689 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -553,7 +553,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; - struct bpf_local_storage_elem *selem = NULL; + struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; @@ -607,11 +607,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } - if (gfp_flags == GFP_KERNEL) { - selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); - if (!selem) - return ERR_PTR(-ENOMEM); - } + /* A lookup has just been done before and concluded a new selem is + * needed. The chance of an unnecessary alloc is unlikely. + */ + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + if (!alloc_selem) + return ERR_PTR(-ENOMEM); raw_spin_lock_irqsave(&local_storage->lock, flags); @@ -623,13 +624,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, * simple. */ err = -EAGAIN; - goto unlock_err; + goto unlock; } old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) - goto unlock_err; + goto unlock; if (old_sdata && (map_flags & BPF_F_LOCK)) { copy_map_value_locked(&smap->map, old_sdata->data, value, @@ -638,23 +639,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } - if (gfp_flags != GFP_KERNEL) { - /* local_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during - * bpf_selem_unlink_storage_nolock(). - */ - selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags); - if (!selem) { - err = -ENOMEM; - goto unlock_err; - } - } - + alloc_selem = NULL; /* First, link the new selem to the map */ bpf_selem_link_map(smap, selem); @@ -665,20 +650,16 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - false, false); + true, false); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); - return SDATA(selem); - -unlock_err: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - if (selem) { + if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); - bpf_selem_free(selem, smap, true); + bpf_selem_free(alloc_selem, smap, true); } - return ERR_PTR(err); + return err ? ERR_PTR(err) : SDATA(selem); } static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) @@ -779,7 +760,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, true); + local_storage, selem, true, true); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index e14c822f8911..e8e910395bf6 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -298,6 +298,18 @@ BTF_ID(func, bpf_lsm_kernel_module_request) BTF_ID(func, bpf_lsm_kernel_read_file) BTF_ID(func, bpf_lsm_kernfs_init_security) +#ifdef CONFIG_SECURITY_PATH +BTF_ID(func, bpf_lsm_path_unlink) +BTF_ID(func, bpf_lsm_path_mkdir) +BTF_ID(func, bpf_lsm_path_rmdir) +BTF_ID(func, bpf_lsm_path_truncate) +BTF_ID(func, bpf_lsm_path_symlink) +BTF_ID(func, bpf_lsm_path_link) +BTF_ID(func, bpf_lsm_path_rename) +BTF_ID(func, bpf_lsm_path_chmod) +BTF_ID(func, bpf_lsm_path_chown) +#endif /* CONFIG_SECURITY_PATH */ + #ifdef CONFIG_KEYS BTF_ID(func, bpf_lsm_key_free) #endif /* CONFIG_KEYS */ diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index fdc3e8705a3c..02068bd0e4d9 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -352,18 +352,24 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = { int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, - void *image, void *image_end) + void *stub_func, void *image, void *image_end) { - u32 flags; + u32 flags = BPF_TRAMP_F_INDIRECT; + int size; tlinks[BPF_TRAMP_FENTRY].links[0] = link; tlinks[BPF_TRAMP_FENTRY].nr_links = 1; - /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, - * and it must be used alone. - */ - flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0; + + if (model->ret_size > 0) + flags |= BPF_TRAMP_F_RET_FENTRY_RET; + + size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); + if (size < 0) + return size; + if (size > (unsigned long)image_end - (unsigned long)image) + return -E2BIG; return arch_prepare_bpf_trampoline(NULL, image, image_end, - model, flags, tlinks, NULL); + model, flags, tlinks, stub_func); } static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, @@ -497,11 +503,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], + *(void **)(st_ops->cfi_stubs + moff), image, image_end); if (err < 0) goto reset_unlock; - *(void **)(kdata + moff) = image; + *(void **)(kdata + moff) = image + cfi_get_offset(); image += err; /* put prog_id to udata */ @@ -515,7 +522,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) goto reset_unlock; } - set_memory_rox((long)st_map->image, 1); + arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); /* Let bpf_link handle registration & unregistration. * * Pair with smp_load_acquire() during lookup_elem(). @@ -524,7 +531,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - set_memory_rox((long)st_map->image, 1); + arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); err = st_ops->reg(kdata); if (likely(!err)) { /* This refcnt increment on the map here after @@ -547,8 +554,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ - set_memory_nx((long)st_map->image, 1); - set_memory_rw((long)st_map->image, 1); + arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE); reset_unlock: bpf_struct_ops_map_put_progs(st_map); @@ -615,7 +621,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) if (st_map->links) bpf_struct_ops_map_put_progs(st_map); bpf_map_area_free(st_map->links); - bpf_jit_free_exec(st_map->image); + if (st_map->image) { + arch_free_bpf_trampoline(st_map->image, PAGE_SIZE); + bpf_jit_uncharge_modmem(PAGE_SIZE); + } bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); } @@ -657,6 +666,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_struct_ops_map *st_map; const struct btf_type *t, *vt; struct bpf_map *map; + int ret; st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); if (!st_ops) @@ -681,18 +691,32 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) st_map->st_ops = st_ops; map = &st_map->map; + ret = bpf_jit_charge_modmem(PAGE_SIZE); + if (ret) { + __bpf_struct_ops_map_free(map); + return ERR_PTR(ret); + } + + st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE); + if (!st_map->image) { + /* __bpf_struct_ops_map_free() uses st_map->image as flag + * for "charged or not". In this case, we need to unchange + * here. + */ + bpf_jit_uncharge_modmem(PAGE_SIZE); + __bpf_struct_ops_map_free(map); + return ERR_PTR(-ENOMEM); + } st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); st_map->links = bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *), NUMA_NO_NODE); - st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); - if (!st_map->uvalue || !st_map->links || !st_map->image) { + if (!st_map->uvalue || !st_map->links) { __bpf_struct_ops_map_free(map); return ERR_PTR(-ENOMEM); } mutex_init(&st_map->lock); - set_vm_flush_reset_perms(st_map->image); bpf_map_init_from_attr(map, attr); return map; @@ -907,4 +931,3 @@ err_out: kfree(link); return err; } - diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1095bbe29859..596471189176 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3293,6 +3293,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, type = BPF_KPTR_UNREF; else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) type = BPF_KPTR_REF; + else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) + type = BPF_KPTR_PERCPU; else return -EINVAL; @@ -3308,10 +3310,10 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, return BTF_FIELD_FOUND; } -static const char *btf_find_decl_tag_value(const struct btf *btf, - const struct btf_type *pt, - int comp_idx, const char *tag_key) +const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, + int comp_idx, const char *tag_key) { + const char *value = NULL; int i; for (i = 1; i < btf_nr_types(btf); i++) { @@ -3325,9 +3327,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf, continue; if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len)) continue; - return __btf_name_by_offset(btf, t->name_off) + len; + /* Prevent duplicate entries for same type */ + if (value) + return ERR_PTR(-EEXIST); + value = __btf_name_by_offset(btf, t->name_off) + len; } - return NULL; + if (!value) + return ERR_PTR(-ENOENT); + return value; } static int @@ -3345,7 +3352,7 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, if (t->size != sz) return BTF_FIELD_IGNORE; value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); - if (!value_type) + if (IS_ERR(value_type)) return -EINVAL; node_field_name = strstr(value_type, ":"); if (!node_field_name) @@ -3457,6 +3464,7 @@ static int btf_find_struct_field(const struct btf *btf, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: ret = btf_find_kptr(btf, member_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3523,6 +3531,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: ret = btf_find_kptr(btf, var_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3783,6 +3792,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); if (ret < 0) goto end; @@ -3830,9 +3840,6 @@ end: return ERR_PTR(ret); } -#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT) -#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE) - int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) { int i; @@ -3845,13 +3852,13 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * Hence we only need to ensure that bpf_{list_head,rb_root} ownership * does not form cycles. */ - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT)) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; u32 btf_id; - if (!(rec->fields[i].type & GRAPH_ROOT_MASK)) + if (!(rec->fields[i].type & BPF_GRAPH_ROOT)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; meta = btf_find_struct_meta(btf, btf_id); @@ -3863,7 +3870,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * to check ownership cycle for a type unless it's also a * node type. */ - if (!(rec->field_mask & GRAPH_NODE_MASK)) + if (!(rec->field_mask & BPF_GRAPH_NODE)) continue; /* We need to ensure ownership acyclicity among all types. The @@ -3899,7 +3906,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) * - A is both an root and node. * - B is only an node. */ - if (meta->record->field_mask & GRAPH_ROOT_MASK) + if (meta->record->field_mask & BPF_GRAPH_ROOT) return -ELOOP; } return 0; @@ -5608,21 +5615,46 @@ static u8 bpf_ctx_convert_map[] = { #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE -const struct btf_member * -btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, enum bpf_prog_type prog_type, - int arg) +static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type) { const struct btf_type *conv_struct; - const struct btf_type *ctx_struct; const struct btf_member *ctx_type; - const char *tname, *ctx_tname; conv_struct = bpf_ctx_convert.t; - if (!conv_struct) { - bpf_log(log, "btf_vmlinux is malformed\n"); + if (!conv_struct) return NULL; - } + /* prog_type is valid bpf program type. No need for bounds check. */ + ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; + /* ctx_type is a pointer to prog_ctx_type in vmlinux. + * Like 'struct __sk_buff' + */ + return btf_type_by_id(btf_vmlinux, ctx_type->type); +} + +static int find_kern_ctx_type_id(enum bpf_prog_type prog_type) +{ + const struct btf_type *conv_struct; + const struct btf_member *ctx_type; + + conv_struct = bpf_ctx_convert.t; + if (!conv_struct) + return -EFAULT; + /* prog_type is valid bpf program type. No need for bounds check. */ + ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; + /* ctx_type is a pointer to prog_ctx_type in vmlinux. + * Like 'struct sk_buff' + */ + return ctx_type->type; +} + +const struct btf_type * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) +{ + const struct btf_type *ctx_type; + const char *tname, *ctx_tname; + t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); @@ -5639,17 +5671,15 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, bpf_log(log, "arg#%d struct doesn't have a name\n", arg); return NULL; } - /* prog_type is valid bpf program type. No need for bounds check. */ - ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; - /* ctx_struct is a pointer to prog_ctx_type in vmlinux. - * Like 'struct __sk_buff' - */ - ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type); - if (!ctx_struct) + + ctx_type = find_canonical_prog_ctx_type(prog_type); + if (!ctx_type) { + bpf_log(log, "btf_vmlinux is malformed\n"); /* should not happen */ return NULL; + } again: - ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off); + ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); if (!ctx_tname) { /* should not happen */ bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); @@ -5670,28 +5700,167 @@ again: /* bpf_user_pt_regs_t is a typedef, so resolve it to * underlying struct and check name again */ - if (!btf_type_is_modifier(ctx_struct)) + if (!btf_type_is_modifier(ctx_type)) return NULL; - while (btf_type_is_modifier(ctx_struct)) - ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type); + while (btf_type_is_modifier(ctx_type)) + ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); goto again; } return ctx_type; } +/* forward declarations for arch-specific underlying types of + * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef + * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still + * works correctly with __builtin_types_compatible_p() on respective + * architectures + */ +struct user_regs_struct; +struct user_pt_regs; + +static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int arg, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type) +{ + const struct btf_type *ctx_type; + const char *tname, *ctx_tname; + + if (!btf_is_ptr(t)) { + bpf_log(log, "arg#%d type isn't a pointer\n", arg); + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); + + /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */ + if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) { + while (btf_type_is_modifier(t) && !btf_type_is_typedef(t)) + t = btf_type_by_id(btf, t->type); + + if (btf_type_is_typedef(t)) { + tname = btf_name_by_offset(btf, t->name_off); + if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0) + return 0; + } + } + + /* all other program types don't use typedefs for context type */ + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + + /* `void *ctx __arg_ctx` is always valid */ + if (btf_type_is_void(t)) + return 0; + + tname = btf_name_by_offset(btf, t->name_off); + if (str_is_empty(tname)) { + bpf_log(log, "arg#%d type doesn't have a name\n", arg); + return -EINVAL; + } + + /* special cases */ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return 0; + break; + case BPF_PROG_TYPE_PERF_EVENT: + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) && + __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return 0; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) && + __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0) + return 0; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) && + __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0) + return 0; + break; + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_PROG_TYPE_TRACING: + switch (attach_type) { + case BPF_TRACE_RAW_TP: + /* tp_btf program is TRACING, so need special case here */ + if (__btf_type_is_struct(t) && + strcmp(tname, "bpf_raw_tracepoint_args") == 0) + return 0; + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_TRACE_ITER: + /* allow struct bpf_iter__xxx types only */ + if (__btf_type_is_struct(t) && + strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0) + return 0; + break; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + default: + break; + } + break; + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return 0; + break; + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_SYSCALL: + case BPF_PROG_TYPE_EXT: + return 0; /* anything goes */ + default: + break; + } + + ctx_type = find_canonical_prog_ctx_type(prog_type); + if (!ctx_type) { + /* should not happen */ + bpf_log(log, "btf_vmlinux is malformed\n"); + return -EINVAL; + } + + /* resolve typedefs and check that underlying structs are matching as well */ + while (btf_type_is_modifier(ctx_type)) + ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type); + + /* if program type doesn't have distinctly named struct type for + * context, then __arg_ctx argument can only be `void *`, which we + * already checked above + */ + if (!__btf_type_is_struct(ctx_type)) { + bpf_log(log, "arg#%d should be void pointer\n", arg); + return -EINVAL; + } + + ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off); + if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) { + bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname); + return -EINVAL; + } + + return 0; +} + static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) { - const struct btf_member *prog_ctx_type, *kern_ctx_type; - - prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg); - if (!prog_ctx_type) + if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg)) return -ENOENT; - kern_ctx_type = prog_ctx_type + 1; - return kern_ctx_type->type; + return find_kern_ctx_type_id(prog_type); } int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) @@ -6758,222 +6927,64 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr return btf_check_func_type_match(log, btf1, t1, btf2, t2); } -static int btf_check_func_arg_match(struct bpf_verifier_env *env, - const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs, - bool ptr_to_mem_ok, - bool processing_call) +static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t) { - enum bpf_prog_type prog_type = resolve_prog_type(env->prog); - struct bpf_verifier_log *log = &env->log; - const char *func_name, *ref_tname; - const struct btf_type *t, *ref_t; - const struct btf_param *args; - u32 i, nargs, ref_id; - int ret; - - t = btf_type_by_id(btf, func_id); - if (!t || !btf_type_is_func(t)) { - /* These checks were already done by the verifier while loading - * struct bpf_func_info or in add_kfunc_call(). - */ - bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n", - func_id); - return -EFAULT; - } - func_name = btf_name_by_offset(btf, t->name_off); - - t = btf_type_by_id(btf, t->type); - if (!t || !btf_type_is_func_proto(t)) { - bpf_log(log, "Invalid BTF of func %s\n", func_name); - return -EFAULT; - } - args = (const struct btf_param *)(t + 1); - nargs = btf_type_vlen(t); - if (nargs > MAX_BPF_FUNC_REG_ARGS) { - bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs, - MAX_BPF_FUNC_REG_ARGS); - return -EINVAL; - } - - /* check that BTF function arguments match actual types that the - * verifier sees. - */ - for (i = 0; i < nargs; i++) { - enum bpf_arg_type arg_type = ARG_DONTCARE; - u32 regno = i + 1; - struct bpf_reg_state *reg = ®s[regno]; - - t = btf_type_skip_modifiers(btf, args[i].type, NULL); - if (btf_type_is_scalar(t)) { - if (reg->type == SCALAR_VALUE) - continue; - bpf_log(log, "R%d is not a scalar\n", regno); - return -EINVAL; - } - - if (!btf_type_is_ptr(t)) { - bpf_log(log, "Unrecognized arg#%d type %s\n", - i, btf_type_str(t)); - return -EINVAL; - } - - ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); - ref_tname = btf_name_by_offset(btf, ref_t->name_off); - - ret = check_func_arg_reg_off(env, reg, regno, arg_type); - if (ret < 0) - return ret; + const char *name; - if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - /* If function expects ctx type in BTF check that caller - * is passing PTR_TO_CTX. - */ - if (reg->type != PTR_TO_CTX) { - bpf_log(log, - "arg#%d expected pointer to ctx, but got %s\n", - i, btf_type_str(t)); - return -EINVAL; - } - } else if (ptr_to_mem_ok && processing_call) { - const struct btf_type *resolve_ret; - u32 type_size; + t = btf_type_by_id(btf, t->type); /* skip PTR */ - resolve_ret = btf_resolve_size(btf, ref_t, &type_size); - if (IS_ERR(resolve_ret)) { - bpf_log(log, - "arg#%d reference type('%s %s') size cannot be determined: %ld\n", - i, btf_type_str(ref_t), ref_tname, - PTR_ERR(resolve_ret)); - return -EINVAL; - } + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); - if (check_mem_reg(env, reg, regno, type_size)) - return -EINVAL; - } else { - bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i, - func_name, func_id); - return -EINVAL; - } + /* allow either struct or struct forward declaration */ + if (btf_type_is_struct(t) || + (btf_type_is_fwd(t) && btf_type_kflag(t) == 0)) { + name = btf_str_by_offset(btf, t->name_off); + return name && strcmp(name, "bpf_dynptr") == 0; } - return 0; -} - -/* Compare BTF of a function declaration with given bpf_reg_state. - * Returns: - * EFAULT - there is a verifier bug. Abort verification. - * EINVAL - there is a type mismatch or BTF is not available. - * 0 - BTF matches with what bpf_reg_state expects. - * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. - */ -int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs) -{ - struct bpf_prog *prog = env->prog; - struct btf *btf = prog->aux->btf; - bool is_global; - u32 btf_id; - int err; - - if (!prog->aux->func_info) - return -EINVAL; - - btf_id = prog->aux->func_info[subprog].type_id; - if (!btf_id) - return -EFAULT; - - if (prog->aux->func_info_aux[subprog].unreliable) - return -EINVAL; - - is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false); - - /* Compiler optimizations can remove arguments from static functions - * or mismatched type can be passed into a global function. - * In such cases mark the function as unreliable from BTF point of view. - */ - if (err) - prog->aux->func_info_aux[subprog].unreliable = true; - return err; -} - -/* Compare BTF of a function call with given bpf_reg_state. - * Returns: - * EFAULT - there is a verifier bug. Abort verification. - * EINVAL - there is a type mismatch or BTF is not available. - * 0 - BTF matches with what bpf_reg_state expects. - * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. - * - * NOTE: the code is duplicated from btf_check_subprog_arg_match() - * because btf_check_func_arg_match() is still doing both. Once that - * function is split in 2, we can call from here btf_check_subprog_arg_match() - * first, and then treat the calling part in a new code path. - */ -int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs) -{ - struct bpf_prog *prog = env->prog; - struct btf *btf = prog->aux->btf; - bool is_global; - u32 btf_id; - int err; - - if (!prog->aux->func_info) - return -EINVAL; - - btf_id = prog->aux->func_info[subprog].type_id; - if (!btf_id) - return -EFAULT; - - if (prog->aux->func_info_aux[subprog].unreliable) - return -EINVAL; - - is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true); - - /* Compiler optimizations can remove arguments from static functions - * or mismatched type can be passed into a global function. - * In such cases mark the function as unreliable from BTF point of view. - */ - if (err) - prog->aux->func_info_aux[subprog].unreliable = true; - return err; + return false; } -/* Convert BTF of a function into bpf_reg_state if possible +/* Process BTF of a function to produce high-level expectation of function + * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information + * is cached in subprog info for reuse. * Returns: * EFAULT - there is a verifier bug. Abort verification. * EINVAL - cannot convert BTF. - * 0 - Successfully converted BTF into bpf_reg_state - * (either PTR_TO_CTX or SCALAR_VALUE). + * 0 - Successfully processed BTF and constructed argument expectations. */ -int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs) +int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) { + bool is_global = subprog_aux(env, subprog)->linkage == BTF_FUNC_GLOBAL; + struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t, *ref_t; + const struct btf_type *t, *ref_t, *fn_t; u32 i, nargs, btf_id; const char *tname; - if (!prog->aux->func_info || - prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) { + if (sub->args_cached) + return 0; + + if (!prog->aux->func_info) { bpf_log(log, "Verifier bug\n"); return -EFAULT; } btf_id = prog->aux->func_info[subprog].type_id; if (!btf_id) { + if (!is_global) /* not fatal for static funcs */ + return -EINVAL; bpf_log(log, "Global functions need valid BTF\n"); return -EFAULT; } - t = btf_type_by_id(btf, btf_id); - if (!t || !btf_type_is_func(t)) { + fn_t = btf_type_by_id(btf, btf_id); + if (!fn_t || !btf_type_is_func(fn_t)) { /* These checks were already done by the verifier while loading * struct bpf_func_info */ @@ -6981,11 +6992,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, subprog); return -EFAULT; } - tname = btf_name_by_offset(btf, t->name_off); - - if (log->level & BPF_LOG_LEVEL) - bpf_log(log, "Validating %s() func#%d...\n", - tname, subprog); + tname = btf_name_by_offset(btf, fn_t->name_off); if (prog->aux->func_info_aux[subprog].unreliable) { bpf_log(log, "Verifier bug in function %s()\n", tname); @@ -6994,7 +7001,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, if (prog_type == BPF_PROG_TYPE_EXT) prog_type = prog->aux->dst_prog->type; - t = btf_type_by_id(btf, t->type); + t = btf_type_by_id(btf, fn_t->type); if (!t || !btf_type_is_func_proto(t)) { bpf_log(log, "Invalid type of function %s()\n", tname); return -EFAULT; @@ -7006,7 +7013,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } - /* check that function returns int */ + /* check that function returns int, exception cb also requires this */ t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); @@ -7020,24 +7027,54 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, * Only PTR_TO_CTX and SCALAR are supported atm. */ for (i = 0; i < nargs; i++) { - struct bpf_reg_state *reg = ®s[i + 1]; + bool is_nonnull = false; + const char *tag; t = btf_type_by_id(btf, args[i].type); + + tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:"); + if (IS_ERR(tag) && PTR_ERR(tag) == -ENOENT) { + tag = NULL; + } else if (IS_ERR(tag)) { + bpf_log(log, "arg#%d type's tag fetching failure: %ld\n", i, PTR_ERR(tag)); + return PTR_ERR(tag); + } + /* 'arg:<tag>' decl_tag takes precedence over derivation of + * register type from BTF type itself + */ + if (tag) { + /* disallow arg tags in static subprogs */ + if (!is_global) { + bpf_log(log, "arg#%d type tag is not supported in static functions\n", i); + return -EOPNOTSUPP; + } + if (strcmp(tag, "ctx") == 0) { + sub->args[i].arg_type = ARG_PTR_TO_CTX; + continue; + } + if (strcmp(tag, "nonnull") == 0) + is_nonnull = true; + } + while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_is_any_enum(t)) { - reg->type = SCALAR_VALUE; + sub->args[i].arg_type = ARG_ANYTHING; continue; } - if (btf_type_is_ptr(t)) { - if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - reg->type = PTR_TO_CTX; - continue; - } + if (btf_type_is_ptr(t) && btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + sub->args[i].arg_type = ARG_PTR_TO_CTX; + continue; + } + if (btf_type_is_ptr(t) && btf_is_dynptr_ptr(btf, t)) { + sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY; + continue; + } + if (is_global && btf_type_is_ptr(t)) { + u32 mem_size; t = btf_type_skip_modifiers(btf, t->type, NULL); - - ref_t = btf_resolve_size(btf, t, ®->mem_size); + ref_t = btf_resolve_size(btf, t, &mem_size); if (IS_ERR(ref_t)) { bpf_log(log, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", @@ -7046,15 +7083,39 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, return -EINVAL; } - reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; - reg->id = ++env->id_gen; - + sub->args[i].arg_type = is_nonnull ? ARG_PTR_TO_MEM : ARG_PTR_TO_MEM_OR_NULL; + sub->args[i].mem_size = mem_size; continue; } + if (is_nonnull) { + bpf_log(log, "arg#%d marked as non-null, but is not a pointer type\n", i); + return -EINVAL; + } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", i, btf_type_str(t), tname); return -EINVAL; } + + for (i = 0; i < nargs; i++) { + const char *tag; + + if (sub->args[i].arg_type != ARG_PTR_TO_CTX) + continue; + + /* check if arg has "arg:ctx" tag */ + t = btf_type_by_id(btf, args[i].type); + tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:"); + if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0) + continue; + + if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type, + prog->expected_attach_type)) + return -EINVAL; + } + + sub->arg_cnt = nargs; + sub->args_cached = true; + return 0; } @@ -7832,6 +7893,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_SYSCALL: return BTF_KFUNC_HOOK_SYSCALL; case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return BTF_KFUNC_HOOK_CGROUP_SKB; case BPF_PROG_TYPE_SCHED_ACT: return BTF_KFUNC_HOOK_SCHED_ACT; @@ -8501,7 +8563,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, tname = btf_name_by_offset(btf, walk_type->name_off); ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix); - if (ret < 0) + if (ret >= sizeof(safe_tname)) return false; safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info)); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 5b2741aa0d9b..491d20038cbe 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -785,7 +785,8 @@ found: * to descendants * @cgrp: The cgroup which descendants to traverse * @link: A link for which to replace BPF program - * @type: Type of attach operation + * @new_prog: &struct bpf_prog for the target BPF program with its refcnt + * incremented * * Must be called with cgroup_mutex held. */ @@ -1334,7 +1335,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received - * @type: The type of program to be executed + * @atype: The type of program to be executed * * If no socket is passed, or the socket is not of type INET or INET6, * this function does nothing and returns 0. @@ -1424,7 +1425,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); /** * __cgroup_bpf_run_filter_sk() - Run a program on a sock * @sk: sock structure to manipulate - * @type: The type of program to be executed + * @atype: The type of program to be executed * * socket is passed is expected to be of type INET or INET6. * @@ -1449,18 +1450,22 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * provided by user sockaddr * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user - * @type: The type of program to be executed + * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is + * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX + * uaddr. + * @atype: The type of program to be executed * @t_ctx: Pointer to attach type specific context * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). * - * socket is expected to be of type INET or INET6. + * socket is expected to be of type INET, INET6 or UNIX. * * This function will return %-EPERM if an attached program is found and * returned value != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, + int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags) @@ -1472,21 +1477,31 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, }; struct sockaddr_storage unspec; struct cgroup *cgrp; + int ret; /* Check socket family since not all sockets represent network * endpoint (e.g. AF_UNIX). */ - if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 && + sk->sk_family != AF_UNIX) return 0; if (!ctx.uaddr) { memset(&unspec, 0, sizeof(unspec)); ctx.uaddr = (struct sockaddr *)&unspec; + ctx.uaddrlen = 0; + } else { + ctx.uaddrlen = *uaddrlen; } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, - 0, flags); + ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, + 0, flags); + + if (!ret && uaddr) + *uaddrlen = ctx.uaddrlen; + + return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1496,7 +1511,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains * sk with connection information (IP addresses, etc.) May not contain * cgroup info if it is a req sock. - * @type: The type of program to be executed + * @atype: The type of program to be executed * * socket passed is expected to be of type INET or INET6. * @@ -1670,7 +1685,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @type: type of program to be executed + * @atype: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and * can allow or deny such access. @@ -1785,7 +1800,7 @@ static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, } int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, - int *optname, char __user *optval, + int *optname, sockptr_t optval, int *optlen, char **kernel_optval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); @@ -1808,7 +1823,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, ctx.optlen = *optlen; - if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) { + if (copy_from_sockptr(ctx.optval, optval, + min(*optlen, max_optlen))) { ret = -EFAULT; goto out; } @@ -1875,8 +1891,8 @@ out: } int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, - int optname, char __user *optval, - int __user *optlen, int max_optlen, + int optname, sockptr_t optval, + sockptr_t optlen, int max_optlen, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); @@ -1903,8 +1919,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * one that kernel returned as well to let * BPF programs inspect the value. */ - - if (get_user(ctx.optlen, optlen)) { + if (copy_from_sockptr(&ctx.optlen, optlen, + sizeof(ctx.optlen))) { ret = -EFAULT; goto out; } @@ -1915,8 +1931,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } orig_optlen = ctx.optlen; - if (copy_from_user(ctx.optval, optval, - min(ctx.optlen, max_optlen)) != 0) { + if (copy_from_sockptr(ctx.optval, optval, + min(ctx.optlen, max_optlen))) { ret = -EFAULT; goto out; } @@ -1930,7 +1946,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, if (ret < 0) goto out; - if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) { + if (!sockptr_is_null(optval) && + (ctx.optlen > max_optlen || ctx.optlen < 0)) { if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) { pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", ctx.optlen, max_optlen); @@ -1942,11 +1959,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } if (ctx.optlen != 0) { - if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) { + if (!sockptr_is_null(optval) && + copy_to_sockptr(optval, ctx.optval, ctx.optlen)) { ret = -EFAULT; goto out; } - if (put_user(ctx.optlen, optlen)) { + if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) { ret = -EFAULT; goto out; } @@ -2519,10 +2537,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return NULL; default: return &bpf_get_retval_proto; @@ -2534,10 +2555,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: return NULL; default: return &bpf_set_retval_proto; diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index 810378f04fbc..f04a468cf6a7 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -282,7 +282,7 @@ static struct bpf_iter_reg bpf_cgroup_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__cgroup, cgroup), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &cgroup_iter_seq_info, }; @@ -294,3 +294,66 @@ static int __init bpf_cgroup_iter_init(void) } late_initcall(bpf_cgroup_iter_init); + +struct bpf_iter_css { + __u64 __opaque[3]; +} __attribute__((aligned(8))); + +struct bpf_iter_css_kern { + struct cgroup_subsys_state *start; + struct cgroup_subsys_state *pos; + unsigned int flags; +} __attribute__((aligned(8))); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, + struct cgroup_subsys_state *start, unsigned int flags) +{ + struct bpf_iter_css_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css)); + + kit->start = NULL; + switch (flags) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + case BPF_CGROUP_ITER_DESCENDANTS_POST: + case BPF_CGROUP_ITER_ANCESTORS_UP: + break; + default: + return -EINVAL; + } + + kit->start = start; + kit->pos = NULL; + kit->flags = flags; + return 0; +} + +__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) +{ + struct bpf_iter_css_kern *kit = (void *)it; + + if (!kit->start) + return NULL; + + switch (kit->flags) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + kit->pos = css_next_descendant_pre(kit->pos, kit->start); + break; + case BPF_CGROUP_ITER_DESCENDANTS_POST: + kit->pos = css_next_descendant_post(kit->pos, kit->start); + break; + case BPF_CGROUP_ITER_ANCESTORS_UP: + kit->pos = kit->pos ? kit->pos->parent : kit->start; + } + + return kit->pos; +} + +__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it) +{ +} + +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0f8f036d8bd1..ea6843be2616 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -121,6 +121,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag #endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); +#ifdef CONFIG_FINEIBT + INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode); +#endif mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->dst_mutex); @@ -212,7 +215,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const struct bpf_line_info *linfo; void **jited_linfo; - if (!prog->aux->jited_linfo) + if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt) /* Userspace did not provide linfo */ return; @@ -371,14 +374,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { - const s32 off_min = S16_MIN, off_max = S16_MAX; + s64 off_min, off_max, off; s32 delta = end_new - end_old; - s32 off; - if (insn->code == (BPF_JMP32 | BPF_JA)) + if (insn->code == (BPF_JMP32 | BPF_JA)) { off = insn->imm; - else + off_min = S32_MIN; + off_max = S32_MAX; + } else { off = insn->off; + off_min = S16_MIN; + off_max = S16_MAX; + } if (curr < pos && curr + off + 1 >= end_old) off += delta; @@ -539,7 +546,7 @@ static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; - for (i = 0; i < fp->aux->func_cnt; i++) + for (i = 0; i < fp->aux->real_func_cnt; i++) bpf_prog_kallsyms_del(fp->aux->func[i]); } @@ -589,7 +596,7 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog) sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); /* prog->aux->name will be ignored if full btf name is available */ - if (prog->aux->func_info_cnt) { + if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) { type = btf_type_by_id(prog->aux->btf, prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); @@ -623,7 +630,11 @@ static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) if (val < ksym->start) return -1; - if (val >= ksym->end) + /* Ensure that we detect return addresses as part of the program, when + * the final instruction is a call for a program part of the stack + * trace. Therefore, do val > ksym->end instead of val >= ksym->end. + */ + if (val > ksym->end) return 1; return 0; @@ -679,6 +690,23 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) fp->aux->ksym.prog = true; bpf_ksym_add(&fp->aux->ksym); + +#ifdef CONFIG_FINEIBT + /* + * When FineIBT, code in the __cfi_foo() symbols can get executed + * and hence unwinder needs help. + */ + if (cfi_mode != CFI_FINEIBT) + return; + + snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN, + "__cfi_%s", fp->aux->ksym.name); + + fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16; + fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func; + + bpf_ksym_add(&fp->aux->ksym_prefix); +#endif } void bpf_prog_kallsyms_del(struct bpf_prog *fp) @@ -687,6 +715,11 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp) return; bpf_ksym_del(&fp->aux->ksym); +#ifdef CONFIG_FINEIBT + if (cfi_mode != CFI_FINEIBT) + return; + bpf_ksym_del(&fp->aux->ksym_prefix); +#endif } static struct bpf_ksym *bpf_ksym_find(unsigned long addr) @@ -733,7 +766,7 @@ bool is_bpf_text_address(unsigned long addr) return ret; } -static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) +struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { struct bpf_ksym *ksym = bpf_ksym_find(addr); @@ -870,7 +903,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins GFP_KERNEL); if (!pack) return NULL; - pack->ptr = module_alloc(BPF_PROG_PACK_SIZE); + pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); if (!pack->ptr) { kfree(pack); return NULL; @@ -894,7 +927,7 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { size = round_up(size, PAGE_SIZE); - ptr = module_alloc(size); + ptr = bpf_jit_alloc_exec(size); if (ptr) { bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); @@ -924,20 +957,20 @@ out: return ptr; } -void bpf_prog_pack_free(struct bpf_binary_header *hdr) +void bpf_prog_pack_free(void *ptr, u32 size) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; unsigned long pos; mutex_lock(&pack_mutex); - if (hdr->size > BPF_PROG_PACK_SIZE) { - module_memfree(hdr); + if (size > BPF_PROG_PACK_SIZE) { + bpf_jit_free_exec(ptr); goto out; } list_for_each_entry(tmp, &pack_list, list) { - if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) { + if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) { pack = tmp; break; } @@ -946,17 +979,17 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr) if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) goto out; - nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); - pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; + nbits = BPF_PROG_SIZE_TO_NBITS(size); + pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; - WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size), + WARN_ONCE(bpf_arch_text_invalidate(ptr, size), "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, BPF_PROG_CHUNK_COUNT, 0) == 0) { list_del(&pack->list); - module_memfree(pack->ptr); + bpf_jit_free_exec(pack->ptr); kfree(pack); } out: @@ -1096,8 +1129,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, *rw_header = kvmalloc(size, GFP_KERNEL); if (!*rw_header) { - bpf_arch_text_copy(&ro_header->size, &size, sizeof(size)); - bpf_prog_pack_free(ro_header); + bpf_prog_pack_free(ro_header, size); bpf_jit_uncharge_modmem(size); return NULL; } @@ -1128,7 +1160,7 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, kvfree(rw_header); if (IS_ERR(ptr)) { - bpf_prog_pack_free(ro_header); + bpf_prog_pack_free(ro_header, ro_header->size); return PTR_ERR(ptr); } return 0; @@ -1149,7 +1181,7 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, { u32 size = ro_header->size; - bpf_prog_pack_free(ro_header); + bpf_prog_pack_free(ro_header, size); kvfree(rw_header); bpf_jit_uncharge_modmem(size); } @@ -1208,7 +1240,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, if (!extra_pass) addr = NULL; else if (prog->aux->func && - off >= 0 && off < prog->aux->func_cnt) + off >= 0 && off < prog->aux->real_func_cnt) addr = (u8 *)prog->aux->func[off]->bpf_func; else return -EINVAL; @@ -2660,12 +2692,16 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; + bool sleepable; u32 i; + sleepable = aux->sleepable; for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); + if (sleepable) + atomic64_dec(&map->sleepable_refcnt); bpf_map_put(map); } } @@ -2721,7 +2757,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); - for (i = 0; i < aux->func_cnt; i++) { + for (i = 0; i < aux->real_func_cnt; i++) { /* We can just unlink the subprog poke descriptor table as * it was originally linked to the main program and is also * released along with it. @@ -2729,7 +2765,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); } - if (aux->func_cnt) { + if (aux->real_func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); } else { @@ -2914,6 +2950,15 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len) return -ENOTSUPP; } +bool __weak bpf_jit_supports_exceptions(void) +{ + return false; +} + +void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) +{ +} + #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e42a1bdb7f53..8a0bb80fe48a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -764,6 +764,16 @@ void __cpu_map_flush(void) } } +#ifdef CONFIG_DEBUG_NET +bool cpu_map_check_flush(void) +{ + if (list_empty(this_cpu_ptr(&cpu_map_flush_list))) + return false; + __cpu_map_flush(); + return true; +} +#endif + static int __init cpu_map_init(void) { int cpu; diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 6983af8e093c..2e73533a3811 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -34,9 +34,7 @@ static bool cpu_valid(u32 cpu) return cpu < nr_cpu_ids; } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global kfuncs as their definitions will be in BTF"); +__bpf_kfunc_start_defs(); /** * bpf_cpumask_create() - Create a mutable BPF cpumask. @@ -98,6 +96,12 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) migrate_enable(); } +__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask) +{ + bpf_cpumask_release(cpumask); +} +CFI_NOSEAL(bpf_cpumask_release_dtor); + /** * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask. * @cpumask: The cpumask being queried. @@ -407,7 +411,18 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, return cpumask_any_and_distribute(src1, src2); } -__diag_pop(); +/** + * bpf_cpumask_weight() - Return the number of bits in @cpumask. + * @cpumask: The cpumask being queried. + * + * Count the number of set bits in the given cpumask. + */ +__bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) +{ + return cpumask_weight(cpumask); +} + +__bpf_kfunc_end_defs(); BTF_SET8_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) @@ -434,6 +449,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) BTF_SET8_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { @@ -443,7 +459,7 @@ static const struct btf_kfunc_id_set cpumask_kfunc_set = { BTF_ID_LIST(cpumask_dtor_ids) BTF_ID(struct, bpf_cpumask) -BTF_ID(func, bpf_cpumask_release) +BTF_ID(func, bpf_cpumask_release_dtor) static int __init cpumask_kfunc_init(void) { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 4d42f6ed6c11..a936c704d4e7 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -418,6 +418,16 @@ void __dev_flush(void) } } +#ifdef CONFIG_DEBUG_NET +bool dev_check_flush(void) +{ + if (list_empty(this_cpu_ptr(&dev_flush_list))) + return false; + __dev_flush(); + return true; +} +#endif + /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index fa3e9225aedc..70fb82bf1637 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -150,14 +150,11 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, goto out; d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE); if (!d->rw_image) { - u32 size = PAGE_SIZE; - - bpf_arch_text_copy(d->image, &size, sizeof(size)); - bpf_prog_pack_free((struct bpf_binary_header *)d->image); + bpf_prog_pack_free(d->image, PAGE_SIZE); d->image = NULL; goto out; } - bpf_image_ksym_add(d->image, &d->ksym); + bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym); } prev_num_progs = d->num_progs; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a8c7e1c5abfa..03a6a2500b6a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -7,6 +7,7 @@ #include <linux/jhash.h> #include <linux/filter.h> #include <linux/rculist_nulls.h> +#include <linux/rcupdate_wait.h> #include <linux/random.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> @@ -155,13 +156,15 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); preempt_disable(); + local_irq_save(flags); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { __this_cpu_dec(*(htab->map_locked[hash])); + local_irq_restore(flags); preempt_enable(); return -EBUSY; } - raw_spin_lock_irqsave(&b->raw_lock, flags); + raw_spin_lock(&b->raw_lock); *pflags = flags; return 0; @@ -172,8 +175,9 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab, unsigned long flags) { hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); - raw_spin_unlock_irqrestore(&b->raw_lock, flags); + raw_spin_unlock(&b->raw_lock); __this_cpu_dec(*(htab->map_locked[hash])); + local_irq_restore(flags); preempt_enable(); } @@ -894,7 +898,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) if (map->ops->map_fd_put_ptr) { ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, true); } } @@ -2481,7 +2485,7 @@ static void fd_htab_map_free(struct bpf_map *map) hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { void *ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false); } } @@ -2520,9 +2524,15 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, if (IS_ERR(ptr)) return PTR_ERR(ptr); + /* The htab bucket lock is always held during update operations in fd + * htab map, and the following rcu_read_lock() is only used to avoid + * the WARN_ON_ONCE in htab_map_update_elem(). + */ + rcu_read_lock(); ret = htab_map_update_elem(map, key, &ptr, map_flags); + rcu_read_unlock(); if (ret) - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false); return ret; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8bd3812fb8df..be72824f32b2 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -22,6 +22,7 @@ #include <linux/security.h> #include <linux/btf_ids.h> #include <linux/bpf_mem_alloc.h> +#include <linux/kasan.h> #include "../../lib/kstrtox.h" @@ -31,12 +32,13 @@ * * Different map implementations will rely on rcu in map methods * lookup/update/delete, therefore eBPF programs must run under rcu lock - * if program is allowed to access maps, so check rcu_read_lock_held in - * all three functions. + * if program is allowed to access maps, so check rcu_read_lock_held() or + * rcu_read_lock_trace_held() in all three functions. */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -52,7 +54,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -69,7 +72,8 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return map->ops->map_delete_elem(map, key); } @@ -1176,13 +1180,6 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -EBUSY; goto out; } - if (!atomic64_read(&map->usercnt)) { - /* maps with timers must be either held by user space - * or pinned in bpffs. - */ - ret = -EPERM; - goto out; - } /* allocate hrtimer via map_kmalloc to use memcg accounting */ t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); if (!t) { @@ -1195,7 +1192,21 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map rcu_assign_pointer(t->callback_fn, NULL); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; - timer->timer = t; + WRITE_ONCE(timer->timer, t); + /* Guarantee the order between timer->timer and map->usercnt. So + * when there are concurrent uref release and bpf timer init, either + * bpf_timer_cancel_and_free() called by uref release reads a no-NULL + * timer or atomic64_read() below returns a zero usercnt. + */ + smp_mb(); + if (!atomic64_read(&map->usercnt)) { + /* maps with timers must be either held by user space + * or pinned in bpffs. + */ + WRITE_ONCE(timer->timer, NULL); + kfree(t); + ret = -EPERM; + } out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; @@ -1271,7 +1282,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla if (in_nmi()) return -EOPNOTSUPP; - if (flags > BPF_F_TIMER_ABS) + if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; @@ -1285,6 +1296,9 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla else mode = HRTIMER_MODE_REL_SOFT; + if (flags & BPF_F_TIMER_CPU_PIN) + mode |= HRTIMER_MODE_PINNED; + hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); out: __bpf_spin_unlock_irqrestore(&timer->lock); @@ -1370,7 +1384,7 @@ void bpf_timer_cancel_and_free(void *val) /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ - timer->timer = NULL; + WRITE_ONCE(timer->timer, NULL); out: __bpf_spin_unlock_irqrestore(&timer->lock); if (!t) @@ -1807,8 +1821,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) } } -void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); - void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { @@ -1840,7 +1852,7 @@ unlock: * bpf_list_head which needs to be freed. */ migrate_disable(); - __bpf_obj_drop_impl(obj, field->graph_root.value_rec); + __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); migrate_enable(); } } @@ -1879,14 +1891,12 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, migrate_disable(); - __bpf_obj_drop_impl(obj, field->graph_root.value_rec); + __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); migrate_enable(); } } -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) { @@ -1902,9 +1912,19 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) return p; } +__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) +{ + u64 size = local_type_id__k; + + /* The verifier has ensured that meta__ign must be NULL */ + return bpf_mem_alloc(&bpf_global_percpu_ma, size); +} + /* Must be called under migrate_disable(), as required by bpf_mem_free */ -void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) { + struct bpf_mem_alloc *ma; + if (rec && rec->refcount_off >= 0 && !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { /* Object is refcounted and refcount_dec didn't result in 0 @@ -1916,10 +1936,11 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) if (rec) bpf_obj_free_fields(rec, p); - if (rec && rec->refcount_off >= 0) - bpf_mem_free_rcu(&bpf_global_ma, p); + if (percpu) + ma = &bpf_global_percpu_ma; else - bpf_mem_free(&bpf_global_ma, p); + ma = &bpf_global_ma; + bpf_mem_free_rcu(ma, p); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) @@ -1927,7 +1948,13 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) struct btf_struct_meta *meta = meta__ign; void *p = p__alloc; - __bpf_obj_drop_impl(p, meta ? meta->record : NULL); + __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false); +} + +__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) +{ + /* The verifier has ensured that meta__ign must be NULL */ + bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc); } __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) @@ -1965,7 +1992,7 @@ static int __bpf_list_add(struct bpf_list_node_kern *node, */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } @@ -2064,7 +2091,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec); + __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } @@ -2123,6 +2150,12 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p) put_task_struct_rcu_user(p); } +__bpf_kfunc void bpf_task_release_dtor(void *p) +{ + put_task_struct_rcu_user(p); +} +CFI_NOSEAL(bpf_task_release_dtor); + #ifdef CONFIG_CGROUPS /** * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by @@ -2147,6 +2180,12 @@ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp) cgroup_put(cgrp); } +__bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp) +{ + cgroup_put(cgrp); +} +CFI_NOSEAL(bpf_cgroup_release_dtor); + /** * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor * array. A cgroup returned by this kfunc which is not subsequently stored in a @@ -2197,7 +2236,31 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) { - return task_under_cgroup_hierarchy(task, ancestor); + long ret; + + rcu_read_lock(); + ret = task_under_cgroup_hierarchy(task, ancestor); + rcu_read_unlock(); + return ret; +} + +/** + * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a + * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its + * hierarchy ID. + * @task: The target task + * @hierarchy_id: The ID of a cgroup1 hierarchy + * + * On success, the cgroup is returen. On failure, NULL is returned. + */ +__bpf_kfunc struct cgroup * +bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) +{ + struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id); + + if (IS_ERR(cgrp)) + return NULL; + return cgrp; } #endif /* CONFIG_CGROUPS */ @@ -2435,15 +2498,60 @@ __bpf_kfunc void bpf_rcu_read_unlock(void) rcu_read_unlock(); } -__diag_pop(); +struct bpf_throw_ctx { + struct bpf_prog_aux *aux; + u64 sp; + u64 bp; + int cnt; +}; + +static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct bpf_throw_ctx *ctx = cookie; + struct bpf_prog *prog; + + if (!is_bpf_text_address(ip)) + return !ctx->cnt; + prog = bpf_prog_ksym_find(ip); + ctx->cnt++; + if (bpf_is_subprog(prog)) + return true; + ctx->aux = prog->aux; + ctx->sp = sp; + ctx->bp = bp; + return false; +} + +__bpf_kfunc void bpf_throw(u64 cookie) +{ + struct bpf_throw_ctx ctx = {}; + + arch_bpf_stack_walk(bpf_stack_walker, &ctx); + WARN_ON_ONCE(!ctx.aux); + if (ctx.aux) + WARN_ON_ONCE(!ctx.aux->exception_boundary); + WARN_ON_ONCE(!ctx.bp); + WARN_ON_ONCE(!ctx.cnt); + /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning + * deeper stack depths than ctx.sp as we do not return from bpf_throw, + * which skips compiler generated instrumentation to do the same. + */ + kasan_unpoison_task_stack_below((void *)(long)ctx.sp); + ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); + WARN(1, "A call to BPF exception callback should never return\n"); +} + +__bpf_kfunc_end_defs(); BTF_SET8_START(generic_btf_ids) #ifdef CONFIG_KEXEC_CORE BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) @@ -2460,8 +2568,10 @@ BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) +BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_throw) BTF_SET8_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { @@ -2472,10 +2582,10 @@ static const struct btf_kfunc_id_set generic_kfunc_set = { BTF_ID_LIST(generic_dtor_ids) BTF_ID(struct, task_struct) -BTF_ID(func, bpf_task_release) +BTF_ID(func, bpf_task_release_dtor) #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) -BTF_ID(func, bpf_cgroup_release) +BTF_ID(func, bpf_cgroup_release_dtor) #endif BTF_SET8_START(common_btf_ids) @@ -2488,6 +2598,20 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) +BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) +#ifdef CONFIG_CGROUPS +BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) +#endif +BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) @@ -2518,6 +2642,7 @@ static int __init kfunc_init(void) ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, ARRAY_SIZE(generic_dtors), @@ -2526,3 +2651,22 @@ static int __init kfunc_init(void) } late_initcall(kfunc_init); + +/* Get a pointer to dynptr data up to len bytes for read only access. If + * the dynptr doesn't have continuous data up to len bytes, return NULL. + */ +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len) +{ + return bpf_dynptr_slice(ptr, 0, NULL, len); +} + +/* Get a pointer to dynptr data up to len bytes for read write access. If + * the dynptr doesn't have continuous data up to len bytes, or the dynptr + * is read only, return NULL. + */ +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len) +{ + if (__bpf_dynptr_is_rdonly(ptr)) + return NULL; + return (void *)__bpf_dynptr_data(ptr, len); +} diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 99d0625b6c82..41e0a55c35f5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -118,8 +118,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, return ERR_PTR(-ENOSPC); inode->i_ino = get_next_ino(); - inode->i_atime = inode_set_ctime_current(inode); - inode->i_mtime = inode->i_atime; + simple_inode_init_ts(inode); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); @@ -147,7 +146,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, d_instantiate(dentry, inode); dget(dentry); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, @@ -600,8 +599,15 @@ EXPORT_SYMBOL(bpf_prog_get_type_path); */ static int bpf_show_options(struct seq_file *m, struct dentry *root) { - umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + struct inode *inode = d_inode(root); + umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX; + if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, inode->i_uid)); + if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, inode->i_gid)); if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); return 0; @@ -626,15 +632,21 @@ static const struct super_operations bpf_super_ops = { }; enum { + OPT_UID, + OPT_GID, OPT_MODE, }; static const struct fs_parameter_spec bpf_fs_parameters[] = { + fsparam_u32 ("uid", OPT_UID), + fsparam_u32 ("gid", OPT_GID), fsparam_u32oct ("mode", OPT_MODE), {} }; struct bpf_mount_opts { + kuid_t uid; + kgid_t gid; umode_t mode; }; @@ -642,6 +654,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct bpf_mount_opts *opts = fc->fs_private; struct fs_parse_result result; + kuid_t uid; + kgid_t gid; int opt; opt = fs_parse(fc, bpf_fs_parameters, param, &result); @@ -663,12 +677,42 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) } switch (opt) { + case OPT_UID: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + goto bad_value; + + /* + * The requested uid must be representable in the + * filesystem's idmapping. + */ + if (!kuid_has_mapping(fc->user_ns, uid)) + goto bad_value; + + opts->uid = uid; + break; + case OPT_GID: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + goto bad_value; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fc->user_ns, gid)) + goto bad_value; + + opts->gid = gid; + break; case OPT_MODE: opts->mode = result.uint_32 & S_IALLUGO; break; } return 0; +bad_value: + return invalfc(fc, "Bad value for '%s'", param->key); } struct bpf_preload_ops *bpf_preload_ops; @@ -751,6 +795,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &bpf_super_ops; inode = sb->s_root->d_inode; + inode->i_uid = opts->uid; + inode->i_gid = opts->gid; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; populate_bpffs(sb->s_root); @@ -786,6 +832,8 @@ static int bpf_init_fs_context(struct fs_context *fc) return -ENOMEM; opts->mode = S_IRWXUGO; + opts->uid = current_fsuid(); + opts->gid = current_fsgid(); fc->fs_private = opts; fc->ops = &bpf_context_ops; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 850494423530..594a234f122b 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -10,6 +10,8 @@ #include <linux/bpf_verifier.h> #include <linux/math64.h> +#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) + static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) { /* ubuf and len_total should both be specified (or not) together */ @@ -325,3 +327,505 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, va_end(args); } EXPORT_SYMBOL_GPL(bpf_log); + +static const struct bpf_line_info * +find_linfo(const struct bpf_verifier_env *env, u32 insn_off) +{ + const struct bpf_line_info *linfo; + const struct bpf_prog *prog; + u32 i, nr_linfo; + + prog = env->prog; + nr_linfo = prog->aux->nr_linfo; + + if (!nr_linfo || insn_off >= prog->len) + return NULL; + + linfo = prog->aux->linfo; + for (i = 1; i < nr_linfo; i++) + if (insn_off < linfo[i].insn_off) + break; + + return &linfo[i - 1]; +} + +static const char *ltrim(const char *s) +{ + while (isspace(*s)) + s++; + + return s; +} + +__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...) +{ + const struct bpf_line_info *linfo; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + linfo = find_linfo(env, insn_off); + if (!linfo || linfo == env->prev_linfo) + return; + + if (prefix_fmt) { + va_list args; + + va_start(args, prefix_fmt); + bpf_verifier_vlog(&env->log, prefix_fmt, args); + va_end(args); + } + + verbose(env, "%s\n", + ltrim(btf_name_by_offset(env->prog->aux->btf, + linfo->line_off))); + + env->prev_linfo = linfo; +} + +static const char *btf_type_name(const struct btf *btf, u32 id) +{ + return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); +} + +/* string representation of 'enum bpf_reg_type' + * + * Note that reg_type_str() can not appear more than once in a single verbose() + * statement. + */ +const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) +{ + char postfix[16] = {0}, prefix[64] = {0}; + static const char * const str[] = { + [NOT_INIT] = "?", + [SCALAR_VALUE] = "scalar", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_STACK] = "fp", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", + [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_MEM] = "mem", + [PTR_TO_BUF] = "buf", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", + [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", + }; + + if (type & PTR_MAYBE_NULL) { + if (base_type(type) == PTR_TO_BTF_ID) + strncpy(postfix, "or_null_", 16); + else + strncpy(postfix, "_or_null", 16); + } + + snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", + type & MEM_RDONLY ? "rdonly_" : "", + type & MEM_RINGBUF ? "ringbuf_" : "", + type & MEM_USER ? "user_" : "", + type & MEM_PERCPU ? "percpu_" : "", + type & MEM_RCU ? "rcu_" : "", + type & PTR_UNTRUSTED ? "untrusted_" : "", + type & PTR_TRUSTED ? "trusted_" : "" + ); + + snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s", + prefix, str[base_type(type)], postfix); + return env->tmp_str_buf; +} + +const char *dynptr_type_str(enum bpf_dynptr_type type) +{ + switch (type) { + case BPF_DYNPTR_TYPE_LOCAL: + return "local"; + case BPF_DYNPTR_TYPE_RINGBUF: + return "ringbuf"; + case BPF_DYNPTR_TYPE_SKB: + return "skb"; + case BPF_DYNPTR_TYPE_XDP: + return "xdp"; + case BPF_DYNPTR_TYPE_INVALID: + return "<invalid>"; + default: + WARN_ONCE(1, "unknown dynptr type %d\n", type); + return "<unknown>"; + } +} + +const char *iter_type_str(const struct btf *btf, u32 btf_id) +{ + if (!btf || btf_id == 0) + return "<invalid>"; + + /* we already validated that type is valid and has conforming name */ + return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; +} + +const char *iter_state_str(enum bpf_iter_state state) +{ + switch (state) { + case BPF_ITER_STATE_ACTIVE: + return "active"; + case BPF_ITER_STATE_DRAINED: + return "drained"; + case BPF_ITER_STATE_INVALID: + return "<invalid>"; + default: + WARN_ONCE(1, "unknown iter state %d\n", state); + return "<unknown>"; + } +} + +static char slot_type_char[] = { + [STACK_INVALID] = '?', + [STACK_SPILL] = 'r', + [STACK_MISC] = 'm', + [STACK_ZERO] = '0', + [STACK_DYNPTR] = 'd', + [STACK_ITER] = 'i', +}; + +static void print_liveness(struct bpf_verifier_env *env, + enum bpf_reg_liveness live) +{ + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) + verbose(env, "_"); + if (live & REG_LIVE_READ) + verbose(env, "r"); + if (live & REG_LIVE_WRITTEN) + verbose(env, "w"); + if (live & REG_LIVE_DONE) + verbose(env, "D"); +} + +#define UNUM_MAX_DECIMAL U16_MAX +#define SNUM_MAX_DECIMAL S16_MAX +#define SNUM_MIN_DECIMAL S16_MIN + +static bool is_unum_decimal(u64 num) +{ + return num <= UNUM_MAX_DECIMAL; +} + +static bool is_snum_decimal(s64 num) +{ + return num >= SNUM_MIN_DECIMAL && num <= SNUM_MAX_DECIMAL; +} + +static void verbose_unum(struct bpf_verifier_env *env, u64 num) +{ + if (is_unum_decimal(num)) + verbose(env, "%llu", num); + else + verbose(env, "%#llx", num); +} + +static void verbose_snum(struct bpf_verifier_env *env, s64 num) +{ + if (is_snum_decimal(num)) + verbose(env, "%lld", num); + else + verbose(env, "%#llx", num); +} + +int tnum_strn(char *str, size_t size, struct tnum a) +{ + /* print as a constant, if tnum is fully known */ + if (a.mask == 0) { + if (is_unum_decimal(a.value)) + return snprintf(str, size, "%llu", a.value); + else + return snprintf(str, size, "%#llx", a.value); + } + return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); +} +EXPORT_SYMBOL_GPL(tnum_strn); + +static void print_scalar_ranges(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + const char **sep) +{ + /* For signed ranges, we want to unify 64-bit and 32-bit values in the + * output as much as possible, but there is a bit of a complication. + * If we choose to print values as decimals, this is natural to do, + * because negative 64-bit and 32-bit values >= -S32_MIN have the same + * representation due to sign extension. But if we choose to print + * them in hex format (see is_snum_decimal()), then sign extension is + * misleading. + * E.g., smin=-2 and smin32=-2 are exactly the same in decimal, but in + * hex they will be smin=0xfffffffffffffffe and smin32=0xfffffffe, two + * very different numbers. + * So we avoid sign extension if we choose to print values in hex. + */ + struct { + const char *name; + u64 val; + bool omit; + } minmaxs[] = { + {"smin", reg->smin_value, reg->smin_value == S64_MIN}, + {"smax", reg->smax_value, reg->smax_value == S64_MAX}, + {"umin", reg->umin_value, reg->umin_value == 0}, + {"umax", reg->umax_value, reg->umax_value == U64_MAX}, + {"smin32", + is_snum_decimal((s64)reg->s32_min_value) + ? (s64)reg->s32_min_value + : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN}, + {"smax32", + is_snum_decimal((s64)reg->s32_max_value) + ? (s64)reg->s32_max_value + : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX}, + {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, + {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, + }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; + bool neg1, neg2; + + for (m1 = &minmaxs[0]; m1 < mend; m1++) { + if (m1->omit) + continue; + + neg1 = m1->name[0] == 's' && (s64)m1->val < 0; + + verbose(env, "%s%s=", *sep, m1->name); + *sep = ","; + + for (m2 = m1 + 2; m2 < mend; m2 += 2) { + if (m2->omit || m2->val != m1->val) + continue; + /* don't mix negatives with positives */ + neg2 = m2->name[0] == 's' && (s64)m2->val < 0; + if (neg2 != neg1) + continue; + m2->omit = true; + verbose(env, "%s=", m2->name); + } + + if (m1->name[0] == 's') + verbose_snum(env, m1->val); + else + verbose_unum(env, m1->val); + } +} + +static bool type_is_map_ptr(enum bpf_reg_type t) { + switch (base_type(t)) { + case CONST_PTR_TO_MAP: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + return true; + default: + return false; + } +} + +/* + * _a stands for append, was shortened to avoid multiline statements below. + * This macro is used to output a comma separated list of attributes. + */ +#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; }) + +static void print_reg_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state, + const struct bpf_reg_state *reg) +{ + enum bpf_reg_type t; + const char *sep = ""; + + t = reg->type; + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) { + /* reg->off should be 0 for SCALAR_VALUE */ + verbose_snum(env, reg->var_off.value + reg->off); + return; + } + + verbose(env, "%s", reg_type_str(env, t)); + if (t == PTR_TO_STACK) { + if (state->frameno != reg->frameno) + verbose(env, "[%d]", reg->frameno); + if (tnum_is_const(reg->var_off)) { + verbose_snum(env, reg->var_off.value + reg->off); + return; + } + } + if (base_type(t) == PTR_TO_BTF_ID) + verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); + verbose(env, "("); + if (reg->id) + verbose_a("id=%d", reg->id); + if (reg->ref_obj_id) + verbose_a("ref_obj_id=%d", reg->ref_obj_id); + if (type_is_non_owning_ref(reg->type)) + verbose_a("%s", "non_own_ref"); + if (type_is_map_ptr(t)) { + if (reg->map_ptr->name[0]) + verbose_a("map=%s", reg->map_ptr->name); + verbose_a("ks=%d,vs=%d", + reg->map_ptr->key_size, + reg->map_ptr->value_size); + } + if (t != SCALAR_VALUE && reg->off) { + verbose_a("off="); + verbose_snum(env, reg->off); + } + if (type_is_pkt_pointer(t)) { + verbose_a("r="); + verbose_unum(env, reg->range); + } + if (base_type(t) == PTR_TO_MEM) { + verbose_a("sz="); + verbose_unum(env, reg->mem_size); + } + if (t == CONST_PTR_TO_DYNPTR) + verbose_a("type=%s", dynptr_type_str(reg->dynptr.type)); + if (tnum_is_const(reg->var_off)) { + /* a pointer register with fixed offset */ + if (reg->var_off.value) { + verbose_a("imm="); + verbose_snum(env, reg->var_off.value); + } + } else { + print_scalar_ranges(env, reg, &sep); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose_a("var_off=%s", tn_buf); + } + } + verbose(env, ")"); +} + +void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, + bool print_all) +{ + const struct bpf_reg_state *reg; + int i; + + if (state->frameno) + verbose(env, " frame%d:", state->frameno); + for (i = 0; i < MAX_BPF_REG; i++) { + reg = &state->regs[i]; + if (reg->type == NOT_INIT) + continue; + if (!print_all && !reg_scratched(env, i)) + continue; + verbose(env, " R%d", i); + print_liveness(env, reg->live); + verbose(env, "="); + print_reg_state(env, state, reg); + } + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + char types_buf[BPF_REG_SIZE + 1]; + const char *sep = ""; + bool valid = false; + u8 slot_type; + int j; + + if (!print_all && !stack_slot_scratched(env, i)) + continue; + + for (j = 0; j < BPF_REG_SIZE; j++) { + slot_type = state->stack[i].slot_type[j]; + if (slot_type != STACK_INVALID) + valid = true; + types_buf[j] = slot_type_char[slot_type]; + } + types_buf[BPF_REG_SIZE] = 0; + if (!valid) + continue; + + reg = &state->stack[i].spilled_ptr; + switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { + case STACK_SPILL: + /* print MISC/ZERO/INVALID slots above subreg spill */ + for (j = 0; j < BPF_REG_SIZE; j++) + if (state->stack[i].slot_type[j] == STACK_SPILL) + break; + types_buf[j] = '\0'; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=%s", types_buf); + print_reg_state(env, state, reg); + break; + case STACK_DYNPTR: + /* skip to main dynptr slot */ + i += BPF_DYNPTR_NR_SLOTS - 1; + reg = &state->stack[i].spilled_ptr; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); + if (reg->id) + verbose_a("id=%d", reg->id); + if (reg->ref_obj_id) + verbose_a("ref_id=%d", reg->ref_obj_id); + if (reg->dynptr_id) + verbose_a("dynptr_id=%d", reg->dynptr_id); + verbose(env, ")"); + break; + case STACK_ITER: + /* only main slot has ref_obj_id set; skip others */ + if (!reg->ref_obj_id) + continue; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", + iter_type_str(reg->iter.btf, reg->iter.btf_id), + reg->ref_obj_id, iter_state_str(reg->iter.state), + reg->iter.depth); + break; + case STACK_MISC: + case STACK_ZERO: + default: + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=%s", types_buf); + break; + } + } + if (state->acquired_refs && state->refs[0].id) { + verbose(env, " refs=%d", state->refs[0].id); + for (i = 1; i < state->acquired_refs; i++) + if (state->refs[i].id) + verbose(env, ",%d", state->refs[i].id); + } + if (state->in_callback_fn) + verbose(env, " cb"); + if (state->in_async_callback_fn) + verbose(env, " async_cb"); + verbose(env, "\n"); + if (!print_all) + mark_verifier_state_clean(env); +} + +static inline u32 vlog_alignment(u32 pos) +{ + return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), + BPF_LOG_MIN_ALIGNMENT) - pos - 1; +} + +void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state) +{ + if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { + /* remove new line character */ + bpf_vlog_reset(&env->log, env->prev_log_pos - 1); + verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' '); + } else { + verbose(env, "%d:", env->insn_idx); + } + print_verifier_state(env, state, false); +} diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 17c7e7782a1f..b32be680da6c 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -231,6 +231,9 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) struct lpm_trie_node *node, *found = NULL; struct bpf_lpm_trie_key *key = _key; + if (key->prefixlen > trie->max_prefixlen) + return NULL; + /* Start walking the trie from the root node ... */ for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held()); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index cd5eafaba97e..8ef269e66ba5 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -127,12 +127,21 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, return inner_map; } -void bpf_map_fd_put_ptr(void *ptr) +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { - /* ptr->ops->map_free() has to go through one - * rcu grace period by itself. + struct bpf_map *inner_map = ptr; + + /* Defer the freeing of inner map according to the sleepable attribute + * of bpf program which owns the outer map, so unnecessary waiting for + * RCU tasks trace grace period can be avoided. */ - bpf_map_put(ptr); + if (need_defer) { + if (atomic64_read(&map->sleepable_refcnt)) + WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + else + WRITE_ONCE(inner_map->free_after_rcu_gp, true); + } + bpf_map_put(inner_map); } u32 bpf_map_fd_sys_lookup_elem(void *ptr) diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index bcb7534afb3c..7d61602354de 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); void bpf_map_meta_free(struct bpf_map *map_meta); void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); -void bpf_map_fd_put_ptr(void *ptr); +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer); u32 bpf_map_fd_sys_lookup_elem(void *ptr); #endif diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6fc9dae9edc8..6abd7c5df4b3 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -193,9 +193,7 @@ static int __init bpf_map_iter_init(void) late_initcall(bpf_map_iter_init); -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "Global functions as their definitions will be in vmlinux BTF"); +__bpf_kfunc_start_defs(); __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) { @@ -213,7 +211,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) return ret; } -__diag_pop(); +__bpf_kfunc_end_defs(); BTF_SET8_START(bpf_map_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 9c49ae53deaf..550f02e2cb13 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -121,6 +121,8 @@ struct bpf_mem_caches { struct bpf_mem_cache cache[NUM_CACHES]; }; +static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + static struct llist_node notrace *__llist_del_first(struct llist_head *head) { struct llist_node *entry, *next; @@ -340,6 +342,7 @@ static void free_bulk(struct bpf_mem_cache *c) int cnt; WARN_ON_ONCE(tgt->unit_size != c->unit_size); + WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); do { inc_active(c, &flags); @@ -365,6 +368,9 @@ static void __free_by_rcu(struct rcu_head *head) struct bpf_mem_cache *tgt = c->tgt; struct llist_node *llnode; + WARN_ON_ONCE(tgt->unit_size != c->unit_size); + WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); + llnode = llist_del_all(&c->waiting_for_gp); if (!llnode) goto out; @@ -458,12 +464,17 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c) * consume ~ 11 Kbyte per cpu. * Typical case will be between 11K and 116K closer to 11K. * bpf progs can and should share bpf_mem_cache when possible. + * + * Percpu allocation is typically rare. To avoid potential unnecessary large + * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1. */ - -static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +static void init_refill_work(struct bpf_mem_cache *c) { init_irq_work(&c->refill_work, bpf_mem_refill); - if (c->unit_size <= 256) { + if (c->percpu_size) { + c->low_watermark = 1; + c->high_watermark = 3; + } else if (c->unit_size <= 256) { c->low_watermark = 32; c->high_watermark = 96; } else { @@ -476,12 +487,20 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) c->high_watermark = max(96 * 256 / c->unit_size, 3); } c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); +} - /* To avoid consuming memory assume that 1st run of bpf - * prog won't be doing more than 4 map_update_elem from - * irq disabled region +static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +{ + int cnt = 1; + + /* To avoid consuming memory, for non-percpu allocation, assume that + * 1st run of bpf prog won't be doing more than 4 map_update_elem from + * irq disabled region if unit size is less than or equal to 256. + * For all other cases, let us just do one allocation. */ - alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false); + if (!c->percpu_size && c->unit_size <= 256) + cnt = 4; + alloc_bulk(c, cnt, cpu_to_node(cpu), false); } /* When size != 0 bpf_mem_cache for each cpu. @@ -493,21 +512,25 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { - static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; struct obj_cgroup *objcg = NULL; int cpu, i, unit_size, percpu_size = 0; + if (percpu && size == 0) + return -EINVAL; + + /* room for llist_node and per-cpu pointer */ + if (percpu) + percpu_size = LLIST_NODE_SZ + sizeof(void *); + ma->percpu = percpu; + if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); if (!pc) return -ENOMEM; - if (percpu) - /* room for llist_node and per-cpu pointer */ - percpu_size = LLIST_NODE_SZ + sizeof(void *); - else + if (!percpu) size += LLIST_NODE_SZ; /* room for llist_node */ unit_size = size; @@ -515,39 +538,93 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) if (memcg_bpf_enabled()) objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; + for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); c->unit_size = unit_size; c->objcg = objcg; c->percpu_size = percpu_size; c->tgt = c; + init_refill_work(c); prefill_mem_cache(c, cpu); } ma->cache = pc; return 0; } - /* size == 0 && percpu is an invalid combination */ - if (WARN_ON_ONCE(percpu)) - return -EINVAL; - pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; #ifdef CONFIG_MEMCG_KMEM objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; for_each_possible_cpu(cpu) { cc = per_cpu_ptr(pcc, cpu); for (i = 0; i < NUM_CACHES; i++) { c = &cc->cache[i]; c->unit_size = sizes[i]; c->objcg = objcg; + c->percpu_size = percpu_size; c->tgt = c; + + init_refill_work(c); prefill_mem_cache(c, cpu); } } + + ma->caches = pcc; + return 0; +} + +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) +{ + struct bpf_mem_caches __percpu *pcc; + + pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL); + if (!pcc) + return -ENOMEM; + ma->caches = pcc; + ma->objcg = objcg; + ma->percpu = true; + return 0; +} + +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) +{ + struct bpf_mem_caches *cc, __percpu *pcc; + int cpu, i, unit_size, percpu_size; + struct obj_cgroup *objcg; + struct bpf_mem_cache *c; + + i = bpf_mem_cache_idx(size); + if (i < 0) + return -EINVAL; + + /* room for llist_node and per-cpu pointer */ + percpu_size = LLIST_NODE_SZ + sizeof(void *); + + unit_size = sizes[i]; + objcg = ma->objcg; + pcc = ma->caches; + + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(pcc, cpu); + c = &cc->cache[i]; + if (c->unit_size) + break; + + c->unit_size = unit_size; + c->objcg = objcg; + c->percpu_size = percpu_size; + c->tgt = c; + + init_refill_work(c); + prefill_mem_cache(c, cpu); + } + return 0; } @@ -682,9 +759,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } - /* objcg is the same across cpus */ - if (c->objcg) - obj_cgroup_put(c->objcg); + if (ma->objcg) + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { @@ -700,8 +776,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } - if (c->objcg) - obj_cgroup_put(c->objcg); + if (ma->objcg) + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } } @@ -734,12 +810,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c) } } local_dec(&c->active); - local_irq_restore(flags); WARN_ON(cnt < 0); if (cnt < c->low_watermark) irq_work_raise(c); + /* Enable IRQ after the enqueue of irq work completes, so irq work + * will run after IRQ is enabled and free_llist may be refilled by + * irq work before other task preempts current task. + */ + local_irq_restore(flags); + return llnode; } @@ -775,11 +856,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) llist_add(llnode, &c->free_llist_extra); } local_dec(&c->active); - local_irq_restore(flags); if (cnt > c->high_watermark) /* free few objects from current cpu into global kmalloc pool */ irq_work_raise(c); + /* Enable IRQ after irq_work_raise() completes, otherwise when current + * task is preempted by task which does unit_alloc(), unit_alloc() may + * return NULL unexpectedly because irq work is already pending but can + * not been triggered and free_llist can not be refilled timely. + */ + local_irq_restore(flags); } static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) @@ -797,10 +883,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) llist_add(llnode, &c->free_llist_extra_rcu); } local_dec(&c->active); - local_irq_restore(flags); if (!atomic_read(&c->call_rcu_in_progress)) irq_work_raise(c); + local_irq_restore(flags); } /* Called from BPF program or from sys_bpf syscall. @@ -812,9 +898,11 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) void *ret; if (!size) - return ZERO_SIZE_PTR; + return NULL; - idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ); + if (!ma->percpu) + size += LLIST_NODE_SZ; + idx = bpf_mem_cache_idx(size); if (idx < 0) return NULL; @@ -824,13 +912,15 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) { + struct bpf_mem_cache *c; int idx; if (!ptr) return; - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); - if (idx < 0) + c = *(void **)(ptr - LLIST_NODE_SZ); + idx = bpf_mem_cache_idx(c->unit_size); + if (WARN_ON_ONCE(idx < 0)) return; unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); @@ -838,13 +928,15 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) { + struct bpf_mem_cache *c; int idx; if (!ptr) return; - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); - if (idx < 0) + c = *(void **)(ptr - LLIST_NODE_SZ); + idx = bpf_mem_cache_idx(c->unit_size); + if (WARN_ON_ONCE(idx < 0)) return; unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); @@ -910,6 +1002,8 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) memcg = get_memcg(c); old_memcg = set_active_memcg(memcg); ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); + if (ret) + *(struct bpf_mem_cache **)ret = c; set_active_memcg(old_memcg); mem_cgroup_put(memcg); } diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c index 32d2c4829eb8..1394168062e8 100644 --- a/kernel/bpf/mprog.c +++ b/kernel/bpf/mprog.c @@ -253,6 +253,9 @@ int bpf_mprog_attach(struct bpf_mprog_entry *entry, goto out; } idx = tidx; + } else if (bpf_mprog_total(entry) == bpf_mprog_max()) { + ret = -ERANGE; + goto out; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); @@ -398,14 +401,16 @@ int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct bpf_mprog_cp *cp; struct bpf_prog *prog; const u32 flags = 0; + u32 id, count = 0; + u64 revision = 1; int i, ret = 0; - u32 id, count; - u64 revision; if (attr->query.query_flags || attr->query.attach_flags) return -EINVAL; - revision = bpf_mprog_revision(entry); - count = bpf_mprog_total(entry); + if (entry) { + revision = bpf_mprog_revision(entry); + count = bpf_mprog_total(entry); + } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 3e4f2ec1af06..1a4fec330eaa 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -199,12 +199,14 @@ static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *n offload->netdev = netdev; ondev = bpf_offload_find_netdev(offload->netdev); + /* When program is offloaded require presence of "true" + * bpf_offload_netdev, avoid the one created for !ondev case below. + */ + if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) { + err = -EINVAL; + goto err_free; + } if (!ondev) { - if (bpf_prog_is_offloaded(prog->aux)) { - err = -EINVAL; - goto err_free; - } - /* When only binding to the device, explicitly * create an entry in the hashtable. */ @@ -232,7 +234,14 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) attr->prog_type != BPF_PROG_TYPE_XDP) return -EINVAL; - if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY) + if (attr->prog_flags & ~(BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_XDP_HAS_FRAGS)) + return -EINVAL; + + /* Frags are allowed only if program is dev-bound-only, but not + * if it is requesting bpf offload. + */ + if (attr->prog_flags & BPF_F_XDP_HAS_FRAGS && + !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)) return -EINVAL; if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS && @@ -845,10 +854,11 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id) if (!ops) goto out; - if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) - p = ops->xmo_rx_timestamp; - else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH)) - p = ops->xmo_rx_hash; +#define XDP_METADATA_KFUNC(name, _, __, xmo) \ + if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo; + XDP_METADATA_KFUNC_xxx +#undef XDP_METADATA_KFUNC + out: up_read(&bpf_devs_lock); diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8d2ddcb7566b..d869f51ea93a 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -98,7 +98,12 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete) int err = 0; void *ptr; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -128,7 +133,12 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete) void *ptr; u32 index; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -193,7 +203,12 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, if (flags & BPF_NOEXIST || flags > BPF_EXIST) return -EINVAL; - raw_spin_lock_irqsave(&qs->lock, irq_flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, irq_flags); + } if (queue_stack_map_is_full(qs)) { if (!replace) { diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index f045fde632e5..0ee653a936ea 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -770,8 +770,7 @@ schedule_work_return: /* Prevent the clearing of the busy-bit from being reordered before the * storing of any rb consumer or producer positions. */ - smp_mb__before_atomic(); - atomic_set(&rb->busy, 0); + atomic_set_release(&rb->busy, 0); if (flags & BPF_RB_FORCE_WAKEUP) irq_work_queue(&rb->work); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 458bb80b14d5..dff7ba539701 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -28,7 +28,7 @@ struct bpf_stack_map { void *elems; struct pcpu_freelist freelist; u32 n_buckets; - struct stack_map_bucket *buckets[]; + struct stack_map_bucket *buckets[] __counted_by(n_buckets); }; static inline bool stack_map_use_build_id(struct bpf_map *map) @@ -388,6 +388,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, { u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; + bool crosstask = task && task != current; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; @@ -410,6 +411,14 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, if (task && user && !user_mode(regs)) goto err_fault; + /* get_perf_callchain does not support crosstask user stack walking + * but returns an empty stack instead of NULL. + */ + if (crosstask && user) { + err = -EOPNOTSUPP; + goto clear; + } + num_elem = size / elem_size; max_depth = num_elem + skip; if (sysctl_perf_event_max_stack < max_depth) @@ -421,7 +430,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, 0, kernel, user, max_depth, - false, false); + crosstask, false); if (unlikely(!trace)) goto err_fault; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ebeb0695305a..a1f18681721c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,8 +35,9 @@ #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> -#include <net/netfilter/nf_bpf_link.h> +#include <net/netfilter/nf_bpf_link.h> +#include <net/netkit.h> #include <net/tcx.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -141,9 +142,13 @@ static u32 bpf_map_value_size(const struct bpf_map *map) static void maybe_wait_bpf_programs(struct bpf_map *map) { - /* Wait for any running BPF programs to complete so that - * userspace, when we return to it, knows that all programs - * that could be running use the new map value. + /* Wait for any running non-sleepable BPF programs to complete so that + * userspace, when we return to it, knows that all non-sleepable + * programs that could be running use the new map value. For sleepable + * BPF programs, synchronize_rcu_tasks_trace() should be used to wait + * for the completions of these programs, but considering the waiting + * time can be very long and userspace may think it will hang forever, + * so don't handle sleepable BPF programs now. */ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) @@ -179,15 +184,11 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, err = bpf_percpu_cgroup_storage_update(map, key, value, flags); } else if (IS_FD_ARRAY(map)) { - rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, map_file, key, value, flags); - rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - rcu_read_lock(); err = bpf_fd_htab_map_update_elem(map, map_file, key, value, flags); - rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, @@ -202,7 +203,6 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, rcu_read_unlock(); } bpf_enable_instrumentation(); - maybe_wait_bpf_programs(map); return err; } @@ -263,7 +263,6 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } bpf_enable_instrumentation(); - maybe_wait_bpf_programs(map); return err; } @@ -514,6 +513,7 @@ void btf_record_free(struct btf_record *rec) switch (rec->fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); btf_put(rec->fields[i].kptr.btf); @@ -560,6 +560,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) switch (fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { ret = -ENXIO; @@ -624,8 +625,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } -extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); - void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -650,6 +649,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) WRITE_ONCE(*(u64 *)field_ptr, 0); break; case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); if (!xchgd_field) break; @@ -659,8 +659,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) field->kptr.btf_id); migrate_disable(); __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? - pointee_struct_meta->record : - NULL); + pointee_struct_meta->record : NULL, + fields[i].type == BPF_KPTR_PERCPU); migrate_enable(); } else { field->kptr.dtor(xchgd_field); @@ -692,6 +692,7 @@ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); struct btf_record *rec = map->record; + struct btf *btf = map->btf; security_bpf_map_free(map); bpf_map_release_memcg(map); @@ -707,6 +708,10 @@ static void bpf_map_free_deferred(struct work_struct *work) * template bpf_map struct used during verification. */ btf_record_free(rec); + /* Delay freeing of btf for maps, as map_free callback may need + * struct_meta info which will be freed with btf_put(). + */ + btf_put(btf); } static void bpf_map_put_uref(struct bpf_map *map) @@ -717,6 +722,28 @@ static void bpf_map_put_uref(struct bpf_map *map) } } +static void bpf_map_free_in_work(struct bpf_map *map) +{ + INIT_WORK(&map->work, bpf_map_free_deferred); + /* Avoid spawning kworkers, since they all might contend + * for the same mutex like slab_mutex. + */ + queue_work(system_unbound_wq, &map->work); +} + +static void bpf_map_free_rcu_gp(struct rcu_head *rcu) +{ + bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); +} + +static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_map_free_rcu_gp(rcu); + else + call_rcu(rcu, bpf_map_free_rcu_gp); +} + /* decrement map refcnt and schedule it for freeing via workqueue * (underlying map implementation ops->map_free() might sleep) */ @@ -725,12 +752,14 @@ void bpf_map_put(struct bpf_map *map) if (atomic64_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); - btf_put(map->btf); - INIT_WORK(&map->work, bpf_map_free_deferred); - /* Avoid spawning kworkers, since they all might contend - * for the same mutex like slab_mutex. - */ - queue_work(system_unbound_wq, &map->work); + + WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); + if (READ_ONCE(map->free_after_mult_rcu_gp)) + call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); + else if (READ_ONCE(map->free_after_rcu_gp)) + call_rcu(&map->rcu, bpf_map_free_rcu_gp); + else + bpf_map_free_in_work(map); } } EXPORT_SYMBOL_GPL(bpf_map_put); @@ -1045,6 +1074,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: case BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && @@ -1521,6 +1551,8 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } err = bpf_map_update_value(map, f.file, key, value, attr->flags); + if (!err) + maybe_wait_bpf_programs(map); kvfree(value); free_key: @@ -1576,7 +1608,8 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); - maybe_wait_bpf_programs(map); + if (!err) + maybe_wait_bpf_programs(map); out: kvfree(key); err_put: @@ -1673,6 +1706,9 @@ int generic_map_delete_batch(struct bpf_map *map, if (!max_count) return 0; + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; @@ -1702,7 +1738,6 @@ int generic_map_delete_batch(struct bpf_map *map, kvfree(key); - maybe_wait_bpf_programs(map); return err; } @@ -1730,6 +1765,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, if (!max_count) return 0; + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; @@ -1760,6 +1798,7 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, kvfree(value); kvfree(key); + return err; } @@ -2442,14 +2481,19 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: return 0; default: return -EINVAL; @@ -2565,7 +2609,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32 | BPF_F_XDP_HAS_FRAGS | - BPF_F_XDP_DEV_BOUND_ONLY)) + BPF_F_XDP_DEV_BOUND_ONLY | + BPF_F_TEST_REG_INVARIANTS)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -2693,6 +2738,22 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) goto free_prog_sec; } + /* + * Bookkeeping for managing the program attachment chain. + * + * It might be tempting to set attach_tracing_prog flag at the attachment + * time, but this will not prevent from loading bunch of tracing prog + * first, then attach them one to another. + * + * The flag attach_tracing_prog is set for the whole program lifecycle, and + * doesn't have to be cleared in bpf_tracing_link_release, since tracing + * programs cannot change attachment target. + */ + if (type == BPF_PROG_TYPE_TRACING && dst_prog && + dst_prog->type == BPF_PROG_TYPE_TRACING) { + prog->aux->attach_tracing_prog = true; + } + /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) @@ -2745,7 +2806,7 @@ free_used_maps: * period before we can tear down JIT memory since symbols * are already exposed under kallsyms. */ - __bpf_prog_put_noref(prog, prog->aux->func_cnt); + __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; free_prog_sec: free_uid(prog->aux->user); @@ -3126,7 +3187,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, } if (tgt_prog_fd) { - /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ + /* + * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this + * part would be changed to implement the same for + * BPF_PROG_TYPE_TRACING, do not forget to update the way how + * attach_tracing_prog flag is set. + */ if (prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; @@ -3171,6 +3237,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, * * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program * was detached and is going for re-attachment. + * + * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf + * are NULL, then program was already attached and user did not provide + * tgt_prog_fd so we have no way to find out or create trampoline */ if (!prog->aux->dst_trampoline && !tgt_prog) { /* @@ -3184,6 +3254,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, err = -EINVAL; goto out_unlock; } + /* We can allow re-attach only if we have valid attach_btf. */ + if (!prog->aux->attach_btf) { + err = -EINVAL; + goto out_unlock; + } btf_id = prog->aux->attach_btf_id; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); } @@ -3370,7 +3445,7 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) static int bpf_perf_link_fill_common(const struct perf_event *event, char __user *uname, u32 ulen, u64 *probe_offset, u64 *probe_addr, - u32 *fd_type) + u32 *fd_type, unsigned long *missed) { const char *buf; u32 prog_id; @@ -3381,7 +3456,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, return -EINVAL; err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf, - probe_offset, probe_addr); + probe_offset, probe_addr, missed); if (err) return err; if (!uname) @@ -3404,6 +3479,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, static int bpf_perf_link_fill_kprobe(const struct perf_event *event, struct bpf_link_info *info) { + unsigned long missed; char __user *uname; u64 addr, offset; u32 ulen, type; @@ -3412,7 +3488,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, - &type); + &type, &missed); if (err) return err; if (type == BPF_FD_TYPE_KRETPROBE) @@ -3421,6 +3497,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_KPROBE; info->perf_event.kprobe.offset = offset; + info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) addr = 0; info->perf_event.kprobe.addr = addr; @@ -3440,7 +3517,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, - &type); + &type, NULL); if (err) return err; @@ -3476,7 +3553,7 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; - return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL); + return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL); } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, @@ -3672,14 +3749,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; case BPF_CGROUP_SOCK_OPS: return BPF_PROG_TYPE_SOCK_OPS; @@ -3716,6 +3798,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_LSM; case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: return BPF_PROG_TYPE_SCHED_CLS; default: return BPF_PROG_TYPE_UNSPEC; @@ -3767,7 +3851,9 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, return 0; case BPF_PROG_TYPE_SCHED_CLS: if (attach_type != BPF_TCX_INGRESS && - attach_type != BPF_TCX_EGRESS) + attach_type != BPF_TCX_EGRESS && + attach_type != BPF_NETKIT_PRIMARY && + attach_type != BPF_NETKIT_PEER) return -EINVAL; return 0; default: @@ -3796,7 +3882,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; - u32 mask; int ret; if (CHECK_ATTR(BPF_PROG_ATTACH)) @@ -3805,10 +3890,16 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = attach_type_to_prog_type(attr->attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; - mask = bpf_mprog_supported(ptype) ? - BPF_F_ATTACH_MASK_MPROG : BPF_F_ATTACH_MASK_BASE; - if (attr->attach_flags & ~mask) - return -EINVAL; + if (bpf_mprog_supported(ptype)) { + if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) + return -EINVAL; + } else { + if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) + return -EINVAL; + if (attr->relative_fd || + attr->expected_revision) + return -EINVAL; + } prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) @@ -3845,7 +3936,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_prog_attach(attr, prog); + if (attr->attach_type == BPF_TCX_INGRESS || + attr->attach_type == BPF_TCX_EGRESS) + ret = tcx_prog_attach(attr, prog); + else + ret = netkit_prog_attach(attr, prog); break; default: ret = -EINVAL; @@ -3878,6 +3973,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); } + } else if (attr->attach_flags || + attr->relative_fd || + attr->expected_revision) { + return -EINVAL; } switch (ptype) { @@ -3902,7 +4001,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) ret = cgroup_bpf_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_prog_detach(attr, prog); + if (attr->attach_type == BPF_TCX_INGRESS || + attr->attach_type == BPF_TCX_EGRESS) + ret = tcx_prog_detach(attr, prog); + else + ret = netkit_prog_detach(attr, prog); break; default: ret = -EINVAL; @@ -3913,7 +4016,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } -#define BPF_PROG_QUERY_LAST_FIELD query.link_attach_flags +#define BPF_PROG_QUERY_LAST_FIELD query.revision static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3936,14 +4039,19 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: @@ -3964,6 +4072,9 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: return tcx_prog_query(attr, uattr); + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: + return netkit_prog_query(attr, uattr); default: return -EINVAL; } @@ -4809,7 +4920,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, err = bpf_get_perf_event_info(event, &prog_id, &fd_type, &buf, &probe_offset, - &probe_addr); + &probe_addr, NULL); if (!err) err = bpf_task_fd_query_copy(attr, uattr, prog_id, fd_type, buf, @@ -4875,8 +4986,10 @@ static int bpf_map_do_batch(const union bpf_attr *attr, else BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); err_put: - if (has_write) + if (has_write) { + maybe_wait_bpf_programs(map); bpf_map_write_active_dec(map); + } fdput(f); return err; } @@ -4945,7 +5058,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_xdp_link_attach(attr, prog); break; case BPF_PROG_TYPE_SCHED_CLS: - ret = tcx_link_attach(attr, prog); + if (attr->link_create.attach_type == BPF_TCX_INGRESS || + attr->link_create.attach_type == BPF_TCX_EGRESS) + ret = tcx_link_attach(attr, prog); + else + ret = netkit_link_attach(attr, prog); break; case BPF_PROG_TYPE_NETFILTER: ret = bpf_nf_link_attach(attr, prog); @@ -5274,6 +5391,11 @@ static int bpf_prog_bind_map(union bpf_attr *attr) goto out_unlock; } + /* The bpf program will not access the bpf map, but for the sake of + * simplicity, increase sleepable_refcnt for sleepable program as well. + */ + if (prog->aux->sleepable) + atomic64_inc(&map->sleepable_refcnt); memcpy(used_maps_new, used_maps_old, sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); used_maps_new[prog->aux->used_map_cnt] = map; @@ -5502,9 +5624,9 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) } run_ctx.bpf_cookie = 0; - run_ctx.saved_run_ctx = NULL; if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ + __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); bpf_prog_put(prog); return -EBUSY; } diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index c4ab9d6cdbe9..e5c3500443c6 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -7,7 +7,9 @@ #include <linux/fs.h> #include <linux/fdtable.h> #include <linux/filter.h> +#include <linux/bpf_mem_alloc.h> #include <linux/btf_ids.h> +#include <linux/mm_types.h> #include "mmap_unlock_work.h" static const char * const iter_task_type_names[] = { @@ -35,16 +37,13 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm u32 *tid, bool skip_if_dup_files) { - struct task_struct *task, *next_task; + struct task_struct *task; struct pid *pid; - u32 saved_tid; + u32 next_tid; if (!*tid) { /* The first time, the iterator calls this function. */ pid = find_pid_ns(common->pid, common->ns); - if (!pid) - return NULL; - task = get_pid_task(pid, PIDTYPE_TGID); if (!task) return NULL; @@ -66,44 +65,25 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm return task; } - pid = find_pid_ns(common->pid_visiting, common->ns); - if (!pid) - return NULL; - - task = get_pid_task(pid, PIDTYPE_PID); + task = find_task_by_pid_ns(common->pid_visiting, common->ns); if (!task) return NULL; retry: - if (!pid_alive(task)) { - put_task_struct(task); - return NULL; - } - - next_task = next_thread(task); - put_task_struct(task); - if (!next_task) - return NULL; - - saved_tid = *tid; - *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); - if (!*tid || *tid == common->pid) { - /* Run out of tasks of a process. The tasks of a - * thread_group are linked as circular linked list. - */ - *tid = saved_tid; + task = __next_thread(task); + if (!task) return NULL; - } - get_task_struct(next_task); - common->pid_visiting = *tid; + next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns); + if (!next_tid) + goto retry; - if (skip_if_dup_files && task->files == task->group_leader->files) { - task = next_task; + if (skip_if_dup_files && task->files == task->group_leader->files) goto retry; - } - return next_task; + *tid = common->pid_visiting = next_tid; + get_task_struct(task); + return task; } static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, @@ -308,11 +288,9 @@ again: rcu_read_lock(); for (;; curr_fd++) { struct file *f; - f = task_lookup_next_fd_rcu(curr_task, &curr_fd); + f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); if (!f) break; - if (!get_file_rcu(f)) - continue; /* set info->fd */ info->fd = curr_fd; @@ -724,7 +702,7 @@ static struct bpf_iter_reg task_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &task_seq_info, .fill_link_info = bpf_iter_fill_link_info, @@ -823,6 +801,239 @@ const struct bpf_func_proto bpf_find_vma_proto = { .arg5_type = ARG_ANYTHING, }; +struct bpf_iter_task_vma_kern_data { + struct task_struct *task; + struct mm_struct *mm; + struct mmap_unlock_irq_work *work; + struct vma_iterator vmi; +}; + +struct bpf_iter_task_vma { + /* opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +/* Non-opaque version of bpf_iter_task_vma */ +struct bpf_iter_task_vma_kern { + struct bpf_iter_task_vma_kern_data *data; +} __attribute__((aligned(8))); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, + struct task_struct *task, u64 addr) +{ + struct bpf_iter_task_vma_kern *kit = (void *)it; + bool irq_work_busy = false; + int err; + + BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma)); + + /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized + * before, so non-NULL kit->data doesn't point to previously + * bpf_mem_alloc'd bpf_iter_task_vma_kern_data + */ + kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data)); + if (!kit->data) + return -ENOMEM; + + kit->data->task = get_task_struct(task); + kit->data->mm = task->mm; + if (!kit->data->mm) { + err = -ENOENT; + goto err_cleanup_iter; + } + + /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */ + irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work); + if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) { + err = -EBUSY; + goto err_cleanup_iter; + } + + vma_iter_init(&kit->data->vmi, kit->data->mm, addr); + return 0; + +err_cleanup_iter: + if (kit->data->task) + put_task_struct(kit->data->task); + bpf_mem_free(&bpf_global_ma, kit->data); + /* NULL kit->data signals failed bpf_iter_task_vma initialization */ + kit->data = NULL; + return err; +} + +__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) +{ + struct bpf_iter_task_vma_kern *kit = (void *)it; + + if (!kit->data) /* bpf_iter_task_vma_new failed */ + return NULL; + return vma_next(&kit->data->vmi); +} + +__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) +{ + struct bpf_iter_task_vma_kern *kit = (void *)it; + + if (kit->data) { + bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); + put_task_struct(kit->data->task); + bpf_mem_free(&bpf_global_ma, kit->data); + } +} + +__bpf_kfunc_end_defs(); + +#ifdef CONFIG_CGROUPS + +struct bpf_iter_css_task { + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +struct bpf_iter_css_task_kern { + struct css_task_iter *css_it; +} __attribute__((aligned(8))); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it, + struct cgroup_subsys_state *css, unsigned int flags) +{ + struct bpf_iter_css_task_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) != + __alignof__(struct bpf_iter_css_task)); + kit->css_it = NULL; + switch (flags) { + case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED: + case CSS_TASK_ITER_PROCS: + case 0: + break; + default: + return -EINVAL; + } + + kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter)); + if (!kit->css_it) + return -ENOMEM; + css_task_iter_start(css, flags, kit->css_it); + return 0; +} + +__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) +{ + struct bpf_iter_css_task_kern *kit = (void *)it; + + if (!kit->css_it) + return NULL; + return css_task_iter_next(kit->css_it); +} + +__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) +{ + struct bpf_iter_css_task_kern *kit = (void *)it; + + if (!kit->css_it) + return; + css_task_iter_end(kit->css_it); + bpf_mem_free(&bpf_global_ma, kit->css_it); +} + +__bpf_kfunc_end_defs(); + +#endif /* CONFIG_CGROUPS */ + +struct bpf_iter_task { + __u64 __opaque[3]; +} __attribute__((aligned(8))); + +struct bpf_iter_task_kern { + struct task_struct *task; + struct task_struct *pos; + unsigned int flags; +} __attribute__((aligned(8))); + +enum { + /* all process in the system */ + BPF_TASK_ITER_ALL_PROCS, + /* all threads in the system */ + BPF_TASK_ITER_ALL_THREADS, + /* all threads of a specific process */ + BPF_TASK_ITER_PROC_THREADS +}; + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it, + struct task_struct *task__nullable, unsigned int flags) +{ + struct bpf_iter_task_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) != + __alignof__(struct bpf_iter_task)); + + switch (flags) { + case BPF_TASK_ITER_ALL_THREADS: + case BPF_TASK_ITER_ALL_PROCS: + break; + case BPF_TASK_ITER_PROC_THREADS: + if (!task__nullable) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (flags == BPF_TASK_ITER_PROC_THREADS) + kit->task = task__nullable; + else + kit->task = &init_task; + kit->pos = kit->task; + kit->flags = flags; + return 0; +} + +__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) +{ + struct bpf_iter_task_kern *kit = (void *)it; + struct task_struct *pos; + unsigned int flags; + + flags = kit->flags; + pos = kit->pos; + + if (!pos) + return pos; + + if (flags == BPF_TASK_ITER_ALL_PROCS) + goto get_next_task; + + kit->pos = __next_thread(kit->pos); + if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS) + return pos; + +get_next_task: + kit->task = next_task(kit->task); + if (kit->task == &init_task) + kit->pos = NULL; + else + kit->pos = kit->task; + + return pos; +} + +__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it) +{ +} + +__bpf_kfunc_end_defs(); + DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); static void do_mmap_read_unlock(struct irq_work *entry) diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c index 13f0b5dc8262..2e4885e7781f 100644 --- a/kernel/bpf/tcx.c +++ b/kernel/bpf/tcx.c @@ -123,7 +123,6 @@ int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { bool ingress = attr->query.attach_type == BPF_TCX_INGRESS; struct net *net = current->nsproxy->net_ns; - struct bpf_mprog_entry *entry; struct net_device *dev; int ret; @@ -133,12 +132,7 @@ int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) ret = -ENODEV; goto out; } - entry = tcx_entry_fetch(dev, ingress); - if (!entry) { - ret = -ENOENT; - goto out; - } - ret = bpf_mprog_query(attr, uattr, entry); + ret = bpf_mprog_query(attr, uattr, tcx_entry_fetch(dev, ingress)); out: rtnl_unlock(); return ret; @@ -256,7 +250,7 @@ static void tcx_link_dealloc(struct bpf_link *link) static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) { - const struct tcx_link *tcx = tcx_link_const(link); + const struct tcx_link *tcx = tcx_link(link); u32 ifindex = 0; rtnl_lock(); @@ -273,7 +267,7 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) static int tcx_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { - const struct tcx_link *tcx = tcx_link_const(link); + const struct tcx_link *tcx = tcx_link(link); u32 ifindex = 0; rtnl_lock(); diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 3d7127f439a1..9dbc31b25e3d 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -172,12 +172,6 @@ bool tnum_in(struct tnum a, struct tnum b) return a.value == b.value; } -int tnum_strn(char *str, size_t size, struct tnum a) -{ - return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); -} -EXPORT_SYMBOL_GPL(tnum_strn); - int tnum_sbin(char *str, size_t size, struct tnum a) { size_t n; @@ -208,7 +202,12 @@ struct tnum tnum_clear_subreg(struct tnum a) return tnum_lshift(tnum_rshift(a, 32), 32); } +struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg) +{ + return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg)); +} + struct tnum tnum_const_subreg(struct tnum a, u32 value) { - return tnum_or(tnum_clear_subreg(a), tnum_const(value)); + return tnum_with_subreg(a, tnum_const(value)); } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 78acf28d4873..d382f5ebe06c 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -115,10 +115,10 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } -void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) +void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; - ksym->end = ksym->start + PAGE_SIZE; + ksym->end = ksym->start + size; bpf_ksym_add(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, PAGE_SIZE, false, ksym->name); @@ -254,8 +254,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a static void bpf_tramp_image_free(struct bpf_tramp_image *im) { bpf_image_ksym_del(&im->ksym); - bpf_jit_free_exec(im->image); - bpf_jit_uncharge_modmem(PAGE_SIZE); + arch_free_bpf_trampoline(im->image, im->size); + bpf_jit_uncharge_modmem(im->size); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } @@ -349,7 +349,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); } -static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) +static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size) { struct bpf_tramp_image *im; struct bpf_ksym *ksym; @@ -360,15 +360,15 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) if (!im) goto out; - err = bpf_jit_charge_modmem(PAGE_SIZE); + err = bpf_jit_charge_modmem(size); if (err) goto out_free_im; + im->size = size; err = -ENOMEM; - im->image = image = bpf_jit_alloc_exec(PAGE_SIZE); + im->image = image = arch_alloc_bpf_trampoline(size); if (!image) goto out_uncharge; - set_vm_flush_reset_perms(image); err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); if (err) @@ -377,13 +377,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) ksym = &im->ksym; INIT_LIST_HEAD_RCU(&ksym->lnode); snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key); - bpf_image_ksym_add(image, ksym); + bpf_image_ksym_add(image, size, ksym); return im; out_free_image: - bpf_jit_free_exec(im->image); + arch_free_bpf_trampoline(im->image, im->size); out_uncharge: - bpf_jit_uncharge_modmem(PAGE_SIZE); + bpf_jit_uncharge_modmem(size); out_free_im: kfree(im); out: @@ -396,7 +396,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut struct bpf_tramp_links *tlinks; u32 orig_flags = tr->flags; bool ip_arg = false; - int err, total; + int err, total, size; tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); if (IS_ERR(tlinks)) @@ -409,14 +409,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut goto out; } - im = bpf_tramp_image_alloc(tr->key); - if (IS_ERR(im)) { - err = PTR_ERR(im); - goto out; - } - - /* clear all bits except SHARE_IPMODIFY */ - tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY; + /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ + tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); if (tlinks[BPF_TRAMP_FEXIT].nr_links || tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { @@ -438,13 +432,31 @@ again: tr->flags |= BPF_TRAMP_F_ORIG_STACK; #endif - err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, + size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, + tlinks, tr->func.addr); + if (size < 0) { + err = size; + goto out; + } + + if (size > PAGE_SIZE) { + err = -E2BIG; + goto out; + } + + im = bpf_tramp_image_alloc(tr->key, size); + if (IS_ERR(im)) { + err = PTR_ERR(im); + goto out; + } + + err = arch_prepare_bpf_trampoline(im, im->image, im->image + size, &tr->func.model, tr->flags, tlinks, tr->func.addr); if (err < 0) goto out_free; - set_memory_rox((long)im->image, 1); + arch_protect_bpf_trampoline(im->image, im->size); WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) @@ -464,9 +476,8 @@ again: tr->fops->func = NULL; tr->fops->trampoline = 0; - /* reset im->image memory attr for arch_prepare_bpf_trampoline */ - set_memory_nx((long)im->image, 1); - set_memory_rw((long)im->image, 1); + /* free im memory and reallocate later */ + bpf_tramp_image_free(im); goto again; } #endif @@ -926,13 +937,12 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, migrate_disable(); might_fault(); + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); return 0; } - - run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - return bpf_prog_start_time(); } @@ -1033,10 +1043,50 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) } int __weak -arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, +arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, - void *orig_call) + void *func_addr) +{ + return -ENOTSUPP; +} + +void * __weak arch_alloc_bpf_trampoline(unsigned int size) +{ + void *image; + + if (WARN_ON_ONCE(size > PAGE_SIZE)) + return NULL; + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (image) + set_vm_flush_reset_perms(image); + return image; +} + +void __weak arch_free_bpf_trampoline(void *image, unsigned int size) +{ + WARN_ON_ONCE(size > PAGE_SIZE); + /* bpf_jit_free_exec doesn't need "size", but + * bpf_prog_pack_free() needs it. + */ + bpf_jit_free_exec(image); +} + +void __weak arch_protect_bpf_trampoline(void *image, unsigned int size) +{ + WARN_ON_ONCE(size > PAGE_SIZE); + set_memory_rox((long)image, 1); +} + +void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size) +{ + WARN_ON_ONCE(size > PAGE_SIZE); + set_memory_nx((long)image, 1); + set_memory_rw((long)image, 1); +} + +int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr) { return -ENOTSUPP; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb78212fa5b2..65f598694d55 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include <linux/poison.h> #include <linux/module.h> #include <linux/cpumask.h> +#include <linux/bpf_mem_alloc.h> #include <net/xdp.h> #include "disasm.h" @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #undef BPF_LINK_TYPE }; +struct bpf_mem_alloc bpf_global_percpu_ma; +static bool bpf_global_percpu_ma_set; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -191,6 +195,8 @@ struct bpf_verifier_stack_elem { POISON_POINTER_DELTA)) #define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) +#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 + static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); @@ -304,7 +310,7 @@ struct bpf_kfunc_call_arg_meta { /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, * generally to pass info about user-defined local kptr types to later * verification logic - * bpf_obj_drop + * bpf_obj_drop/bpf_percpu_obj_drop * Record the local kptr type to be drop'd * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) * Record the local kptr type to be refcount_incr'd and use @@ -335,29 +341,14 @@ struct bpf_kfunc_call_arg_meta { struct btf *btf_vmlinux; -static DEFINE_MUTEX(bpf_verifier_lock); - -static const struct bpf_line_info * -find_linfo(const struct bpf_verifier_env *env, u32 insn_off) +static const char *btf_type_name(const struct btf *btf, u32 id) { - const struct bpf_line_info *linfo; - const struct bpf_prog *prog; - u32 i, nr_linfo; - - prog = env->prog; - nr_linfo = prog->aux->nr_linfo; - - if (!nr_linfo || insn_off >= prog->len) - return NULL; - - linfo = prog->aux->linfo; - for (i = 1; i < nr_linfo; i++) - if (insn_off < linfo[i].insn_off) - break; - - return &linfo[i - 1]; + return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } +static DEFINE_MUTEX(bpf_verifier_lock); +static DEFINE_MUTEX(bpf_percpu_ma_lock); + __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) { struct bpf_verifier_env *env = private_data; @@ -371,73 +362,25 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } -static const char *ltrim(const char *s) -{ - while (isspace(*s)) - s++; - - return s; -} - -__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, - u32 insn_off, - const char *prefix_fmt, ...) -{ - const struct bpf_line_info *linfo; - - if (!bpf_verifier_log_needed(&env->log)) - return; - - linfo = find_linfo(env, insn_off); - if (!linfo || linfo == env->prev_linfo) - return; - - if (prefix_fmt) { - va_list args; - - va_start(args, prefix_fmt); - bpf_verifier_vlog(&env->log, prefix_fmt, args); - va_end(args); - } - - verbose(env, "%s\n", - ltrim(btf_name_by_offset(env->prog->aux->btf, - linfo->line_off))); - - env->prev_linfo = linfo; -} - static void verbose_invalid_scalar(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - struct tnum *range, const char *ctx, + struct bpf_retval_range range, const char *ctx, const char *reg_name) { - char tn_buf[48]; + bool unknown = true; - verbose(env, "At %s the register %s ", ctx, reg_name); - if (!tnum_is_unknown(reg->var_off)) { - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "has value %s", tn_buf); - } else { - verbose(env, "has unknown scalar value"); + verbose(env, "%s the register %s has", ctx, reg_name); + if (reg->smin_value > S64_MIN) { + verbose(env, " smin=%lld", reg->smin_value); + unknown = false; } - tnum_strn(tn_buf, sizeof(tn_buf), *range); - verbose(env, " should have been in %s\n", tn_buf); -} - -static bool type_is_pkt_pointer(enum bpf_reg_type type) -{ - type = base_type(type); - return type == PTR_TO_PACKET || - type == PTR_TO_PACKET_META; -} - -static bool type_is_sk_pointer(enum bpf_reg_type type) -{ - return type == PTR_TO_SOCKET || - type == PTR_TO_SOCK_COMMON || - type == PTR_TO_TCP_SOCK || - type == PTR_TO_XDP_SOCK; + if (reg->smax_value < S64_MAX) { + verbose(env, " smax=%lld", reg->smax_value); + unknown = false; + } + if (unknown) + verbose(env, " unknown scalar value"); + verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval); } static bool type_may_be_null(u32 type) @@ -463,16 +406,6 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MEM; } -static bool type_is_ptr_alloc_obj(u32 type) -{ - return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; -} - -static bool type_is_non_owning_ref(u32 type) -{ - return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; -} - static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) { struct btf_record *rec = NULL; @@ -495,6 +428,31 @@ static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL; } +static const char *subprog_name(const struct bpf_verifier_env *env, int subprog) +{ + struct bpf_func_info *info; + + if (!env->prog->aux->func_info) + return ""; + + info = &env->prog->aux->func_info[subprog]; + return btf_type_name(env->prog->aux->btf, info->type_id); +} + +static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog) +{ + struct bpf_subprog_info *info = subprog_info(env, subprog); + + info->is_cb = true; + info->is_async_cb = true; + info->is_exception_cb = true; +} + +static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog) +{ + return subprog_info(env, subprog)->is_exception_cb; +} + static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); @@ -542,12 +500,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_dynptr_data; } -static bool is_callback_calling_kfunc(u32 btf_id); +static bool is_sync_callback_calling_kfunc(u32 btf_id); +static bool is_bpf_throw_kfunc(struct bpf_insn *insn); -static bool is_callback_calling_function(enum bpf_func_id func_id) +static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || - func_id == BPF_FUNC_timer_set_callback || func_id == BPF_FUNC_find_vma || func_id == BPF_FUNC_loop || func_id == BPF_FUNC_user_ringbuf_drain; @@ -558,6 +516,18 @@ static bool is_async_callback_calling_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_timer_set_callback; } +static bool is_callback_calling_function(enum bpf_func_id func_id) +{ + return is_sync_callback_calling_function(func_id) || + is_async_callback_calling_function(func_id); +} + +static bool is_sync_callback_calling_insn(struct bpf_insn *insn) +{ + return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) || + (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm)); +} + static bool is_storage_get_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_storage_get || @@ -588,83 +558,6 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == BPF_CMPXCHG; } -/* string representation of 'enum bpf_reg_type' - * - * Note that reg_type_str() can not appear more than once in a single verbose() - * statement. - */ -static const char *reg_type_str(struct bpf_verifier_env *env, - enum bpf_reg_type type) -{ - char postfix[16] = {0}, prefix[64] = {0}; - static const char * const str[] = { - [NOT_INIT] = "?", - [SCALAR_VALUE] = "scalar", - [PTR_TO_CTX] = "ctx", - [CONST_PTR_TO_MAP] = "map_ptr", - [PTR_TO_MAP_VALUE] = "map_value", - [PTR_TO_STACK] = "fp", - [PTR_TO_PACKET] = "pkt", - [PTR_TO_PACKET_META] = "pkt_meta", - [PTR_TO_PACKET_END] = "pkt_end", - [PTR_TO_FLOW_KEYS] = "flow_keys", - [PTR_TO_SOCKET] = "sock", - [PTR_TO_SOCK_COMMON] = "sock_common", - [PTR_TO_TCP_SOCK] = "tcp_sock", - [PTR_TO_TP_BUFFER] = "tp_buffer", - [PTR_TO_XDP_SOCK] = "xdp_sock", - [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_MEM] = "mem", - [PTR_TO_BUF] = "buf", - [PTR_TO_FUNC] = "func", - [PTR_TO_MAP_KEY] = "map_key", - [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", - }; - - if (type & PTR_MAYBE_NULL) { - if (base_type(type) == PTR_TO_BTF_ID) - strncpy(postfix, "or_null_", 16); - else - strncpy(postfix, "_or_null", 16); - } - - snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", - type & MEM_RDONLY ? "rdonly_" : "", - type & MEM_RINGBUF ? "ringbuf_" : "", - type & MEM_USER ? "user_" : "", - type & MEM_PERCPU ? "percpu_" : "", - type & MEM_RCU ? "rcu_" : "", - type & PTR_UNTRUSTED ? "untrusted_" : "", - type & PTR_TRUSTED ? "trusted_" : "" - ); - - snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s", - prefix, str[base_type(type)], postfix); - return env->tmp_str_buf; -} - -static char slot_type_char[] = { - [STACK_INVALID] = '?', - [STACK_SPILL] = 'r', - [STACK_MISC] = 'm', - [STACK_ZERO] = '0', - [STACK_DYNPTR] = 'd', - [STACK_ITER] = 'i', -}; - -static void print_liveness(struct bpf_verifier_env *env, - enum bpf_reg_liveness live) -{ - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) - verbose(env, "_"); - if (live & REG_LIVE_READ) - verbose(env, "r"); - if (live & REG_LIVE_WRITTEN) - verbose(env, "w"); - if (live & REG_LIVE_DONE) - verbose(env, "D"); -} - static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; @@ -729,92 +622,6 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return stack_slot_obj_get_spi(env, reg, "iter", nr_slots); } -static const char *btf_type_name(const struct btf *btf, u32 id) -{ - return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); -} - -static const char *dynptr_type_str(enum bpf_dynptr_type type) -{ - switch (type) { - case BPF_DYNPTR_TYPE_LOCAL: - return "local"; - case BPF_DYNPTR_TYPE_RINGBUF: - return "ringbuf"; - case BPF_DYNPTR_TYPE_SKB: - return "skb"; - case BPF_DYNPTR_TYPE_XDP: - return "xdp"; - case BPF_DYNPTR_TYPE_INVALID: - return "<invalid>"; - default: - WARN_ONCE(1, "unknown dynptr type %d\n", type); - return "<unknown>"; - } -} - -static const char *iter_type_str(const struct btf *btf, u32 btf_id) -{ - if (!btf || btf_id == 0) - return "<invalid>"; - - /* we already validated that type is valid and has conforming name */ - return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; -} - -static const char *iter_state_str(enum bpf_iter_state state) -{ - switch (state) { - case BPF_ITER_STATE_ACTIVE: - return "active"; - case BPF_ITER_STATE_DRAINED: - return "drained"; - case BPF_ITER_STATE_INVALID: - return "<invalid>"; - default: - WARN_ONCE(1, "unknown iter state %d\n", state); - return "<unknown>"; - } -} - -static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) -{ - env->scratched_regs |= 1U << regno; -} - -static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) -{ - env->scratched_stack_slots |= 1ULL << spi; -} - -static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) -{ - return (env->scratched_regs >> regno) & 1; -} - -static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) -{ - return (env->scratched_stack_slots >> regno) & 1; -} - -static bool verifier_state_scratched(const struct bpf_verifier_env *env) -{ - return env->scratched_regs || env->scratched_stack_slots; -} - -static void mark_verifier_state_clean(struct bpf_verifier_env *env) -{ - env->scratched_regs = 0U; - env->scratched_stack_slots = 0ULL; -} - -/* Used for printing the entire verifier state. */ -static void mark_verifier_state_scratched(struct bpf_verifier_env *env) -{ - env->scratched_regs = ~0U; - env->scratched_stack_slots = ~0ULL; -} - static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) { switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { @@ -1172,7 +979,12 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg static void __mark_reg_known_zero(struct bpf_reg_state *reg); +static bool in_rcu_cs(struct bpf_verifier_env *env); + +static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta); + static int mark_stack_slots_iter(struct bpf_verifier_env *env, + struct bpf_kfunc_call_arg_meta *meta, struct bpf_reg_state *reg, int insn_idx, struct btf *btf, u32 btf_id, int nr_slots) { @@ -1193,6 +1005,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ + if (is_kfunc_rcu_protected(meta)) { + if (in_rcu_cs(env)) + st->type |= MEM_RCU; + else + st->type |= PTR_UNTRUSTED; + } st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = i == 0 ? id : 0; st->iter.btf = btf; @@ -1267,7 +1085,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, return true; } -static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, +static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct btf *btf, u32 btf_id, int nr_slots) { struct bpf_func_state *state = func(env, reg); @@ -1275,26 +1093,28 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_ spi = iter_get_spi(env, reg, nr_slots); if (spi < 0) - return false; + return -EINVAL; for (i = 0; i < nr_slots; i++) { struct bpf_stack_state *slot = &state->stack[spi - i]; struct bpf_reg_state *st = &slot->spilled_ptr; + if (st->type & PTR_UNTRUSTED) + return -EPROTO; /* only main (first) slot has ref_obj_id set */ if (i == 0 && !st->ref_obj_id) - return false; + return -EINVAL; if (i != 0 && st->ref_obj_id) - return false; + return -EINVAL; if (st->iter.btf != btf || st->iter.btf_id != btf_id) - return false; + return -EINVAL; for (j = 0; j < BPF_REG_SIZE; j++) if (slot->slot_type[j] != STACK_ITER) - return false; + return -EINVAL; } - return true; + return 0; } /* Check if given stack slot is "special": @@ -1335,206 +1155,25 @@ static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack) stack->spilled_ptr.type == SCALAR_VALUE; } -static void scrub_spilled_slot(u8 *stype) -{ - if (*stype != STACK_INVALID) - *stype = STACK_MISC; -} - -static void print_verifier_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state, - bool print_all) -{ - const struct bpf_reg_state *reg; - enum bpf_reg_type t; - int i; - - if (state->frameno) - verbose(env, " frame%d:", state->frameno); - for (i = 0; i < MAX_BPF_REG; i++) { - reg = &state->regs[i]; - t = reg->type; - if (t == NOT_INIT) - continue; - if (!print_all && !reg_scratched(env, i)) - continue; - verbose(env, " R%d", i); - print_liveness(env, reg->live); - verbose(env, "="); - if (t == SCALAR_VALUE && reg->precise) - verbose(env, "P"); - if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && - tnum_is_const(reg->var_off)) { - /* reg->off should be 0 for SCALAR_VALUE */ - verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); - verbose(env, "%lld", reg->var_off.value + reg->off); - } else { - const char *sep = ""; - - verbose(env, "%s", reg_type_str(env, t)); - if (base_type(t) == PTR_TO_BTF_ID) - verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); - verbose(env, "("); -/* - * _a stands for append, was shortened to avoid multiline statements below. - * This macro is used to output a comma separated list of attributes. +/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which + * case they are equivalent, or it's STACK_ZERO, in which case we preserve + * more precise STACK_ZERO. + * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take + * env->allow_ptr_leaks into account and force STACK_MISC, if necessary. */ -#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; }) - - if (reg->id) - verbose_a("id=%d", reg->id); - if (reg->ref_obj_id) - verbose_a("ref_obj_id=%d", reg->ref_obj_id); - if (type_is_non_owning_ref(reg->type)) - verbose_a("%s", "non_own_ref"); - if (t != SCALAR_VALUE) - verbose_a("off=%d", reg->off); - if (type_is_pkt_pointer(t)) - verbose_a("r=%d", reg->range); - else if (base_type(t) == CONST_PTR_TO_MAP || - base_type(t) == PTR_TO_MAP_KEY || - base_type(t) == PTR_TO_MAP_VALUE) - verbose_a("ks=%d,vs=%d", - reg->map_ptr->key_size, - reg->map_ptr->value_size); - if (tnum_is_const(reg->var_off)) { - /* Typically an immediate SCALAR_VALUE, but - * could be a pointer whose offset is too big - * for reg->off - */ - verbose_a("imm=%llx", reg->var_off.value); - } else { - if (reg->smin_value != reg->umin_value && - reg->smin_value != S64_MIN) - verbose_a("smin=%lld", (long long)reg->smin_value); - if (reg->smax_value != reg->umax_value && - reg->smax_value != S64_MAX) - verbose_a("smax=%lld", (long long)reg->smax_value); - if (reg->umin_value != 0) - verbose_a("umin=%llu", (unsigned long long)reg->umin_value); - if (reg->umax_value != U64_MAX) - verbose_a("umax=%llu", (unsigned long long)reg->umax_value); - if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose_a("var_off=%s", tn_buf); - } - if (reg->s32_min_value != reg->smin_value && - reg->s32_min_value != S32_MIN) - verbose_a("s32_min=%d", (int)(reg->s32_min_value)); - if (reg->s32_max_value != reg->smax_value && - reg->s32_max_value != S32_MAX) - verbose_a("s32_max=%d", (int)(reg->s32_max_value)); - if (reg->u32_min_value != reg->umin_value && - reg->u32_min_value != U32_MIN) - verbose_a("u32_min=%d", (int)(reg->u32_min_value)); - if (reg->u32_max_value != reg->umax_value && - reg->u32_max_value != U32_MAX) - verbose_a("u32_max=%d", (int)(reg->u32_max_value)); - } -#undef verbose_a - - verbose(env, ")"); - } - } - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - char types_buf[BPF_REG_SIZE + 1]; - bool valid = false; - int j; - - for (j = 0; j < BPF_REG_SIZE; j++) { - if (state->stack[i].slot_type[j] != STACK_INVALID) - valid = true; - types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; - } - types_buf[BPF_REG_SIZE] = 0; - if (!valid) - continue; - if (!print_all && !stack_slot_scratched(env, i)) - continue; - switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { - case STACK_SPILL: - reg = &state->stack[i].spilled_ptr; - t = reg->type; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); - if (t == SCALAR_VALUE && reg->precise) - verbose(env, "P"); - if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) - verbose(env, "%lld", reg->var_off.value + reg->off); - break; - case STACK_DYNPTR: - i += BPF_DYNPTR_NR_SLOTS - 1; - reg = &state->stack[i].spilled_ptr; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type)); - if (reg->ref_obj_id) - verbose(env, "(ref_id=%d)", reg->ref_obj_id); - break; - case STACK_ITER: - /* only main slot has ref_obj_id set; skip others */ - reg = &state->stack[i].spilled_ptr; - if (!reg->ref_obj_id) - continue; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", - iter_type_str(reg->iter.btf, reg->iter.btf_id), - reg->ref_obj_id, iter_state_str(reg->iter.state), - reg->iter.depth); - break; - case STACK_MISC: - case STACK_ZERO: - default: - reg = &state->stack[i].spilled_ptr; - - for (j = 0; j < BPF_REG_SIZE; j++) - types_buf[j] = slot_type_char[state->stack[i].slot_type[j]]; - types_buf[BPF_REG_SIZE] = 0; - - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); - break; - } - } - if (state->acquired_refs && state->refs[0].id) { - verbose(env, " refs=%d", state->refs[0].id); - for (i = 1; i < state->acquired_refs; i++) - if (state->refs[i].id) - verbose(env, ",%d", state->refs[i].id); - } - if (state->in_callback_fn) - verbose(env, " cb"); - if (state->in_async_callback_fn) - verbose(env, " async_cb"); - verbose(env, "\n"); - mark_verifier_state_clean(env); -} - -static inline u32 vlog_alignment(u32 pos) +static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype) { - return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), - BPF_LOG_MIN_ALIGNMENT) - pos - 1; + if (*stype == STACK_ZERO) + return; + if (env->allow_ptr_leaks && *stype == STACK_INVALID) + return; + *stype = STACK_MISC; } -static void print_insn_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state) +static void scrub_spilled_slot(u8 *stype) { - if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { - /* remove new line character */ - bpf_vlog_reset(&env->log, env->prev_log_pos - 1); - verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' '); - } else { - verbose(env, "%d:", env->insn_idx); - } - print_verifier_state(env, state, false); + if (*stype != STACK_INVALID) + *stype = STACK_MISC; } /* copy array src of length n * size bytes to dst. dst is reallocated if it's too @@ -1631,9 +1270,16 @@ static int resize_reference_state(struct bpf_func_state *state, size_t n) return 0; } -static int grow_stack_state(struct bpf_func_state *state, int size) +/* Possibly update state->allocated_stack to be at least size bytes. Also + * possibly update the function's high-water mark in its bpf_subprog_info. + */ +static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size) { - size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE; + size_t old_n = state->allocated_stack / BPF_REG_SIZE, n; + + /* The stack size is always a multiple of BPF_REG_SIZE. */ + size = round_up(size, BPF_REG_SIZE); + n = size / BPF_REG_SIZE; if (old_n >= n) return 0; @@ -1643,6 +1289,11 @@ static int grow_stack_state(struct bpf_func_state *state, int size) return -ENOMEM; state->allocated_stack = size; + + /* update known max for given subprogram */ + if (env->subprog_info[state->subprogno].stack_depth < size) + env->subprog_info[state->subprogno].stack_depth = size; + return 0; } @@ -1742,13 +1393,15 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, int i, err; dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, - src->jmp_history_cnt, sizeof(struct bpf_idx_pair), - GFP_USER); + src->jmp_history_cnt, sizeof(*dst_state->jmp_history), + GFP_USER); if (!dst_state->jmp_history) return -ENOMEM; dst_state->jmp_history_cnt = src->jmp_history_cnt; - /* if dst has more stack frames then src frame, free them */ + /* if dst has more stack frames then src frame, free them, this is also + * necessary in case of exceptional exits using bpf_throw. + */ for (i = src->curframe + 1; i <= dst_state->curframe; i++) { free_func_state(dst_state->frame[i]); dst_state->frame[i] = NULL; @@ -1762,6 +1415,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; + dst_state->dfs_depth = src->dfs_depth; + dst_state->callback_unroll_depth = src->callback_unroll_depth; + dst_state->used_as_loop_entry = src->used_as_loop_entry; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1777,11 +1433,203 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return 0; } +static u32 state_htab_size(struct bpf_verifier_env *env) +{ + return env->prog->len; +} + +static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx) +{ + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_func_state *state = cur->frame[cur->curframe]; + + return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; +} + +static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b) +{ + int fr; + + if (a->curframe != b->curframe) + return false; + + for (fr = a->curframe; fr >= 0; fr--) + if (a->frame[fr]->callsite != b->frame[fr]->callsite) + return false; + + return true; +} + +/* Open coded iterators allow back-edges in the state graph in order to + * check unbounded loops that iterators. + * + * In is_state_visited() it is necessary to know if explored states are + * part of some loops in order to decide whether non-exact states + * comparison could be used: + * - non-exact states comparison establishes sub-state relation and uses + * read and precision marks to do so, these marks are propagated from + * children states and thus are not guaranteed to be final in a loop; + * - exact states comparison just checks if current and explored states + * are identical (and thus form a back-edge). + * + * Paper "A New Algorithm for Identifying Loops in Decompilation" + * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient + * algorithm for loop structure detection and gives an overview of + * relevant terminology. It also has helpful illustrations. + * + * [1] https://api.semanticscholar.org/CorpusID:15784067 + * + * We use a similar algorithm but because loop nested structure is + * irrelevant for verifier ours is significantly simpler and resembles + * strongly connected components algorithm from Sedgewick's textbook. + * + * Define topmost loop entry as a first node of the loop traversed in a + * depth first search starting from initial state. The goal of the loop + * tracking algorithm is to associate topmost loop entries with states + * derived from these entries. + * + * For each step in the DFS states traversal algorithm needs to identify + * the following situations: + * + * initial initial initial + * | | | + * V V V + * ... ... .---------> hdr + * | | | | + * V V | V + * cur .-> succ | .------... + * | | | | | | + * V | V | V V + * succ '-- cur | ... ... + * | | | + * | V V + * | succ <- cur + * | | + * | V + * | ... + * | | + * '----' + * + * (A) successor state of cur (B) successor state of cur or it's entry + * not yet traversed are in current DFS path, thus cur and succ + * are members of the same outermost loop + * + * initial initial + * | | + * V V + * ... ... + * | | + * V V + * .------... .------... + * | | | | + * V V V V + * .-> hdr ... ... ... + * | | | | | + * | V V V V + * | succ <- cur succ <- cur + * | | | + * | V V + * | ... ... + * | | | + * '----' exit + * + * (C) successor state of cur is a part of some loop but this loop + * does not include cur or successor state is not in a loop at all. + * + * Algorithm could be described as the following python code: + * + * traversed = set() # Set of traversed nodes + * entries = {} # Mapping from node to loop entry + * depths = {} # Depth level assigned to graph node + * path = set() # Current DFS path + * + * # Find outermost loop entry known for n + * def get_loop_entry(n): + * h = entries.get(n, None) + * while h in entries and entries[h] != h: + * h = entries[h] + * return h + * + * # Update n's loop entry if h's outermost entry comes + * # before n's outermost entry in current DFS path. + * def update_loop_entry(n, h): + * n1 = get_loop_entry(n) or n + * h1 = get_loop_entry(h) or h + * if h1 in path and depths[h1] <= depths[n1]: + * entries[n] = h1 + * + * def dfs(n, depth): + * traversed.add(n) + * path.add(n) + * depths[n] = depth + * for succ in G.successors(n): + * if succ not in traversed: + * # Case A: explore succ and update cur's loop entry + * # only if succ's entry is in current DFS path. + * dfs(succ, depth + 1) + * h = get_loop_entry(succ) + * update_loop_entry(n, h) + * else: + * # Case B or C depending on `h1 in path` check in update_loop_entry(). + * update_loop_entry(n, succ) + * path.remove(n) + * + * To adapt this algorithm for use with verifier: + * - use st->branch == 0 as a signal that DFS of succ had been finished + * and cur's loop entry has to be updated (case A), handle this in + * update_branch_counts(); + * - use st->branch > 0 as a signal that st is in the current DFS path; + * - handle cases B and C in is_state_visited(); + * - update topmost loop entry for intermediate states in get_loop_entry(). + */ +static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st) +{ + struct bpf_verifier_state *topmost = st->loop_entry, *old; + + while (topmost && topmost->loop_entry && topmost != topmost->loop_entry) + topmost = topmost->loop_entry; + /* Update loop entries for intermediate states to avoid this + * traversal in future get_loop_entry() calls. + */ + while (st && st->loop_entry != topmost) { + old = st->loop_entry; + st->loop_entry = topmost; + st = old; + } + return topmost; +} + +static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) +{ + struct bpf_verifier_state *cur1, *hdr1; + + cur1 = get_loop_entry(cur) ?: cur; + hdr1 = get_loop_entry(hdr) ?: hdr; + /* The head1->branches check decides between cases B and C in + * comment for get_loop_entry(). If hdr1->branches == 0 then + * head's topmost loop entry is not in current DFS path, + * hence 'cur' and 'hdr' are not in the same loop and there is + * no need to update cur->loop_entry. + */ + if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) { + cur->loop_entry = hdr; + hdr->used_as_loop_entry = true; + } +} + static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { while (st) { u32 br = --st->branches; + /* br == 0 signals that DFS exploration for 'st' is finished, + * thus it is necessary to update parent's loop entry if it + * turned out that st is a part of some loop. + * This is a part of 'case A' in get_loop_entry() comment. + */ + if (br == 0 && st->parent && st->loop_entry) + update_loop_entry(st->parent, st->loop_entry); + /* WARN_ON(br > 1) technically makes sense here, * but see comment in push_stack(), hence: */ @@ -1921,10 +1769,14 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static void __mark_reg_const_zero(struct bpf_reg_state *reg) +static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { __mark_reg_known(reg, 0); reg->type = SCALAR_VALUE; + /* all scalars are assumed imprecise initially (unless unprivileged, + * in which case everything is forced to be precise) + */ + reg->precise = !env->bpf_capable; } static void mark_reg_known_zero(struct bpf_verifier_env *env, @@ -2090,69 +1942,214 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) /* Uses signed min/max values to inform unsigned, and vice-versa */ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) { - /* Learn sign from signed bounds. - * If we cannot cross the sign boundary, then signed and unsigned bounds - * are the same, so combine. This works even in the negative case, e.g. - * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. + /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 + * bits to improve our u32/s32 boundaries. + * + * E.g., the case where we have upper 32 bits as zero ([10, 20] in + * u64) is pretty trivial, it's obvious that in u32 we'll also have + * [10, 20] range. But this property holds for any 64-bit range as + * long as upper 32 bits in that entire range of values stay the same. + * + * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311] + * in decimal) has the same upper 32 bits throughout all the values in + * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15]) + * range. + * + * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32, + * following the rules outlined below about u64/s64 correspondence + * (which equally applies to u32 vs s32 correspondence). In general it + * depends on actual hexadecimal values of 32-bit range. They can form + * only valid u32, or only valid s32 ranges in some cases. + * + * So we use all these insights to derive bounds for subregisters here. */ - if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) { - reg->s32_min_value = reg->u32_min_value = - max_t(u32, reg->s32_min_value, reg->u32_min_value); - reg->s32_max_value = reg->u32_max_value = - min_t(u32, reg->s32_max_value, reg->u32_max_value); - return; + if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) { + /* u64 to u32 casting preserves validity of low 32 bits as + * a range, if upper 32 bits are the same + */ + reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value); + reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value); + + if ((s32)reg->umin_value <= (s32)reg->umax_value) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); + } + } + if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) { + /* low 32 bits should form a proper u32 range */ + if ((u32)reg->smin_value <= (u32)reg->smax_value) { + reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value); + reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value); + } + /* low 32 bits should form a proper s32 range */ + if ((s32)reg->smin_value <= (s32)reg->smax_value) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); + } + } + /* Special case where upper bits form a small sequence of two + * sequential numbers (in 32-bit unsigned space, so 0xffffffff to + * 0x00000000 is also valid), while lower bits form a proper s32 range + * going from negative numbers to positive numbers. E.g., let's say we + * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]). + * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff, + * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits, + * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]). + * Note that it doesn't have to be 0xffffffff going to 0x00000000 in + * upper 32 bits. As a random example, s64 range + * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range + * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister. + */ + if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) && + (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); + } + if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) && + (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); + } + /* if u32 range forms a valid s32 range (due to matching sign bit), + * try to learn from that + */ + if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) { + reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value); + reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value); } - /* Learn sign from unsigned bounds. Signed bounds cross the sign - * boundary, so we must be careful. + /* If we cannot cross the sign boundary, then signed and unsigned bounds + * are the same, so combine. This works even in the negative case, e.g. + * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. */ - if ((s32)reg->u32_max_value >= 0) { - /* Positive. We can't learn anything from the smin, but smax - * is positive, hence safe. - */ - reg->s32_min_value = reg->u32_min_value; - reg->s32_max_value = reg->u32_max_value = - min_t(u32, reg->s32_max_value, reg->u32_max_value); - } else if ((s32)reg->u32_min_value < 0) { - /* Negative. We can't learn anything from the smax, but smin - * is negative, hence safe. - */ - reg->s32_min_value = reg->u32_min_value = - max_t(u32, reg->s32_min_value, reg->u32_min_value); - reg->s32_max_value = reg->u32_max_value; + if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { + reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value); + reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value); } } static void __reg64_deduce_bounds(struct bpf_reg_state *reg) { - /* Learn sign from signed bounds. - * If we cannot cross the sign boundary, then signed and unsigned bounds + /* If u64 range forms a valid s64 range (due to matching sign bit), + * try to learn from that. Let's do a bit of ASCII art to see when + * this is happening. Let's take u64 range first: + * + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * + * Valid u64 range is formed when umin and umax are anywhere in the + * range [0, U64_MAX], and umin <= umax. u64 case is simple and + * straightforward. Let's see how s64 range maps onto the same range + * of values, annotated below the line for comparison: + * + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * 0 S64_MAX S64_MIN -1 + * + * So s64 values basically start in the middle and they are logically + * contiguous to the right of it, wrapping around from -1 to 0, and + * then finishing as S64_MAX (0x7fffffffffffffff) right before + * S64_MIN. We can try drawing the continuity of u64 vs s64 values + * more visually as mapped to sign-agnostic range of hex values. + * + * u64 start u64 end + * _______________________________________________________________ + * / \ + * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX + * |-------------------------------|--------------------------------| + * 0 S64_MAX S64_MIN -1 + * / \ + * >------------------------------ -------------------------------> + * s64 continues... s64 end s64 start s64 "midpoint" + * + * What this means is that, in general, we can't always derive + * something new about u64 from any random s64 range, and vice versa. + * + * But we can do that in two particular cases. One is when entire + * u64/s64 range is *entirely* contained within left half of the above + * diagram or when it is *entirely* contained in the right half. I.e.: + * + * |-------------------------------|--------------------------------| + * ^ ^ ^ ^ + * A B C D + * + * [A, B] and [C, D] are contained entirely in their respective halves + * and form valid contiguous ranges as both u64 and s64 values. [A, B] + * will be non-negative both as u64 and s64 (and in fact it will be + * identical ranges no matter the signedness). [C, D] treated as s64 + * will be a range of negative values, while in u64 it will be + * non-negative range of values larger than 0x8000000000000000. + * + * Now, any other range here can't be represented in both u64 and s64 + * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid + * contiguous u64 ranges, but they are discontinuous in s64. [B, C] + * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX], + * for example. Similarly, valid s64 range [D, A] (going from negative + * to positive values), would be two separate [D, U64_MAX] and [0, A] + * ranges as u64. Currently reg_state can't represent two segments per + * numeric domain, so in such situations we can only derive maximal + * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64). + * + * So we use these facts to derive umin/umax from smin/smax and vice + * versa only if they stay within the same "half". This is equivalent + * to checking sign bit: lower half will have sign bit as zero, upper + * half have sign bit 1. Below in code we simplify this by just + * casting umin/umax as smin/smax and checking if they form valid + * range, and vice versa. Those are equivalent checks. + */ + if ((s64)reg->umin_value <= (s64)reg->umax_value) { + reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value); + reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value); + } + /* If we cannot cross the sign boundary, then signed and unsigned bounds * are the same, so combine. This works even in the negative case, e.g. * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. */ - if (reg->smin_value >= 0 || reg->smax_value < 0) { - reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, - reg->umin_value); - reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, - reg->umax_value); - return; + if ((u64)reg->smin_value <= (u64)reg->smax_value) { + reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); + reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); } - /* Learn sign from unsigned bounds. Signed bounds cross the sign - * boundary, so we must be careful. +} + +static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) +{ + /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit + * values on both sides of 64-bit range in hope to have tigher range. + * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from + * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. + * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound + * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of + * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a + * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. + * We just need to make sure that derived bounds we are intersecting + * with are well-formed ranges in respecitve s64 or u64 domain, just + * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments. */ - if ((s64)reg->umax_value >= 0) { - /* Positive. We can't learn anything from the smin, but smax - * is positive, hence safe. - */ - reg->smin_value = reg->umin_value; - reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, - reg->umax_value); - } else if ((s64)reg->umin_value < 0) { - /* Negative. We can't learn anything from the smax, but smin - * is negative, hence safe. - */ - reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, - reg->umin_value); - reg->smax_value = reg->umax_value; + __u64 new_umin, new_umax; + __s64 new_smin, new_smax; + + /* u32 -> u64 tightening, it's always well-formed */ + new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value; + new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value; + reg->umin_value = max_t(u64, reg->umin_value, new_umin); + reg->umax_value = min_t(u64, reg->umax_value, new_umax); + /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */ + new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value; + new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value; + reg->smin_value = max_t(s64, reg->smin_value, new_smin); + reg->smax_value = min_t(s64, reg->smax_value, new_smax); + + /* if s32 can be treated as valid u32 range, we can use it as well */ + if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { + /* s32 -> u64 tightening */ + new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; + new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; + reg->umin_value = max_t(u64, reg->umin_value, new_umin); + reg->umax_value = min_t(u64, reg->umax_value, new_umax); + /* s32 -> s64 tightening */ + new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; + new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; + reg->smin_value = max_t(s64, reg->smin_value, new_smin); + reg->smax_value = min_t(s64, reg->smax_value, new_smax); } } @@ -2160,6 +2157,7 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg) { __reg32_deduce_bounds(reg); __reg64_deduce_bounds(reg); + __reg_deduce_mixed_bounds(reg); } /* Attempts to improve var_off based on unsigned min/max information */ @@ -2181,6 +2179,7 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); /* We might have learned something about the sign bit. */ __reg_deduce_bounds(reg); + __reg_deduce_bounds(reg); /* We might have learned some bits from the bounds. */ __reg_bound_offset(reg); /* Intersecting with the old var_off might have improved our bounds @@ -2190,6 +2189,56 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); } +static int reg_bounds_sanity_check(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, const char *ctx) +{ + const char *msg; + + if (reg->umin_value > reg->umax_value || + reg->smin_value > reg->smax_value || + reg->u32_min_value > reg->u32_max_value || + reg->s32_min_value > reg->s32_max_value) { + msg = "range bounds violation"; + goto out; + } + + if (tnum_is_const(reg->var_off)) { + u64 uval = reg->var_off.value; + s64 sval = (s64)uval; + + if (reg->umin_value != uval || reg->umax_value != uval || + reg->smin_value != sval || reg->smax_value != sval) { + msg = "const tnum out of sync with range bounds"; + goto out; + } + } + + if (tnum_subreg_is_const(reg->var_off)) { + u32 uval32 = tnum_subreg(reg->var_off).value; + s32 sval32 = (s32)uval32; + + if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || + reg->s32_min_value != sval32 || reg->s32_max_value != sval32) { + msg = "const subreg tnum out of sync with range bounds"; + goto out; + } + } + + return 0; +out: + verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " + "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n", + ctx, msg, reg->umin_value, reg->umax_value, + reg->smin_value, reg->smax_value, + reg->u32_min_value, reg->u32_max_value, + reg->s32_min_value, reg->s32_max_value, + reg->var_off.value, reg->var_off.mask); + if (env->test_reg_invariants) + return -EFAULT; + __mark_reg_unbounded(reg); + return 0; +} + static bool __reg32_bound_s64(s32 a) { return a >= 0 && a <= S32_MAX; @@ -2214,51 +2263,6 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) } } -static void __reg_combine_32_into_64(struct bpf_reg_state *reg) -{ - /* special case when 64-bit register has upper 32-bit register - * zeroed. Typically happens after zext or <<32, >>32 sequence - * allowing us to use 32-bit bounds directly, - */ - if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) { - __reg_assign_32_into_64(reg); - } else { - /* Otherwise the best we can do is push lower 32bit known and - * unknown bits into register (var_off set from jmp logic) - * then learn as much as possible from the 64-bit tnum - * known and unknown bits. The previous smin/smax bounds are - * invalid here because of jmp32 compare so mark them unknown - * so they do not impact tnum bounds calculation. - */ - __mark_reg64_unbounded(reg); - } - reg_bounds_sync(reg); -} - -static bool __reg64_bound_s32(s64 a) -{ - return a >= S32_MIN && a <= S32_MAX; -} - -static bool __reg64_bound_u32(u64 a) -{ - return a >= U32_MIN && a <= U32_MAX; -} - -static void __reg_combine_64_into_32(struct bpf_reg_state *reg) -{ - __mark_reg32_unbounded(reg); - if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) { - reg->s32_min_value = (s32)reg->smin_value; - reg->s32_max_value = (s32)reg->smax_value; - } - if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) { - reg->u32_min_value = (u32)reg->umin_value; - reg->u32_max_value = (u32)reg->umax_value; - } - reg_bounds_sync(reg); -} - /* Mark a register as having a completely unknown (scalar) value. */ static void __mark_reg_unknown(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) @@ -2346,6 +2350,11 @@ static void init_reg_state(struct bpf_verifier_env *env, regs[BPF_REG_FP].frameno = state->frameno; } +static struct bpf_retval_range retval_range(s32 minval, s32 maxval) +{ + return (struct bpf_retval_range){ minval, maxval }; +} + #define BPF_MAIN_FUNC (-1) static void init_func_state(struct bpf_verifier_env *env, struct bpf_func_state *state, @@ -2354,7 +2363,7 @@ static void init_func_state(struct bpf_verifier_env *env, state->callsite = callsite; state->frameno = frameno; state->subprogno = subprogno; - state->callback_ret_range = tnum_range(0, 0); + state->callback_ret_range = retval_range(0, 0); init_reg_state(env, state); mark_verifier_state_scratched(env); } @@ -2454,6 +2463,68 @@ static int add_subprog(struct bpf_verifier_env *env, int off) return env->subprog_cnt - 1; } +static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env) +{ + struct bpf_prog_aux *aux = env->prog->aux; + struct btf *btf = aux->btf; + const struct btf_type *t; + u32 main_btf_id, id; + const char *name; + int ret, i; + + /* Non-zero func_info_cnt implies valid btf */ + if (!aux->func_info_cnt) + return 0; + main_btf_id = aux->func_info[0].type_id; + + t = btf_type_by_id(btf, main_btf_id); + if (!t) { + verbose(env, "invalid btf id for main subprog in func_info\n"); + return -EINVAL; + } + + name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:"); + if (IS_ERR(name)) { + ret = PTR_ERR(name); + /* If there is no tag present, there is no exception callback */ + if (ret == -ENOENT) + ret = 0; + else if (ret == -EEXIST) + verbose(env, "multiple exception callback tags for main subprog\n"); + return ret; + } + + ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC); + if (ret < 0) { + verbose(env, "exception callback '%s' could not be found in BTF\n", name); + return ret; + } + id = ret; + t = btf_type_by_id(btf, id); + if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) { + verbose(env, "exception callback '%s' must have global linkage\n", name); + return -EINVAL; + } + ret = 0; + for (i = 0; i < aux->func_info_cnt; i++) { + if (aux->func_info[i].type_id != id) + continue; + ret = aux->func_info[i].insn_off; + /* Further func_info and subprog checks will also happen + * later, so assume this is the right insn_off for now. + */ + if (!ret) { + verbose(env, "invalid exception callback insn_off in func_info: 0\n"); + ret = -EINVAL; + } + } + if (!ret) { + verbose(env, "exception callback type id not found in func_info\n"); + ret = -EINVAL; + } + return ret; +} + #define MAX_KFUNC_DESCS 256 #define MAX_KFUNC_BTFS 256 @@ -2793,8 +2864,8 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; + int i, ret, insn_cnt = env->prog->len, ex_cb_insn; struct bpf_insn *insn = env->prog->insnsi; - int i, ret, insn_cnt = env->prog->len; /* Add entry function. */ ret = add_subprog(env, 0); @@ -2820,6 +2891,27 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return ret; } + ret = bpf_find_exception_callback_insn_off(env); + if (ret < 0) + return ret; + ex_cb_insn = ret; + + /* If ex_cb_insn > 0, this means that the main program has a subprog + * marked using BTF decl tag to serve as the exception callback. + */ + if (ex_cb_insn) { + ret = add_subprog(env, ex_cb_insn); + if (ret < 0) + return ret; + for (i = 1; i < env->subprog_cnt; i++) { + if (env->subprog_info[i].start != ex_cb_insn) + continue; + env->exception_callback_subprog = i; + mark_subprog_exc_cb(env, i); + break; + } + } + /* Add a fake 'exit' subprog which could simplify subprog iteration * logic. 'subprog_cnt' should not be increased. */ @@ -2868,7 +2960,7 @@ next: if (i == subprog_end - 1) { /* to avoid fall-through from one subprog into another * the last insn of the subprog should be either exit - * or unconditional jump back + * or unconditional jump back or bpf_throw call */ if (code != (BPF_JMP | BPF_EXIT) && code != (BPF_JMP32 | BPF_JA) && @@ -3029,7 +3121,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, if (class == BPF_LDX) { if (t != SRC_OP) - return BPF_SIZE(code) == BPF_DW; + return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX; /* LDX source must be ptr. */ return true; } @@ -3118,13 +3210,11 @@ static void mark_insn_zext(struct bpf_verifier_env *env, reg->subreg_def = DEF_NOT_SUBREG; } -static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, - enum reg_arg_type t) +static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, + enum reg_arg_type t) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; - struct bpf_reg_state *reg, *regs = state->regs; + struct bpf_reg_state *reg; bool rw64; if (regno >= MAX_BPF_REG) { @@ -3165,6 +3255,30 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, + enum reg_arg_type t) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + + return __check_reg_arg(env, state->regs, regno, t); +} + +static int insn_stack_access_flags(int frameno, int spi) +{ + return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno; +} + +static int insn_stack_access_spi(int insn_flags) +{ + return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; +} + +static int insn_stack_access_frameno(int insn_flags) +{ + return insn_flags & INSN_F_FRAMENO_MASK; +} + static void mark_jmp_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].jmp_point = true; @@ -3176,36 +3290,76 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) } /* for any branch, call, exit record the history of jmps in the given state */ -static int push_jmp_history(struct bpf_verifier_env *env, - struct bpf_verifier_state *cur) +static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags) { u32 cnt = cur->jmp_history_cnt; - struct bpf_idx_pair *p; + struct bpf_jmp_history_entry *p; size_t alloc_size; - if (!is_jmp_point(env, env->insn_idx)) + /* combine instruction flags if we already recorded this instruction */ + if (env->cur_hist_ent) { + /* atomic instructions push insn_flags twice, for READ and + * WRITE sides, but they should agree on stack slot + */ + WARN_ONCE((env->cur_hist_ent->flags & insn_flags) && + (env->cur_hist_ent->flags & insn_flags) != insn_flags, + "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", + env->insn_idx, env->cur_hist_ent->flags, insn_flags); + env->cur_hist_ent->flags |= insn_flags; return 0; + } cnt++; alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); p = krealloc(cur->jmp_history, alloc_size, GFP_USER); if (!p) return -ENOMEM; - p[cnt - 1].idx = env->insn_idx; - p[cnt - 1].prev_idx = env->prev_insn_idx; cur->jmp_history = p; + + p = &cur->jmp_history[cnt - 1]; + p->idx = env->insn_idx; + p->prev_idx = env->prev_insn_idx; + p->flags = insn_flags; cur->jmp_history_cnt = cnt; + env->cur_hist_ent = p; + return 0; } +static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, + u32 hist_end, int insn_idx) +{ + if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) + return &st->jmp_history[hist_end - 1]; + return NULL; +} + /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. + * Return -ENOENT if we exhausted all instructions within given state. + * + * It's legal to have a bit of a looping with the same starting and ending + * insn index within the same state, e.g.: 3->4->5->3, so just because current + * instruction index is the same as state's first_idx doesn't mean we are + * done. If there is still some jump history left, we should keep going. We + * need to take into account that we might have a jump history between given + * state's parent and itself, due to checkpointing. In this case, we'll have + * history entry recording a jump from last instruction of parent state and + * first instruction of given state. */ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, u32 *history) { u32 cnt = *history; + if (i == st->first_insn_idx) { + if (cnt == 0) + return -ENOENT; + if (cnt == 1 && st->jmp_history[0].idx == i) + return -ENOENT; + } + if (cnt && st->jmp_history[cnt - 1].idx == i) { i = st->jmp_history[cnt - 1].prev_idx; (*history)--; @@ -3307,16 +3461,6 @@ static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u3 bt->stack_masks[frame] &= ~(1ull << slot); } -static inline void bt_set_slot(struct backtrack_state *bt, u32 slot) -{ - bt_set_frame_slot(bt, bt->frame, slot); -} - -static inline void bt_clear_slot(struct backtrack_state *bt, u32 slot) -{ - bt_clear_frame_slot(bt, bt->frame, slot); -} - static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame) { return bt->reg_masks[frame]; @@ -3342,9 +3486,9 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) return bt->reg_masks[bt->frame] & (1 << reg); } -static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot) +static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) { - return bt->stack_masks[bt->frame] & (1ull << slot); + return bt->stack_masks[frame] & (1ull << slot); } /* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */ @@ -3386,6 +3530,8 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) } } +static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); + /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -3396,7 +3542,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct backtrack_state *bt) + struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) { const struct bpf_insn_cbs cbs = { .cb_call = disasm_kfunc_name, @@ -3409,7 +3555,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, u8 mode = BPF_MODE(insn->code); u32 dreg = insn->dst_reg; u32 sreg = insn->src_reg; - u32 spi, i; + u32 spi, i, fr; if (insn->code == 0) return 0; @@ -3426,7 +3572,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, if (class == BPF_ALU || class == BPF_ALU64) { if (!bt_is_reg_set(bt, dreg)) return 0; - if (opcode == BPF_MOV) { + if (opcode == BPF_END || opcode == BPF_NEG) { + /* sreg is reserved and unused + * dreg still need precision before this insn + */ + return 0; + } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { /* dreg = sreg or dreg = (s8, s16, s32)sreg * dreg needs precision after this insn @@ -3465,20 +3616,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * by 'precise' mark in corresponding register of this state. * No further tracking necessary. */ - if (insn->src_reg != BPF_REG_FP) + if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; - /* dreg = *(u64 *)[fp - off] was a fill from the stack. * that [fp - off] slot contains scalar that needs to be * tracked with precision */ - spi = (-insn->off - 1) / BPF_REG_SIZE; - if (spi >= 64) { - verbose(env, "BUG spi %d\n", spi); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - bt_set_slot(bt, spi); + spi = insn_stack_access_spi(hist->flags); + fr = insn_stack_access_frameno(hist->flags); + bt_set_frame_slot(bt, fr, spi); } else if (class == BPF_STX || class == BPF_ST) { if (bt_is_reg_set(bt, dreg)) /* stx & st shouldn't be using _scalar_ dst_reg @@ -3487,17 +3633,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, */ return -ENOTSUPP; /* scalars can only be spilled into stack */ - if (insn->dst_reg != BPF_REG_FP) + if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; - spi = (-insn->off - 1) / BPF_REG_SIZE; - if (spi >= 64) { - verbose(env, "BUG spi %d\n", spi); - WARN_ONCE(1, "verifier backtracking bug"); - return -EFAULT; - } - if (!bt_is_slot_set(bt, spi)) + spi = insn_stack_access_spi(hist->flags); + fr = insn_stack_access_frameno(hist->flags); + if (!bt_is_frame_slot_set(bt, fr, spi)) return 0; - bt_clear_slot(bt, spi); + bt_clear_frame_slot(bt, fr, spi); if (class == BPF_STX) bt_set_reg(bt, sreg); } else if (class == BPF_JMP || class == BPF_JMP32) { @@ -3541,10 +3683,14 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } - /* we don't track register spills perfectly, - * so fallback to force-precise instead of failing */ - if (bt_stack_mask(bt) != 0) - return -ENOTSUPP; + /* we are now tracking register spills correctly, + * so any instance of leftover slots is a bug + */ + if (bt_stack_mask(bt) != 0) { + verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)"); + return -EFAULT; + } /* propagate r1-r5 to the caller */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) { if (bt_is_reg_set(bt, i)) { @@ -3556,24 +3702,24 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, return -EFAULT; return 0; } - } else if ((bpf_helper_call(insn) && - is_callback_calling_function(insn->imm) && - !is_async_callback_calling_function(insn->imm)) || - (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) { - /* callback-calling helper or kfunc call, which means - * we are exiting from subprog, but unlike the subprog - * call handling above, we shouldn't propagate - * precision of r1-r5 (if any requested), as they are - * not actually arguments passed directly to callback - * subprogs + } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) { + /* exit from callback subprog to callback-calling helper or + * kfunc call. Use idx/subseq_idx check to discern it from + * straight line code backtracking. + * Unlike the subprog call handling above, we shouldn't + * propagate precision of r1-r5 (if any requested), as they are + * not actually arguments passed directly to callback subprogs */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; } - if (bt_stack_mask(bt) != 0) - return -ENOTSUPP; + if (bt_stack_mask(bt) != 0) { + verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); + WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)"); + return -EFAULT; + } /* clear r1-r5 in callback subprog's mask */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); @@ -3600,10 +3746,18 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, } else if (opcode == BPF_EXIT) { bool r0_precise; + /* Backtracking to a nested function call, 'idx' is a part of + * the inner frame 'subseq_idx' is a part of the outer frame. + * In case of a regular function call, instructions giving + * precision to registers R1-R5 should have been found already. + * In case of a callback, it is ok to have R1-R5 marked for + * backtracking, as these registers are set by the function + * invoking callback. + */ + if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - /* if backtracing was looking for registers R1-R5 - * they should have been found already. - */ verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; @@ -4002,6 +4156,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for (;;) { DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; + struct bpf_jmp_history_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", @@ -4047,11 +4202,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bitmap_from_u64(mask, bt_reg_mask(bt)); for_each_set_bit(i, mask, 32) { reg = &st->frame[0]->regs[i]; - if (reg->type != SCALAR_VALUE) { - bt_clear_reg(bt, i); - continue; - } - reg->precise = true; + bt_clear_reg(bt, i); + if (reg->type == SCALAR_VALUE) + reg->precise = true; } return 0; } @@ -4067,7 +4220,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = 0; skip_first = false; } else { - err = backtrack_insn(env, i, subseq_idx, bt); + hist = get_jmp_hist_entry(st, history, i); + err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { mark_all_scalars_precise(env, env->cur_state); @@ -4082,10 +4236,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * Nothing to be tracked further in the parent state. */ return 0; - if (i == first_idx) - break; subseq_idx = i; i = get_prev_insn_idx(st, i, &history); + if (i == -ENOENT) + break; if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 * and there are still reg_mask or stack_mask @@ -4120,22 +4274,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); for_each_set_bit(i, mask, 64) { if (i >= func->allocated_stack / BPF_REG_SIZE) { - /* the sequence of instructions: - * 2: (bf) r3 = r10 - * 3: (7b) *(u64 *)(r3 -8) = r0 - * 4: (79) r4 = *(u64 *)(r10 -8) - * doesn't contain jmps. It's backtracked - * as a single block. - * During backtracking insn 3 is not recognized as - * stack access, so at the end of backtracking - * stack slot fp-8 is still marked in stack_mask. - * However the parent state may not have accessed - * fp-8 and it's "unallocated" stack space. - * In such case fallback to conservative. - */ - mark_all_scalars_precise(env, env->cur_state); - bt_reset(bt); - return 0; + verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n", + i, func->allocated_stack / BPF_REG_SIZE); + WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)"); + return -EFAULT; } if (!is_spilled_scalar_reg(&func->stack[i])) { @@ -4225,9 +4367,17 @@ static bool register_is_null(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } -static bool register_is_const(struct bpf_reg_state *reg) +/* check if register is a constant scalar value */ +static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32) +{ + return reg->type == SCALAR_VALUE && + tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off); +} + +/* assuming is_reg_const() is true, return constant value of a register */ +static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32) { - return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); + return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value; } static bool __is_scalar_unbounded(struct bpf_reg_state *reg) @@ -4264,7 +4414,8 @@ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_ dst->live = live; } -static void save_register_state(struct bpf_func_state *state, +static void save_register_state(struct bpf_verifier_env *env, + struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, int size) { @@ -4279,7 +4430,7 @@ static void save_register_state(struct bpf_func_state *state, /* size < 8 bytes spill */ for (; i; i--) - scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]); + mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]); } static bool is_bpf_st_mem(struct bpf_insn *insn) @@ -4300,16 +4451,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; - u32 dst_reg = insn->dst_reg; + int insn_flags = insn_stack_access_flags(state->frameno, spi); - err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE)); - if (err) - return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ if (!env->allow_ptr_leaks && - state->stack[spi].slot_type[0] == STACK_SPILL && + is_spilled_reg(&state->stack[spi]) && size != BPF_REG_SIZE) { verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; @@ -4339,20 +4487,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; mark_stack_slot_scratched(env, spi); - if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && - !register_is_null(reg) && env->bpf_capable) { - if (dst_reg != BPF_REG_FP) { - /* The backtracking logic can only recognize explicit - * stack slot address like [fp - 8]. Other spill of - * scalar via different register has to be conservative. - * Backtrack from here and mark all registers as precise - * that contributed into 'reg' being a constant. - */ - err = mark_chain_precision(env, value_regno); - if (err) - return err; - } - save_register_state(state, spi, reg, size); + if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && env->bpf_capable) { + save_register_state(env, state, spi, reg, size); /* Break the relation on a narrowing spill. */ if (fls64(reg->umax_value) > BITS_PER_BYTE * size) state->stack[spi].spilled_ptr.id = 0; @@ -4360,9 +4496,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, insn->imm != 0 && env->bpf_capable) { struct bpf_reg_state fake_reg = {}; - __mark_reg_known(&fake_reg, (u32)insn->imm); + __mark_reg_known(&fake_reg, insn->imm); fake_reg.type = SCALAR_VALUE; - save_register_state(state, spi, &fake_reg, size); + save_register_state(env, state, spi, &fake_reg, size); } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -4374,7 +4510,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - save_register_state(state, spi, reg, size); + save_register_state(env, state, spi, reg, size); } else { u8 type = STACK_MISC; @@ -4399,7 +4535,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, /* when we zero initialize stack slots mark them as such */ if ((reg && register_is_null(reg)) || (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { - /* backtracking doesn't work for STACK_ZERO yet. */ + /* STACK_ZERO case happened because register spill + * wasn't properly aligned at the stack slot boundary, + * so it's not a register spill anymore; force + * originating register to be precise to make + * STACK_ZERO correct for subsequent states + */ err = mark_chain_precision(env, value_regno); if (err) return err; @@ -4408,9 +4549,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, /* Mark slots affected by this stack write. */ for (i = 0; i < size; i++) - state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = - type; + state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type; + insn_flags = 0; /* not a register spill */ } + + if (insn_flags) + return push_jmp_history(env, env->cur_state, insn_flags); return 0; } @@ -4460,10 +4604,6 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0)) writing_zero = true; - err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE)); - if (err) - return err; - for (i = min_off; i < max_off; i++) { int spi; @@ -4562,21 +4702,10 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env, zeros++; } if (zeros == max_off - min_off) { - /* any access_size read into register is zero extended, - * so the whole register == const_zero + /* Any access_size read into register is zero extended, + * so the whole register == const_zero. */ - __mark_reg_const_zero(&state->regs[dst_regno]); - /* backtracking doesn't support STACK_ZERO yet, - * so mark it precise here, so that later - * backtracking can stop here. - * Backtracking may not need this if this register - * doesn't participate in pointer adjustment. - * Forward propagation of precise flag is not - * necessary either. This mark is only to stop - * backtracking. Any register that contributed - * to const 0 was marked precise before spill. - */ - state->regs[dst_regno].precise = true; + __mark_reg_const_zero(env, &state->regs[dst_regno]); } else { /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, dst_regno); @@ -4603,6 +4732,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; struct bpf_reg_state *reg; u8 *stype, type; + int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; @@ -4635,25 +4765,42 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; } else { + int spill_cnt = 0, zero_cnt = 0; + for (i = 0; i < size; i++) { type = stype[(slot - i) % BPF_REG_SIZE]; - if (type == STACK_SPILL) + if (type == STACK_SPILL) { + spill_cnt++; continue; + } if (type == STACK_MISC) continue; + if (type == STACK_ZERO) { + zero_cnt++; + continue; + } if (type == STACK_INVALID && env->allow_uninit_stack) continue; verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } - mark_reg_unknown(env, state->regs, dst_regno); + + if (spill_cnt == size && + tnum_is_const(reg->var_off) && reg->var_off.value == 0) { + __mark_reg_const_zero(env, &state->regs[dst_regno]); + /* this IS register fill, so keep insn_flags */ + } else if (zero_cnt == size) { + /* similarly to mark_reg_stack_read(), preserve zeroes */ + __mark_reg_const_zero(env, &state->regs[dst_regno]); + insn_flags = 0; /* not restoring original register state */ + } else { + mark_reg_unknown(env, state->regs, dst_regno); + insn_flags = 0; /* not restoring original register state */ + } } state->regs[dst_regno].live |= REG_LIVE_WRITTEN; - return 0; - } - - if (dst_regno >= 0) { + } else if (dst_regno >= 0) { /* restore register state from stack */ copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely @@ -4689,7 +4836,10 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno >= 0) mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); + insn_flags = 0; /* we are not restoring spilled register */ } + if (insn_flags) + return push_jmp_history(env, env->cur_state, insn_flags); return 0; } @@ -4979,8 +5129,8 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, return 0; } -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +static int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) { return __check_ptr_off_reg(env, reg, regno, false); } @@ -5001,6 +5151,8 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, perm_flags |= PTR_UNTRUSTED; } else { perm_flags = PTR_MAYBE_NULL | MEM_ALLOC; + if (kptr_field->type == BPF_KPTR_PERCPU) + perm_flags |= MEM_PERCPU; } if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) @@ -5044,7 +5196,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, */ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, kptr_field->kptr.btf, kptr_field->kptr.btf_id, - kptr_field->type == BPF_KPTR_REF)) + kptr_field->type != BPF_KPTR_UNREF)) goto bad_type; return 0; bad_type: @@ -5072,7 +5224,9 @@ static bool in_rcu_cs(struct bpf_verifier_env *env) /* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */ BTF_SET_START(rcu_protected_types) BTF_ID(struct, prog_test_ref_kfunc) +#ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) +#endif BTF_ID(struct, bpf_cpumask) BTF_ID(struct, task_struct) BTF_SET_END(rcu_protected_types) @@ -5080,15 +5234,52 @@ BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) { if (!btf_is_kernel(btf)) - return false; + return true; return btf_id_set_contains(&rcu_protected_types, btf_id); } +static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field) +{ + struct btf_struct_meta *meta; + + if (btf_is_kernel(kptr_field->kptr.btf)) + return NULL; + + meta = btf_find_struct_meta(kptr_field->kptr.btf, + kptr_field->kptr.btf_id); + + return meta ? meta->record : NULL; +} + static bool rcu_safe_kptr(const struct btf_field *field) { const struct btf_field_kptr *kptr = &field->kptr; - return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id); + return field->type == BPF_KPTR_PERCPU || + (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id)); +} + +static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field) +{ + struct btf_record *rec; + u32 ret; + + ret = PTR_MAYBE_NULL; + if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) { + ret |= MEM_RCU; + if (kptr_field->type == BPF_KPTR_PERCPU) + ret |= MEM_PERCPU; + else if (!btf_is_kernel(kptr_field->kptr.btf)) + ret |= MEM_ALLOC; + + rec = kptr_pointee_btf_record(kptr_field); + if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE)) + ret |= NON_OWN_REF; + } else { + ret |= PTR_UNTRUSTED; + } + + return ret; } static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, @@ -5114,7 +5305,8 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We only allow loading referenced kptr, since it will be marked as * untrusted, similar to unreferenced kptr. */ - if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) { + if (class != BPF_LDX && + (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) { verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } @@ -5125,10 +5317,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * value from map as PTR_TO_BTF_ID, with the correct type. */ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, - rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ? - PTR_MAYBE_NULL | MEM_RCU : - PTR_MAYBE_NULL | PTR_UNTRUSTED); + kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { @@ -5182,6 +5371,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, switch (field->type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: + case BPF_KPTR_PERCPU: if (src != ACCESS_DIRECT) { verbose(env, "kptr cannot be accessed indirectly by helper\n"); return -EACCES; @@ -5578,20 +5768,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } -static int update_stack_depth(struct bpf_verifier_env *env, - const struct bpf_func_state *func, - int off) -{ - u16 stack = env->subprog_info[func->subprogno].stack_depth; - - if (stack >= -off) - return 0; - - /* update known max for given subprogram */ - env->subprog_info[func->subprogno].stack_depth = -off; - return 0; -} - /* starting from main bpf function walk all instructions of the function * and recursively walk all callees that given function can call. * Ignore jump and exit insns. @@ -5649,6 +5825,27 @@ continue_func: for (; i < subprog_end; i++) { int next_insn, sidx; + if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) { + bool err = false; + + if (!is_bpf_throw_kfunc(insn + i)) + continue; + if (subprog[idx].is_cb) + err = true; + for (int c = 0; c < frame && !err; c++) { + if (subprog[ret_prog[c]].is_cb) { + err = true; + break; + } + } + if (!err) + continue; + verbose(env, + "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n", + i, idx); + return -EINVAL; + } + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ @@ -5671,6 +5868,10 @@ continue_func: /* async callbacks don't increase bpf prog stack size unless called directly */ if (!bpf_pseudo_call(insn + i)) continue; + if (subprog[sidx].is_exception_cb) { + verbose(env, "insn %d cannot call exception cb directly\n", i); + return -EINVAL; + } } i = next_insn; idx = sidx; @@ -5692,8 +5893,13 @@ continue_func: * tail call counter throughout bpf2bpf calls combined with tailcalls */ if (tail_call_reachable) - for (j = 0; j < frame; j++) + for (j = 0; j < frame; j++) { + if (subprog[ret_prog[j]].is_exception_cb) { + verbose(env, "cannot tail call within exception cb\n"); + return -EINVAL; + } subprog[ret_prog[j]].tail_call_reachable = true; + } if (subprog[0].tail_call_reachable) env->prog->aux->tail_call_reachable = true; @@ -5833,9 +6039,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) * values are also truncated so we push 64-bit bounds into * 32-bit bounds. Above were truncated < 32-bits already. */ - if (size >= 4) - return; - __reg_combine_64_into_32(reg); + if (size < 4) { + __mark_reg32_unbounded(reg); + reg_bounds_sync(reg); + } } static void set_sext64_default_val(struct bpf_reg_state *reg, int size) @@ -6209,7 +6416,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && - !reg->ref_obj_id) { + !(reg->type & MEM_RCU) && !reg->ref_obj_id) { verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); return -EFAULT; } @@ -6350,13 +6557,14 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, * The minimum valid offset is -MAX_BPF_STACK for writes, and * -state->allocated_stack for reads. */ -static int check_stack_slot_within_bounds(int off, - struct bpf_func_state *state, - enum bpf_access_type t) +static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, + s64 off, + struct bpf_func_state *state, + enum bpf_access_type t) { int min_valid_off; - if (t == BPF_WRITE) + if (t == BPF_WRITE || env->allow_uninit_stack) min_valid_off = -MAX_BPF_STACK; else min_valid_off = -state->allocated_stack; @@ -6379,7 +6587,7 @@ static int check_stack_access_within_bounds( struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; struct bpf_func_state *state = func(env, reg); - int min_off, max_off; + s64 min_off, max_off; int err; char *err_extra; @@ -6392,11 +6600,8 @@ static int check_stack_access_within_bounds( err_extra = " write to"; if (tnum_is_const(reg->var_off)) { - min_off = reg->var_off.value + off; - if (access_size > 0) - max_off = min_off + access_size - 1; - else - max_off = min_off; + min_off = (s64)reg->var_off.value + off; + max_off = min_off + access_size; } else { if (reg->smax_value >= BPF_MAX_VAR_OFF || reg->smin_value <= -BPF_MAX_VAR_OFF) { @@ -6405,15 +6610,12 @@ static int check_stack_access_within_bounds( return -EACCES; } min_off = reg->smin_value + off; - if (access_size > 0) - max_off = reg->smax_value + off + access_size - 1; - else - max_off = min_off; + max_off = reg->smax_value + off + access_size; } - err = check_stack_slot_within_bounds(min_off, state, type); - if (!err) - err = check_stack_slot_within_bounds(max_off, state, type); + err = check_stack_slot_within_bounds(env, min_off, state, type); + if (!err && max_off > 0) + err = -EINVAL; /* out of stack access into non-negative offsets */ if (err) { if (tnum_is_const(reg->var_off)) { @@ -6423,11 +6625,16 @@ static int check_stack_access_within_bounds( char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n", - err_extra, regno, tn_buf, access_size); + verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n", + err_extra, regno, tn_buf, off, access_size); } + return err; } - return err; + + /* Note that there is no stack access with offset zero, so the needed stack + * size is -min_off, not -min_off+1. + */ + return grow_stack_state(env, state, -min_off /* size */); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -6442,7 +6649,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; - struct bpf_func_state *state; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -6585,11 +6791,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; - state = func(env, reg); - err = update_stack_depth(env, state, off); - if (err) - return err; - if (t == BPF_READ) err = check_stack_read(env, regno, off, size, value_regno); @@ -6778,13 +6979,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; - return 0; } /* When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending - * on the access type, that all elements of the stack are initialized. + * on the access type and privileges, that all elements of the stack are + * initialized. * * 'off' includes 'regno->off', but not its dynamic part (if any). * @@ -6892,8 +7093,11 @@ static int check_stack_range_initialized( slot = -i - 1; spi = slot / BPF_REG_SIZE; - if (state->allocated_stack <= slot) - goto err; + if (state->allocated_stack <= slot) { + verbose(env, "verifier bug: allocated_stack too small"); + return -EFAULT; + } + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; if (*stype == STACK_MISC) goto mark; @@ -6917,7 +7121,6 @@ static int check_stack_range_initialized( goto mark; } -err: if (tnum_is_const(reg->var_off)) { verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", err_extra, regno, min_off, i - min_off, access_size); @@ -6942,7 +7145,7 @@ mark: * helper may write to the entire memory range. */ } - return update_stack_depth(env, state, min_off); + return 0; } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, @@ -7038,6 +7241,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +/* verify arguments to helpers or kfuncs consisting of a pointer and an access + * size. + * + * @regno is the register containing the access size. regno-1 is the register + * containing the pointer. + */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, bool zero_size_allowed, @@ -7072,12 +7281,10 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, return -EACCES; } - if (reg->umin_value == 0) { - err = check_helper_mem_access(env, regno - 1, 0, - zero_size_allowed, - meta); - if (err) - return err; + if (reg->umin_value == 0 && !zero_size_allowed) { + verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n", + regno, reg->umin_value, reg->umax_value); + return -EACCES; } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { @@ -7093,8 +7300,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, return err; } -int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno, u32 mem_size) +static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size) { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; @@ -7320,7 +7527,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; } - if (kptr_field->type != BPF_KPTR_REF) { + if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) { verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); return -EACCES; } @@ -7491,15 +7698,24 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id return err; } - err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots); + err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots); if (err) return err; } else { /* iter_next() or iter_destroy() expect initialized iter state*/ - if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) { + err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots); + switch (err) { + case 0: + break; + case -EINVAL: verbose(env, "expected an initialized iter_%s as arg #%d\n", iter_type_str(meta->btf, btf_id), regno); - return -EINVAL; + return err; + case -EPROTO: + verbose(env, "expected an RCU CS when using %s\n", meta->func_name); + return err; + default: + return err; } spi = iter_get_spi(env, reg, nr_slots); @@ -7525,6 +7741,81 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id return 0; } +/* Look for a previous loop entry at insn_idx: nearest parent state + * stopped at insn_idx with callsites matching those in cur->frame. + */ +static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur, + int insn_idx) +{ + struct bpf_verifier_state_list *sl; + struct bpf_verifier_state *st; + + /* Explored states are pushed in stack order, most recent states come first */ + sl = *explored_state(env, insn_idx); + for (; sl; sl = sl->next) { + /* If st->branches != 0 state is a part of current DFS verification path, + * hence cur & st for a loop. + */ + st = &sl->state; + if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) && + st->dfs_depth < cur->dfs_depth) + return st; + } + + return NULL; +} + +static void reset_idmap_scratch(struct bpf_verifier_env *env); +static bool regs_exact(const struct bpf_reg_state *rold, + const struct bpf_reg_state *rcur, + struct bpf_idmap *idmap); + +static void maybe_widen_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *rold, struct bpf_reg_state *rcur, + struct bpf_idmap *idmap) +{ + if (rold->type != SCALAR_VALUE) + return; + if (rold->type != rcur->type) + return; + if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap)) + return; + __mark_reg_unknown(env, rcur); +} + +static int widen_imprecise_scalars(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + struct bpf_func_state *fold, *fcur; + int i, fr; + + reset_idmap_scratch(env); + for (fr = old->curframe; fr >= 0; fr--) { + fold = old->frame[fr]; + fcur = cur->frame[fr]; + + for (i = 0; i < MAX_BPF_REG; i++) + maybe_widen_reg(env, + &fold->regs[i], + &fcur->regs[i], + &env->idmap_scratch); + + for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) { + if (!is_spilled_reg(&fold->stack[i]) || + !is_spilled_reg(&fcur->stack[i])) + continue; + + maybe_widen_reg(env, + &fold->stack[i].spilled_ptr, + &fcur->stack[i].spilled_ptr, + &env->idmap_scratch); + } + } + return 0; +} + /* process_iter_next_call() is called when verifier gets to iterator's next * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer * to it as just "iter_next()" in comments below. @@ -7566,25 +7857,47 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id * is some statically known limit on number of iterations (e.g., if there is * an explicit `if n > 100 then break;` statement somewhere in the loop). * - * One very subtle but very important aspect is that we *always* simulate NULL - * condition first (as the current state) before we simulate non-NULL case. - * This has to do with intricacies of scalar precision tracking. By simulating - * "exit condition" of iter_next() returning NULL first, we make sure all the - * relevant precision marks *that will be set **after** we exit iterator loop* - * are propagated backwards to common parent state of NULL and non-NULL - * branches. Thanks to that, state equivalence checks done later in forked - * state, when reaching iter_next() for ACTIVE iterator, can assume that - * precision marks are finalized and won't change. Because simulating another - * ACTIVE iterator iteration won't change them (because given same input - * states we'll end up with exactly same output states which we are currently - * comparing; and verification after the loop already propagated back what - * needs to be **additionally** tracked as precise). It's subtle, grok - * precision tracking for more intuitive understanding. + * Iteration convergence logic in is_state_visited() relies on exact + * states comparison, which ignores read and precision marks. + * This is necessary because read and precision marks are not finalized + * while in the loop. Exact comparison might preclude convergence for + * simple programs like below: + * + * i = 0; + * while(iter_next(&it)) + * i++; + * + * At each iteration step i++ would produce a new distinct state and + * eventually instruction processing limit would be reached. + * + * To avoid such behavior speculatively forget (widen) range for + * imprecise scalar registers, if those registers were not precise at the + * end of the previous iteration and do not match exactly. + * + * This is a conservative heuristic that allows to verify wide range of programs, + * however it precludes verification of programs that conjure an + * imprecise value on the first loop iteration and use it as precise on a second. + * For example, the following safe program would fail to verify: + * + * struct bpf_num_iter it; + * int arr[10]; + * int i = 0, a = 0; + * bpf_iter_num_new(&it, 0, 10); + * while (bpf_iter_num_next(&it)) { + * if (a == 0) { + * a = 1; + * i = 7; // Because i changed verifier would forget + * // it's range on second loop entry. + * } else { + * arr[i] = 42; // This would fail to verify. + * } + * } + * bpf_iter_num_destroy(&it); */ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_verifier_state *cur_st = env->cur_state, *queued_st; + struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr; struct bpf_reg_state *cur_iter, *queued_iter; int iter_frameno = meta->iter.frameno; @@ -7602,6 +7915,19 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, } if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) { + /* Because iter_next() call is a checkpoint is_state_visitied() + * should guarantee parent state with same call sites and insn_idx. + */ + if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx || + !same_callsites(cur_st->parent, cur_st)) { + verbose(env, "bug: bad parent state for iter next call"); + return -EFAULT; + } + /* Note cur_st->parent in the call below, it is necessary to skip + * checkpoint created for cur_st by is_state_visited() + * right at this instruction. + */ + prev_st = find_prev_entry(env, cur_st->parent, insn_idx); /* branch out active iter state */ queued_st = push_stack(env, insn_idx + 1, insn_idx, false); if (!queued_st) @@ -7610,6 +7936,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; queued_iter->iter.depth++; + if (prev_st) + widen_imprecise_scalars(env, prev_st, queued_st); queued_fr = queued_st->frame[queued_st->curframe]; mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]); @@ -7618,7 +7946,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, /* switch to DRAINED state, but keep the depth unchanged */ /* mark current iter state as drained and assume returned NULL */ cur_iter->iter.state = BPF_ITER_STATE_DRAINED; - __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]); + __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]); return 0; } @@ -7753,6 +8081,7 @@ static const struct bpf_reg_types btf_ptr_types = { static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU, + PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU, PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED, } }; @@ -7831,8 +8160,10 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (base_type(arg_type) == ARG_PTR_TO_MEM) type &= ~DYNPTR_TYPE_FLAG_MASK; - if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) { type &= ~MEM_ALLOC; + type &= ~MEM_PERCPU; + } for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; @@ -7915,6 +8246,7 @@ found: break; } case PTR_TO_BTF_ID | MEM_ALLOC: + case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && meta->func_id != BPF_FUNC_kptr_xchg) { verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); @@ -7926,6 +8258,7 @@ found: } break; case PTR_TO_BTF_ID | MEM_PERCPU: + case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU: case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED: /* Handled by helper specific checks */ break; @@ -7953,9 +8286,9 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields) return field; } -int check_func_arg_reg_off(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type) +static int check_func_arg_reg_off(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + enum bpf_arg_type arg_type) { u32 type = reg->type; @@ -8089,6 +8422,54 @@ static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, return state->stack[spi].spilled_ptr.dynptr.type; } +static int check_reg_const_str(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno) +{ + struct bpf_map *map = reg->map_ptr; + int err; + int map_off; + u64 map_addr; + char *str_ptr; + + if (reg->type != PTR_TO_MAP_VALUE) + return -EINVAL; + + if (!bpf_map_is_rdonly(map)) { + verbose(env, "R%d does not point to a readonly map'\n", regno); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d is not a constant address'\n", regno); + return -EACCES; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + return -EACCES; + } + + err = check_map_access(env, regno, reg->off, + map->value_size - reg->off, false, + ACCESS_HELPER); + if (err) + return err; + + map_off = reg->off + reg->var_off.value; + err = map->ops->map_direct_value_addr(map, &map_addr, map_off); + if (err) { + verbose(env, "direct value access on string failed\n"); + return err; + } + + str_ptr = (char *)(long)(map_addr); + if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) { + verbose(env, "string is not zero-terminated\n"); + return -EINVAL; + } + return 0; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn, @@ -8333,44 +8714,9 @@ skip_type_check: } case ARG_PTR_TO_CONST_STR: { - struct bpf_map *map = reg->map_ptr; - int map_off; - u64 map_addr; - char *str_ptr; - - if (!bpf_map_is_rdonly(map)) { - verbose(env, "R%d does not point to a readonly map'\n", regno); - return -EACCES; - } - - if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d is not a constant address'\n", regno); - return -EACCES; - } - - if (!map->ops->map_direct_value_addr) { - verbose(env, "no direct value access support for this map type\n"); - return -EACCES; - } - - err = check_map_access(env, regno, reg->off, - map->value_size - reg->off, false, - ACCESS_HELPER); + err = check_reg_const_str(env, reg, regno); if (err) return err; - - map_off = reg->off + reg->var_off.value; - err = map->ops->map_direct_value_addr(map, &map_addr, map_off); - if (err) { - verbose(env, "direct value access on string failed\n"); - return err; - } - - str_ptr = (char *)(long)(map_addr); - if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) { - verbose(env, "string is not zero-terminated\n"); - return -EINVAL; - } break; } case ARG_PTR_TO_KPTR: @@ -8839,7 +9185,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); - check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK); } } @@ -8852,11 +9198,10 @@ static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); -static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx, int subprog, - set_callee_state_fn set_callee_state_cb) +static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite, + set_callee_state_fn set_callee_state_cb, + struct bpf_verifier_state *state) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *caller, *callee; int err; @@ -8866,53 +9211,168 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -E2BIG; } - caller = state->frame[state->curframe]; if (state->frame[state->curframe + 1]) { verbose(env, "verifier bug. Frame %d already allocated\n", state->curframe + 1); return -EFAULT; } - err = btf_check_subprog_call(env, subprog, caller->regs); - if (err == -EFAULT) - return err; - if (subprog_is_global(env, subprog)) { - if (err) { - verbose(env, "Caller passes invalid args into func#%d\n", - subprog); - return err; - } else { - if (env->log.level & BPF_LOG_LEVEL) - verbose(env, - "Func#%d is global and valid. Skipping.\n", - subprog); - clear_caller_saved_regs(env, caller->regs); + caller = state->frame[state->curframe]; + callee = kzalloc(sizeof(*callee), GFP_KERNEL); + if (!callee) + return -ENOMEM; + state->frame[state->curframe + 1] = callee; + + /* callee cannot access r0, r6 - r9 for reading and has to write + * into its own stack before reading from it. + * callee can read/write into caller's stack + */ + init_func_state(env, callee, + /* remember the callsite, it will be used by bpf_exit */ + callsite, + state->curframe + 1 /* frameno within this callchain */, + subprog /* subprog number within this prog */); + /* Transfer references to the callee */ + err = copy_reference_state(callee, caller); + err = err ?: set_callee_state_cb(env, caller, callee, callsite); + if (err) + goto err_out; - /* All global functions return a 64-bit SCALAR_VALUE */ - mark_reg_unknown(env, caller->regs, BPF_REG_0); - caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* only increment it after check_reg_arg() finished */ + state->curframe++; - /* continue with next insn after call */ - return 0; + return 0; + +err_out: + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return err; +} + +static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, + const struct btf *btf, + struct bpf_reg_state *regs) +{ + struct bpf_subprog_info *sub = subprog_info(env, subprog); + struct bpf_verifier_log *log = &env->log; + u32 i; + int ret; + + ret = btf_prepare_func_args(env, subprog); + if (ret) + return ret; + + /* check that BTF function arguments match actual types that the + * verifier sees. + */ + for (i = 0; i < sub->arg_cnt; i++) { + u32 regno = i + 1; + struct bpf_reg_state *reg = ®s[regno]; + struct bpf_subprog_arg_info *arg = &sub->args[i]; + + if (arg->arg_type == ARG_ANYTHING) { + if (reg->type != SCALAR_VALUE) { + bpf_log(log, "R%d is not a scalar\n", regno); + return -EINVAL; + } + } else if (arg->arg_type == ARG_PTR_TO_CTX) { + ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); + if (ret < 0) + return ret; + /* If function expects ctx type in BTF check that caller + * is passing PTR_TO_CTX. + */ + if (reg->type != PTR_TO_CTX) { + bpf_log(log, "arg#%d expects pointer to ctx\n", i); + return -EINVAL; + } + } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { + ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); + if (ret < 0) + return ret; + if (check_mem_reg(env, reg, regno, arg->mem_size)) + return -EINVAL; + if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) { + bpf_log(log, "arg#%d is expected to be non-NULL\n", i); + return -EINVAL; + } + } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { + ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); + if (ret) + return ret; + } else { + bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n", + i, arg->arg_type); + return -EFAULT; } } + return 0; +} + +/* Compare BTF of a function call with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. + */ +static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs) +{ + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + u32 btf_id; + int err; + + if (!prog->aux->func_info) + return -EINVAL; + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) + return -EFAULT; + + if (prog->aux->func_info_aux[subprog].unreliable) + return -EINVAL; + + err = btf_check_func_arg_match(env, subprog, btf, regs); + /* Compiler optimizations can remove arguments from static functions + * or mismatched type can be passed into a global function. + * In such cases mark the function as unreliable from BTF point of view. + */ + if (err) + prog->aux->func_info_aux[subprog].unreliable = true; + return err; +} + +static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int insn_idx, int subprog, + set_callee_state_fn set_callee_state_cb) +{ + struct bpf_verifier_state *state = env->cur_state, *callback_state; + struct bpf_func_state *caller, *callee; + int err; + + caller = state->frame[state->curframe]; + err = btf_check_subprog_call(env, subprog, caller->regs); + if (err == -EFAULT) + return err; + /* set_callee_state is used for direct subprog calls, but we are * interested in validating only BPF helpers that can call subprogs as * callbacks */ - if (set_callee_state_cb != set_callee_state) { - if (bpf_pseudo_kfunc_call(insn) && - !is_callback_calling_kfunc(insn->imm)) { - verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); - return -EFAULT; - } else if (!bpf_pseudo_kfunc_call(insn) && - !is_callback_calling_function(insn->imm)) { /* helper */ - verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); - return -EFAULT; - } + env->subprog_info[subprog].is_cb = true; + if (bpf_pseudo_kfunc_call(insn) && + !is_sync_callback_calling_kfunc(insn->imm)) { + verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } else if (!bpf_pseudo_kfunc_call(insn) && + !is_callback_calling_function(insn->imm)) { /* helper */ + verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; } if (insn->code == (BPF_JMP | BPF_CALL) && @@ -8923,53 +9383,88 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* there is no real recursion here. timer callbacks are async */ env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, - *insn_idx, subprog); + insn_idx, subprog); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; callee->async_entry_cnt = caller->async_entry_cnt + 1; /* Convert bpf_timer_set_callback() args into timer callback args */ - err = set_callee_state_cb(env, caller, callee, *insn_idx); + err = set_callee_state_cb(env, caller, callee, insn_idx); if (err) return err; + return 0; + } + + /* for callback functions enqueue entry to callback and + * proceed with next instruction within current frame. + */ + callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); + if (!callback_state) + return -ENOMEM; + + err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb, + callback_state); + if (err) + return err; + + callback_state->callback_unroll_depth++; + callback_state->frame[callback_state->curframe - 1]->callback_depth++; + caller->callback_depth = 0; + return 0; +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller; + int err, subprog, target_insn; + + target_insn = *insn_idx + insn->imm + 1; + subprog = find_subprog(env, target_insn); + if (subprog < 0) { + verbose(env, "verifier bug. No program starts at insn %d\n", target_insn); + return -EFAULT; + } + + caller = state->frame[state->curframe]; + err = btf_check_subprog_call(env, subprog, caller->regs); + if (err == -EFAULT) + return err; + if (subprog_is_global(env, subprog)) { + const char *sub_name = subprog_name(env, subprog); + + if (err) { + verbose(env, "Caller passes invalid args into func#%d ('%s')\n", + subprog, sub_name); + return err; + } + + verbose(env, "Func#%d ('%s') is global and assumed valid.\n", + subprog, sub_name); + /* mark global subprog for verifying after main prog */ + subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); + + /* All global functions return a 64-bit SCALAR_VALUE */ mark_reg_unknown(env, caller->regs, BPF_REG_0); caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* continue with next insn after call */ return 0; } - callee = kzalloc(sizeof(*callee), GFP_KERNEL); - if (!callee) - return -ENOMEM; - state->frame[state->curframe + 1] = callee; - - /* callee cannot access r0, r6 - r9 for reading and has to write - * into its own stack before reading from it. - * callee can read/write into caller's stack + /* for regular function entry setup new frame and continue + * from that frame. */ - init_func_state(env, callee, - /* remember the callsite, it will be used by bpf_exit */ - *insn_idx /* callsite */, - state->curframe + 1 /* frameno within this callchain */, - subprog /* subprog number within this prog */); - - /* Transfer references to the callee */ - err = copy_reference_state(callee, caller); + err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state); if (err) - goto err_out; - - err = set_callee_state_cb(env, caller, callee, *insn_idx); - if (err) - goto err_out; + return err; clear_caller_saved_regs(env, caller->regs); - /* only increment it after check_reg_arg() finished */ - state->curframe++; - /* and go analyze first insn of the callee */ *insn_idx = env->subprog_info[subprog].start - 1; @@ -8977,14 +9472,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn verbose(env, "caller:\n"); print_verifier_state(env, caller, true); verbose(env, "callee:\n"); - print_verifier_state(env, callee, true); + print_verifier_state(env, state->frame[state->curframe], true); } - return 0; -err_out: - free_func_state(callee); - state->frame[state->curframe + 1] = NULL; - return err; + return 0; } int map_set_for_each_callback_args(struct bpf_verifier_env *env, @@ -9028,22 +9519,6 @@ static int set_callee_state(struct bpf_verifier_env *env, return 0; } -static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx) -{ - int subprog, target_insn; - - target_insn = *insn_idx + insn->imm + 1; - subprog = find_subprog(env, target_insn); - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", - target_insn); - return -EFAULT; - } - - return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); -} - static int set_map_elem_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -9070,7 +9545,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return err; callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9092,7 +9567,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9122,7 +9597,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9141,7 +9616,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID; __mark_reg_known_zero(&callee->regs[BPF_REG_2]); callee->regs[BPF_REG_2].btf = btf_vmlinux; - callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA], + callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA]; /* pointer to stack or null */ callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4]; @@ -9150,7 +9625,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9173,7 +9648,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9205,7 +9680,7 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; - callee->callback_ret_range = tnum_range(0, 1); + callee->callback_ret_range = retval_range(0, 1); return 0; } @@ -9234,11 +9709,17 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) return is_rbtree_lock_required_kfunc(kfunc_btf_id); } +static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) +{ + return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *state = env->cur_state, *prev_st; struct bpf_func_state *caller, *callee; struct bpf_reg_state *r0; + bool in_callback_fn; int err; callee = state->frame[state->curframe]; @@ -9256,17 +9737,28 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller = state->frame[state->curframe - 1]; if (callee->in_callback_fn) { - /* enforce R0 return value range [0, 1]. */ - struct tnum range = callee->callback_ret_range; - if (r0->type != SCALAR_VALUE) { verbose(env, "R0 not a scalar value\n"); return -EACCES; } - if (!tnum_in(range, r0->var_off)) { - verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); + + /* we are going to rely on register's precise value */ + err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64); + err = err ?: mark_chain_precision(env, BPF_REG_0); + if (err) + return err; + + /* enforce R0 return value range */ + if (!retval_range_within(callee->callback_ret_range, r0)) { + verbose_invalid_scalar(env, r0, callee->callback_ret_range, + "At callback return", "R0"); return -EINVAL; } + if (!calls_callback(env, callee->callsite)) { + verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n", + *insn_idx, callee->callsite); + return -EFAULT; + } } else { /* return to the caller whatever r0 had in the callee */ caller->regs[BPF_REG_0] = *r0; @@ -9284,27 +9776,56 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; } - *insn_idx = callee->callsite + 1; + /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite, + * there function call logic would reschedule callback visit. If iteration + * converges is_state_visited() would prune that visit eventually. + */ + in_callback_fn = callee->in_callback_fn; + if (in_callback_fn) + *insn_idx = callee->callsite; + else + *insn_idx = callee->callsite + 1; + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, callee, true); verbose(env, "to caller at %d:\n", *insn_idx); print_verifier_state(env, caller, true); } - /* clear everything in the callee */ + /* clear everything in the callee. In case of exceptional exits using + * bpf_throw, this will be done by copy_verifier_state for extra frames. */ free_func_state(callee); state->frame[state->curframe--] = NULL; + + /* for callbacks widen imprecise scalars to make programs like below verify: + * + * struct ctx { int i; } + * void cb(int idx, struct ctx *ctx) { ctx->i++; ... } + * ... + * struct ctx = { .i = 0; } + * bpf_loop(100, cb, &ctx, 0); + * + * This is similar to what is done in process_iter_next_call() for open + * coded iterators. + */ + prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL; + if (prev_st) { + err = widen_imprecise_scalars(env, prev_st, state); + if (err) + return err; + } return 0; } -static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, - int func_id, - struct bpf_call_arg_meta *meta) +static int do_refine_retval_range(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, int ret_type, + int func_id, + struct bpf_call_arg_meta *meta) { struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; if (ret_type != RET_INTEGER) - return; + return 0; switch (func_id) { case BPF_FUNC_get_stack: @@ -9330,6 +9851,8 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, reg_bounds_sync(ret_reg); break; } + + return reg_bounds_sanity_check(env, ret_reg, "retval"); } static int @@ -9399,7 +9922,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, val = reg->var_off.value; max = map->max_entries; - if (!(register_is_const(reg) && val < max)) { + if (!(is_reg_const(reg, false) && val < max)) { bpf_map_key_store(aux, BPF_MAP_KEY_POISON); return 0; } @@ -9415,17 +9938,17 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; } -static int check_reference_leak(struct bpf_verifier_env *env) +static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit) { struct bpf_func_state *state = cur_func(env); bool refs_lingering = false; int i; - if (state->frameno && !state->in_callback_fn) + if (!exception_exit && state->frameno && !state->in_callback_fn) return 0; for (i = 0; i < state->acquired_refs; i++) { - if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno) + if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno) continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); @@ -9532,6 +10055,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn int *insn_idx_p) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + bool returns_cpu_specific_alloc_ptr = false; const struct bpf_func_proto *fn = NULL; enum bpf_return_type ret_type; enum bpf_type_flag ret_flag; @@ -9642,6 +10166,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); + } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { + u32 ref_obj_id = meta.ref_obj_id; + bool in_rcu = in_rcu_cs(env); + struct bpf_func_state *state; + struct bpf_reg_state *reg; + + err = release_reference_state(cur_func(env), ref_obj_id); + if (!err) { + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->ref_obj_id == ref_obj_id) { + if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { + reg->ref_obj_id = 0; + reg->type &= ~MEM_ALLOC; + reg->type |= MEM_RCU; + } else { + mark_reg_invalid(env, reg); + } + } + })); + } } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); } else if (register_is_null(®s[meta.release_regno])) { @@ -9659,7 +10203,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn switch (func_id) { case BPF_FUNC_tail_call: - err = check_reference_leak(env); + err = check_reference_leak(env, false); if (err) { verbose(env, "tail_call would lead to reference leak\n"); return err; @@ -9675,24 +10219,37 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_for_each_map_elem: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_map_elem_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_map_elem_callback_state); break; case BPF_FUNC_timer_set_callback: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_timer_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_timer_callback_state); break; case BPF_FUNC_find_vma: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_find_vma_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_find_vma_callback_state); break; case BPF_FUNC_snprintf: err = check_bpf_snprintf_call(env, regs); break; case BPF_FUNC_loop: update_loop_inline_state(env, meta.subprogno); - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_loop_callback_state); + /* Verifier relies on R1 value to determine if bpf_loop() iteration + * is finished, thus mark it precise. + */ + err = mark_chain_precision(env, BPF_REG_1); + if (err) + return err; + if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_loop_callback_state); + } else { + cur_func(env)->callback_depth = 0; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame%d bpf_loop iteration limit reached\n", + env->cur_state->curframe); + } break; case BPF_FUNC_dynptr_from_mem: if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { @@ -9770,9 +10327,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn break; } + case BPF_FUNC_per_cpu_ptr: + case BPF_FUNC_this_cpu_ptr: + { + struct bpf_reg_state *reg = ®s[BPF_REG_1]; + const struct btf_type *type; + + if (reg->type & MEM_RCU) { + type = btf_type_by_id(reg->btf, reg->btf_id); + if (!type || !btf_type_is_struct(type)) { + verbose(env, "Helper has invalid btf/btf_id in R1\n"); + return -EFAULT; + } + returns_cpu_specific_alloc_ptr = true; + env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true; + } + break; + } case BPF_FUNC_user_ringbuf_drain: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_user_ringbuf_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_user_ringbuf_callback_state); break; } @@ -9859,14 +10433,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { - /* MEM_RDONLY may be carried from ret_flag, but it - * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise - * it will confuse the check of PTR_TO_BTF_ID in - * check_mem_access(). - */ - ret_flag &= ~MEM_RDONLY; + if (returns_cpu_specific_alloc_ptr) { + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU; + } else { + /* MEM_RDONLY may be carried from ret_flag, but it + * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise + * it will confuse the check of PTR_TO_BTF_ID in + * check_mem_access(). + */ + ret_flag &= ~MEM_RDONLY; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; + } - regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } @@ -9882,8 +10460,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (func_id == BPF_FUNC_kptr_xchg) { ret_btf = meta.kptr_field->kptr.btf; ret_btf_id = meta.kptr_field->kptr.btf_id; - if (!btf_is_kernel(ret_btf)) + if (!btf_is_kernel(ret_btf)) { regs[BPF_REG_0].type |= MEM_ALLOC; + if (meta.kptr_field->type == BPF_KPTR_PERCPU) + regs[BPF_REG_0].type |= MEM_PERCPU; + } } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); @@ -9936,7 +10517,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].ref_obj_id = id; } - do_refine_retval_range(regs, fn->ret_type, func_id, &meta); + err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); + if (err) + return err; err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) @@ -10030,6 +10613,11 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RCU; } +static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RCU_PROTECTED; +} + static bool __kfunc_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix) @@ -10104,6 +10692,16 @@ static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr"); } +static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__nullable"); +} + +static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__str"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -10246,6 +10844,8 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_CALLBACK, KF_ARG_PTR_TO_RB_ROOT, KF_ARG_PTR_TO_RB_NODE, + KF_ARG_PTR_TO_NULL, + KF_ARG_PTR_TO_CONST_STR, }; enum special_kfunc_type { @@ -10268,6 +10868,10 @@ enum special_kfunc_type { KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, + KF_bpf_percpu_obj_new_impl, + KF_bpf_percpu_obj_drop_impl, + KF_bpf_throw, + KF_bpf_iter_css_task_new, }; BTF_SET_START(special_kfunc_set) @@ -10288,6 +10892,12 @@ BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) +BTF_ID(func, bpf_percpu_obj_new_impl) +BTF_ID(func, bpf_percpu_obj_drop_impl) +BTF_ID(func, bpf_throw) +#ifdef CONFIG_CGROUPS +BTF_ID(func, bpf_iter_css_task_new) +#endif BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -10310,6 +10920,14 @@ BTF_ID(func, bpf_dynptr_from_xdp) BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) +BTF_ID(func, bpf_percpu_obj_new_impl) +BTF_ID(func, bpf_percpu_obj_drop_impl) +BTF_ID(func, bpf_throw) +#ifdef CONFIG_CGROUPS +BTF_ID(func, bpf_iter_css_task_new) +#else +BTF_ID_UNUSED +#endif static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -10378,6 +10996,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) return KF_ARG_PTR_TO_RB_NODE; + if (is_kfunc_arg_const_str(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_CONST_STR; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -10390,6 +11011,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; + if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) + return KF_ARG_PTR_TO_NULL; if (argno + 1 < nargs && (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || @@ -10622,11 +11245,17 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; } -static bool is_callback_calling_kfunc(u32 btf_id) +static bool is_sync_callback_calling_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } +static bool is_bpf_throw_kfunc(struct bpf_insn *insn) +{ + return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && + insn->imm == special_kfunc_list[KF_bpf_throw]; +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id) { return is_bpf_rbtree_api_kfunc(btf_id); @@ -10834,6 +11463,28 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, &meta->arg_rbtree_root.field); } +/* + * css_task iter allowlist is needed to avoid dead locking on css_set_lock. + * LSM hooks and iters (both sleepable and non-sleepable) are safe. + * Any sleepable progs are also safe since bpf_check_attach_target() enforce + * them can only be attached to some specific hook points. + */ +static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env) +{ + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + + switch (prog_type) { + case BPF_PROG_TYPE_LSM: + return true; + case BPF_PROG_TYPE_TRACING: + if (env->prog->expected_attach_type == BPF_TRACE_ITER) + return true; + fallthrough; + default: + return env->prog->aux->sleepable; + } +} + static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, int insn_idx) { @@ -10920,7 +11571,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && - (register_is_null(reg) || type_may_be_null(reg->type))) { + (register_is_null(reg) || type_may_be_null(reg->type)) && + !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; } @@ -10945,6 +11597,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return kf_arg_type; switch (kf_arg_type) { + case KF_ARG_PTR_TO_NULL: + continue; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) @@ -10976,6 +11630,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_MEM_SIZE: case KF_ARG_PTR_TO_CALLBACK: case KF_ARG_PTR_TO_REFCOUNTED_KPTR: + case KF_ARG_PTR_TO_CONST_STR: /* Trusted by default */ break; default: @@ -11004,7 +11659,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } break; case KF_ARG_PTR_TO_ALLOC_BTF_ID: - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { + if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) { + verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i); + return -EINVAL; + } + } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) { + if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { + verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i); + return -EINVAL; + } + } else { verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } @@ -11012,8 +11677,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - if (meta->btf == btf_vmlinux && - meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + if (meta->btf == btf_vmlinux) { meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; } @@ -11075,6 +11739,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; } case KF_ARG_PTR_TO_ITER: + if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) { + if (!check_css_task_iter_allowlist(env)) { + verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n"); + return -EINVAL; + } + } ret = process_iter_arg(env, regno, insn_idx, meta); if (ret < 0) return ret; @@ -11204,6 +11874,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; } case KF_ARG_PTR_TO_CALLBACK: + if (reg->type != PTR_TO_FUNC) { + verbose(env, "arg%d expected pointer to func\n", i); + return -EINVAL; + } meta->subprogno = reg->subprogno; break; case KF_ARG_PTR_TO_REFCOUNTED_KPTR: @@ -11228,6 +11902,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; break; + case KF_ARG_PTR_TO_CONST_STR: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a const string\n", i); + return -EINVAL; + } + ret = check_reg_const_str(env, reg, regno); + if (ret) + return ret; + break; } } @@ -11282,6 +11965,8 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, return 0; } +static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); + static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -11322,12 +12007,28 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + /* Check the arguments */ + err = check_kfunc_args(env, &meta, insn_idx); + if (err < 0) + return err; + + if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_rbtree_add_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); if (env->cur_state->active_rcu_lock) { struct bpf_func_state *state; struct bpf_reg_state *reg; + u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); @@ -11338,7 +12039,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); return -EINVAL; } else if (rcu_unlock) { - bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({ if (reg->type & MEM_RCU) { reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); reg->type |= PTR_UNTRUSTED; @@ -11356,10 +12057,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } - /* Check the arguments */ - err = check_kfunc_args(env, &meta, insn_idx); - if (err < 0) - return err; /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ @@ -11393,13 +12090,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_rbtree_add_callback_state); - if (err) { - verbose(env, "kfunc %s#%d failed callback verification\n", + if (meta.func_id == special_kfunc_list[KF_bpf_throw]) { + if (!bpf_jit_supports_exceptions()) { + verbose(env, "JIT does not support calling kfunc %s#%d\n", func_name, meta.func_id); - return err; + return -ENOTSUPP; + } + env->seen_exception = true; + + /* In the case of the default callback, the cookie value passed + * to bpf_throw becomes the return value of the program. + */ + if (!env->exception_callback_subprog) { + err = check_return_code(env, BPF_REG_1, "R1"); + if (err < 0) + return err; } } @@ -11413,6 +12118,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Only exception is bpf_obj_new_impl */ if (meta.btf != btf_vmlinux || (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] && + meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] && meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return -EINVAL; @@ -11426,11 +12132,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + struct btf_struct_meta *struct_meta; struct btf *ret_btf; u32 ret_btf_id; - if (unlikely(!bpf_global_ma_set)) + if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) return -ENOMEM; if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { @@ -11443,24 +12151,67 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* This may be NULL due to user not supplying a BTF */ if (!ret_btf) { - verbose(env, "bpf_obj_new requires prog BTF\n"); + verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); return -EINVAL; } ret_t = btf_type_by_id(ret_btf, ret_btf_id); if (!ret_t || !__btf_type_is_struct(ret_t)) { - verbose(env, "bpf_obj_new type ID argument must be of a struct\n"); + verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); return -EINVAL; } + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { + verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", + ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); + return -EINVAL; + } + + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + /* Charge memory allocated with bpf_global_percpu_ma to + * root memcg. The obj_cgroup for root memcg is NULL. + */ + err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + mutex_lock(&bpf_percpu_ma_lock); + err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { + verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); + return -EINVAL; + } + + if (struct_meta) { + verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); + return -EINVAL; + } + } + mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) + regs[BPF_REG_0].type |= MEM_PERCPU; insn_aux->obj_new_size = ret_t->size; - insn_aux->kptr_struct_meta = - btf_find_struct_meta(ret_btf, ret_btf_id); + insn_aux->kptr_struct_meta = struct_meta; } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; @@ -11597,7 +12348,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || + meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); @@ -12074,6 +12826,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } switch (base_type(ptr_reg->type)) { + case PTR_TO_FLOW_KEYS: + if (known) + break; + fallthrough; case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) @@ -13294,13 +14050,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* check dest operand */ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + err = err ?: adjust_reg_min_max_vals(env, insn); if (err) return err; - - return adjust_reg_min_max_vals(env, insn); } - return 0; + return reg_bounds_sanity_check(env, ®s[insn->dst_reg], "alu"); } static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, @@ -13382,153 +14137,130 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, })); } -static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) -{ - struct tnum subreg = tnum_subreg(reg->var_off); - s32 sval = (s32)val; +/* + * <reg1> <op> <reg2>, currently assuming reg2 is a constant + */ +static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) +{ + struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; + struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; + u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value; + u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value; + s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value; + s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value; + u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value; + u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value; + s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; + s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; switch (opcode) { case BPF_JEQ: - if (tnum_is_const(subreg)) - return !!tnum_equals_const(subreg, val); - else if (val < reg->u32_min_value || val > reg->u32_max_value) - return 0; - break; - case BPF_JNE: - if (tnum_is_const(subreg)) - return !tnum_equals_const(subreg, val); - else if (val < reg->u32_min_value || val > reg->u32_max_value) - return 1; - break; - case BPF_JSET: - if ((~subreg.mask & subreg.value) & val) - return 1; - if (!((subreg.mask | subreg.value) & val)) - return 0; - break; - case BPF_JGT: - if (reg->u32_min_value > val) - return 1; - else if (reg->u32_max_value <= val) - return 0; - break; - case BPF_JSGT: - if (reg->s32_min_value > sval) - return 1; - else if (reg->s32_max_value <= sval) - return 0; - break; - case BPF_JLT: - if (reg->u32_max_value < val) - return 1; - else if (reg->u32_min_value >= val) - return 0; - break; - case BPF_JSLT: - if (reg->s32_max_value < sval) - return 1; - else if (reg->s32_min_value >= sval) - return 0; - break; - case BPF_JGE: - if (reg->u32_min_value >= val) - return 1; - else if (reg->u32_max_value < val) - return 0; - break; - case BPF_JSGE: - if (reg->s32_min_value >= sval) - return 1; - else if (reg->s32_max_value < sval) + /* constants, umin/umax and smin/smax checks would be + * redundant in this case because they all should match + */ + if (tnum_is_const(t1) && tnum_is_const(t2)) + return t1.value == t2.value; + /* non-overlapping ranges */ + if (umin1 > umax2 || umax1 < umin2) return 0; - break; - case BPF_JLE: - if (reg->u32_max_value <= val) - return 1; - else if (reg->u32_min_value > val) - return 0; - break; - case BPF_JSLE: - if (reg->s32_max_value <= sval) - return 1; - else if (reg->s32_min_value > sval) - return 0; - break; - } - - return -1; -} - - -static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) -{ - s64 sval = (s64)val; - - switch (opcode) { - case BPF_JEQ: - if (tnum_is_const(reg->var_off)) - return !!tnum_equals_const(reg->var_off, val); - else if (val < reg->umin_value || val > reg->umax_value) + if (smin1 > smax2 || smax1 < smin2) return 0; + if (!is_jmp32) { + /* if 64-bit ranges are inconclusive, see if we can + * utilize 32-bit subrange knowledge to eliminate + * branches that can't be taken a priori + */ + if (reg1->u32_min_value > reg2->u32_max_value || + reg1->u32_max_value < reg2->u32_min_value) + return 0; + if (reg1->s32_min_value > reg2->s32_max_value || + reg1->s32_max_value < reg2->s32_min_value) + return 0; + } break; case BPF_JNE: - if (tnum_is_const(reg->var_off)) - return !tnum_equals_const(reg->var_off, val); - else if (val < reg->umin_value || val > reg->umax_value) + /* constants, umin/umax and smin/smax checks would be + * redundant in this case because they all should match + */ + if (tnum_is_const(t1) && tnum_is_const(t2)) + return t1.value != t2.value; + /* non-overlapping ranges */ + if (umin1 > umax2 || umax1 < umin2) return 1; + if (smin1 > smax2 || smax1 < smin2) + return 1; + if (!is_jmp32) { + /* if 64-bit ranges are inconclusive, see if we can + * utilize 32-bit subrange knowledge to eliminate + * branches that can't be taken a priori + */ + if (reg1->u32_min_value > reg2->u32_max_value || + reg1->u32_max_value < reg2->u32_min_value) + return 1; + if (reg1->s32_min_value > reg2->s32_max_value || + reg1->s32_max_value < reg2->s32_min_value) + return 1; + } break; case BPF_JSET: - if ((~reg->var_off.mask & reg->var_off.value) & val) + if (!is_reg_const(reg2, is_jmp32)) { + swap(reg1, reg2); + swap(t1, t2); + } + if (!is_reg_const(reg2, is_jmp32)) + return -1; + if ((~t1.mask & t1.value) & t2.value) return 1; - if (!((reg->var_off.mask | reg->var_off.value) & val)) + if (!((t1.mask | t1.value) & t2.value)) return 0; break; case BPF_JGT: - if (reg->umin_value > val) + if (umin1 > umax2) return 1; - else if (reg->umax_value <= val) + else if (umax1 <= umin2) return 0; break; case BPF_JSGT: - if (reg->smin_value > sval) + if (smin1 > smax2) return 1; - else if (reg->smax_value <= sval) + else if (smax1 <= smin2) return 0; break; case BPF_JLT: - if (reg->umax_value < val) + if (umax1 < umin2) return 1; - else if (reg->umin_value >= val) + else if (umin1 >= umax2) return 0; break; case BPF_JSLT: - if (reg->smax_value < sval) + if (smax1 < smin2) return 1; - else if (reg->smin_value >= sval) + else if (smin1 >= smax2) return 0; break; case BPF_JGE: - if (reg->umin_value >= val) + if (umin1 >= umax2) return 1; - else if (reg->umax_value < val) + else if (umax1 < umin2) return 0; break; case BPF_JSGE: - if (reg->smin_value >= sval) + if (smin1 >= smax2) return 1; - else if (reg->smax_value < sval) + else if (smax1 < smin2) return 0; break; case BPF_JLE: - if (reg->umax_value <= val) + if (umax1 <= umin2) return 1; - else if (reg->umin_value > val) + else if (umin1 > umax2) return 0; break; case BPF_JSLE: - if (reg->smax_value <= sval) + if (smax1 <= smin2) return 1; - else if (reg->smin_value > sval) + else if (smin1 > smax2) return 0; break; } @@ -13536,41 +14268,6 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return -1; } -/* compute branch direction of the expression "if (reg opcode val) goto target;" - * and return: - * 1 - branch will be taken and "goto target" will be executed - * 0 - branch will not be taken and fall-through to next insn - * -1 - unknown. Example: "if (reg < 5)" is unknown when register value - * range [0,10] - */ -static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, - bool is_jmp32) -{ - if (__is_pointer_value(false, reg)) { - if (!reg_not_null(reg)) - return -1; - - /* If pointer is valid tests against zero will fail so we can - * use this to direct branch taken. - */ - if (val != 0) - return -1; - - switch (opcode) { - case BPF_JEQ: - return 0; - case BPF_JNE: - return 1; - default: - return -1; - } - } - - if (is_jmp32) - return is_branch32_taken(reg, val, opcode); - return is_branch64_taken(reg, val, opcode); -} - static int flip_opcode(u32 opcode) { /* How can we transform "a <op> b" into "b <op> a"? */ @@ -13632,216 +14329,280 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, return -1; } -/* Adjusts the register min/max values in the case that the dst_reg is the - * variable register that we are working on, and src_reg is a constant or we're - * simply doing a BPF_K check. - * In JEQ/JNE cases we also adjust the var_off values. +/* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value + * range [0,10] */ -static void reg_set_min_max(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, - u64 val, u32 val32, - u8 opcode, bool is_jmp32) -{ - struct tnum false_32off = tnum_subreg(false_reg->var_off); - struct tnum false_64off = false_reg->var_off; - struct tnum true_32off = tnum_subreg(true_reg->var_off); - struct tnum true_64off = true_reg->var_off; - s64 sval = (s64)val; - s32 sval32 = (s32)val32; - - /* If the dst_reg is a pointer, we can't learn anything about its - * variable offset from the compare (unless src_reg were a pointer into - * the same object, but we don't bother with that. - * Since false_reg and true_reg have the same type by construction, we - * only need to check one of them for pointerness. - */ - if (__is_pointer_value(false, false_reg)) - return; +static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) +{ + if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32) + return is_pkt_ptr_branch_taken(reg1, reg2, opcode); + + if (__is_pointer_value(false, reg1) || __is_pointer_value(false, reg2)) { + u64 val; + + /* arrange that reg2 is a scalar, and reg1 is a pointer */ + if (!is_reg_const(reg2, is_jmp32)) { + opcode = flip_opcode(opcode); + swap(reg1, reg2); + } + /* and ensure that reg2 is a constant */ + if (!is_reg_const(reg2, is_jmp32)) + return -1; + + if (!reg_not_null(reg1)) + return -1; + + /* If pointer is valid tests against zero will fail so we can + * use this to direct branch taken. + */ + val = reg_const_value(reg2, is_jmp32); + if (val != 0) + return -1; + + switch (opcode) { + case BPF_JEQ: + return 0; + case BPF_JNE: + return 1; + default: + return -1; + } + } + /* now deal with two scalars, but not necessarily constants */ + return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32); +} + +/* Opcode that corresponds to a *false* branch condition. + * E.g., if r1 < r2, then reverse (false) condition is r1 >= r2 + */ +static u8 rev_opcode(u8 opcode) +{ switch (opcode) { - /* JEQ/JNE comparison doesn't change the register equivalence. - * - * r1 = r2; - * if (r1 == 42) goto label; - * ... - * label: // here both r1 and r2 are known to be 42. - * - * Hence when marking register as known preserve it's ID. + case BPF_JEQ: return BPF_JNE; + case BPF_JNE: return BPF_JEQ; + /* JSET doesn't have it's reverse opcode in BPF, so add + * BPF_X flag to denote the reverse of that operation */ + case BPF_JSET: return BPF_JSET | BPF_X; + case BPF_JSET | BPF_X: return BPF_JSET; + case BPF_JGE: return BPF_JLT; + case BPF_JGT: return BPF_JLE; + case BPF_JLE: return BPF_JGT; + case BPF_JLT: return BPF_JGE; + case BPF_JSGE: return BPF_JSLT; + case BPF_JSGT: return BPF_JSLE; + case BPF_JSLE: return BPF_JSGT; + case BPF_JSLT: return BPF_JSGE; + default: return 0; + } +} + +/* Refine range knowledge for <reg1> <op> <reg>2 conditional operation. */ +static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32) +{ + struct tnum t; + u64 val; + +again: + switch (opcode) { case BPF_JEQ: if (is_jmp32) { - __mark_reg32_known(true_reg, val32); - true_32off = tnum_subreg(true_reg->var_off); + reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); + reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); + reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); + reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); + reg2->u32_min_value = reg1->u32_min_value; + reg2->u32_max_value = reg1->u32_max_value; + reg2->s32_min_value = reg1->s32_min_value; + reg2->s32_max_value = reg1->s32_max_value; + + t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); + reg1->var_off = tnum_with_subreg(reg1->var_off, t); + reg2->var_off = tnum_with_subreg(reg2->var_off, t); } else { - ___mark_reg_known(true_reg, val); - true_64off = true_reg->var_off; + reg1->umin_value = max(reg1->umin_value, reg2->umin_value); + reg1->umax_value = min(reg1->umax_value, reg2->umax_value); + reg1->smin_value = max(reg1->smin_value, reg2->smin_value); + reg1->smax_value = min(reg1->smax_value, reg2->smax_value); + reg2->umin_value = reg1->umin_value; + reg2->umax_value = reg1->umax_value; + reg2->smin_value = reg1->smin_value; + reg2->smax_value = reg1->smax_value; + + reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off); + reg2->var_off = reg1->var_off; } break; case BPF_JNE: + if (!is_reg_const(reg2, is_jmp32)) + swap(reg1, reg2); + if (!is_reg_const(reg2, is_jmp32)) + break; + + /* try to recompute the bound of reg1 if reg2 is a const and + * is exactly the edge of reg1. + */ + val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { - __mark_reg32_known(false_reg, val32); - false_32off = tnum_subreg(false_reg->var_off); + /* u32_min_value is not equal to 0xffffffff at this point, + * because otherwise u32_max_value is 0xffffffff as well, + * in such a case both reg1 and reg2 would be constants, + * jump would be predicted and reg_set_min_max() won't + * be called. + * + * Same reasoning works for all {u,s}{min,max}{32,64} cases + * below. + */ + if (reg1->u32_min_value == (u32)val) + reg1->u32_min_value++; + if (reg1->u32_max_value == (u32)val) + reg1->u32_max_value--; + if (reg1->s32_min_value == (s32)val) + reg1->s32_min_value++; + if (reg1->s32_max_value == (s32)val) + reg1->s32_max_value--; } else { - ___mark_reg_known(false_reg, val); - false_64off = false_reg->var_off; + if (reg1->umin_value == (u64)val) + reg1->umin_value++; + if (reg1->umax_value == (u64)val) + reg1->umax_value--; + if (reg1->smin_value == (s64)val) + reg1->smin_value++; + if (reg1->smax_value == (s64)val) + reg1->smax_value--; } break; case BPF_JSET: + if (!is_reg_const(reg2, is_jmp32)) + swap(reg1, reg2); + if (!is_reg_const(reg2, is_jmp32)) + break; + val = reg_const_value(reg2, is_jmp32); + /* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X) + * requires single bit to learn something useful. E.g., if we + * know that `r1 & 0x3` is true, then which bits (0, 1, or both) + * are actually set? We can learn something definite only if + * it's a single-bit value to begin with. + * + * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have + * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor + * bit 1 is set, which we can readily use in adjustments. + */ + if (!is_power_of_2(val)) + break; if (is_jmp32) { - false_32off = tnum_and(false_32off, tnum_const(~val32)); - if (is_power_of_2(val32)) - true_32off = tnum_or(true_32off, - tnum_const(val32)); + t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val)); + reg1->var_off = tnum_with_subreg(reg1->var_off, t); } else { - false_64off = tnum_and(false_64off, tnum_const(~val)); - if (is_power_of_2(val)) - true_64off = tnum_or(true_64off, - tnum_const(val)); + reg1->var_off = tnum_or(reg1->var_off, tnum_const(val)); } break; - case BPF_JGE: - case BPF_JGT: - { + case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */ + if (!is_reg_const(reg2, is_jmp32)) + swap(reg1, reg2); + if (!is_reg_const(reg2, is_jmp32)) + break; + val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { - u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1; - u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32; - - false_reg->u32_max_value = min(false_reg->u32_max_value, - false_umax); - true_reg->u32_min_value = max(true_reg->u32_min_value, - true_umin); + t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val)); + reg1->var_off = tnum_with_subreg(reg1->var_off, t); } else { - u64 false_umax = opcode == BPF_JGT ? val : val - 1; - u64 true_umin = opcode == BPF_JGT ? val + 1 : val; - - false_reg->umax_value = min(false_reg->umax_value, false_umax); - true_reg->umin_value = max(true_reg->umin_value, true_umin); + reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val)); } break; - } - case BPF_JSGE: - case BPF_JSGT: - { + case BPF_JLE: if (is_jmp32) { - s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1; - s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32; - - false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax); - true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin); + reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); + reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); } else { - s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; - s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; - - false_reg->smax_value = min(false_reg->smax_value, false_smax); - true_reg->smin_value = max(true_reg->smin_value, true_smin); + reg1->umax_value = min(reg1->umax_value, reg2->umax_value); + reg2->umin_value = max(reg1->umin_value, reg2->umin_value); } break; - } - case BPF_JLE: case BPF_JLT: - { if (is_jmp32) { - u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1; - u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32; - - false_reg->u32_min_value = max(false_reg->u32_min_value, - false_umin); - true_reg->u32_max_value = min(true_reg->u32_max_value, - true_umax); + reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1); + reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value); } else { - u64 false_umin = opcode == BPF_JLT ? val : val + 1; - u64 true_umax = opcode == BPF_JLT ? val - 1 : val; - - false_reg->umin_value = max(false_reg->umin_value, false_umin); - true_reg->umax_value = min(true_reg->umax_value, true_umax); + reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1); + reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value); } break; - } case BPF_JSLE: + if (is_jmp32) { + reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); + reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); + } else { + reg1->smax_value = min(reg1->smax_value, reg2->smax_value); + reg2->smin_value = max(reg1->smin_value, reg2->smin_value); + } + break; case BPF_JSLT: - { if (is_jmp32) { - s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1; - s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32; - - false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin); - true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax); + reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1); + reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value); } else { - s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; - s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; - - false_reg->smin_value = max(false_reg->smin_value, false_smin); - true_reg->smax_value = min(true_reg->smax_value, true_smax); + reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1); + reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); } break; - } + case BPF_JGE: + case BPF_JGT: + case BPF_JSGE: + case BPF_JSGT: + /* just reuse LE/LT logic above */ + opcode = flip_opcode(opcode); + swap(reg1, reg2); + goto again; default: return; } - - if (is_jmp32) { - false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off), - tnum_subreg(false_32off)); - true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off), - tnum_subreg(true_32off)); - __reg_combine_32_into_64(false_reg); - __reg_combine_32_into_64(true_reg); - } else { - false_reg->var_off = false_64off; - true_reg->var_off = true_64off; - __reg_combine_64_into_32(false_reg); - __reg_combine_64_into_32(true_reg); - } } -/* Same as above, but for the case that dst_reg holds a constant and src_reg is - * the variable reg. +/* Adjusts the register min/max values in the case that the dst_reg and + * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K + * check, in which case we havea fake SCALAR_VALUE representing insn->imm). + * Technically we can do similar adjustments for pointers to the same object, + * but we don't support that right now. */ -static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, - u64 val, u32 val32, - u8 opcode, bool is_jmp32) +static int reg_set_min_max(struct bpf_verifier_env *env, + struct bpf_reg_state *true_reg1, + struct bpf_reg_state *true_reg2, + struct bpf_reg_state *false_reg1, + struct bpf_reg_state *false_reg2, + u8 opcode, bool is_jmp32) { - opcode = flip_opcode(opcode); - /* This uses zero as "not present in table"; luckily the zero opcode, - * BPF_JA, can't get here. + int err; + + /* If either register is a pointer, we can't learn anything about its + * variable offset from the compare (unless they were a pointer into + * the same object, but we don't bother with that). */ - if (opcode) - reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32); -} - -/* Regs are known to be equal, so intersect their min/max/var_off */ -static void __reg_combine_min_max(struct bpf_reg_state *src_reg, - struct bpf_reg_state *dst_reg) -{ - src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value, - dst_reg->umin_value); - src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value, - dst_reg->umax_value); - src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value, - dst_reg->smin_value); - src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value, - dst_reg->smax_value); - src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off, - dst_reg->var_off); - reg_bounds_sync(src_reg); - reg_bounds_sync(dst_reg); -} + if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) + return 0; -static void reg_combine_min_max(struct bpf_reg_state *true_src, - struct bpf_reg_state *true_dst, - struct bpf_reg_state *false_src, - struct bpf_reg_state *false_dst, - u8 opcode) -{ - switch (opcode) { - case BPF_JEQ: - __reg_combine_min_max(true_src, true_dst); - break; - case BPF_JNE: - __reg_combine_min_max(false_src, false_dst); - break; - } + /* fallthrough (FALSE) branch */ + regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32); + reg_bounds_sync(false_reg1); + reg_bounds_sync(false_reg2); + + /* jump (TRUE) branch */ + regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32); + reg_bounds_sync(true_reg1); + reg_bounds_sync(true_reg2); + + err = reg_bounds_sanity_check(env, true_reg1, "true_reg1"); + err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2"); + err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1"); + err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2"); + return err; } static void mark_ptr_or_null_reg(struct bpf_func_state *state, @@ -14039,6 +14800,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; + struct bpf_reg_state fake_reg = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -14079,42 +14841,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } + src_reg = &fake_reg; + src_reg->type = SCALAR_VALUE; + __mark_reg_known(src_reg, insn->imm); } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - - if (BPF_SRC(insn->code) == BPF_K) { - pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32); - } else if (src_reg->type == SCALAR_VALUE && - is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) { - pred = is_branch_taken(dst_reg, - tnum_subreg(src_reg->var_off).value, - opcode, - is_jmp32); - } else if (src_reg->type == SCALAR_VALUE && - !is_jmp32 && tnum_is_const(src_reg->var_off)) { - pred = is_branch_taken(dst_reg, - src_reg->var_off.value, - opcode, - is_jmp32); - } else if (dst_reg->type == SCALAR_VALUE && - is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) { - pred = is_branch_taken(src_reg, - tnum_subreg(dst_reg->var_off).value, - flip_opcode(opcode), - is_jmp32); - } else if (dst_reg->type == SCALAR_VALUE && - !is_jmp32 && tnum_is_const(dst_reg->var_off)) { - pred = is_branch_taken(src_reg, - dst_reg->var_off.value, - flip_opcode(opcode), - is_jmp32); - } else if (reg_is_pkt_pointer_any(dst_reg) && - reg_is_pkt_pointer_any(src_reg) && - !is_jmp32) { - pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode); - } - + pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because * above is_branch_taken() special cased the 0 comparison. @@ -14137,6 +14870,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, !sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx)) return -EFAULT; + if (env->log.level & BPF_LOG_LEVEL) + print_insn_state(env, this_branch->frame[this_branch->curframe]); *insn_idx += insn->off; return 0; } else if (pred == 0) { @@ -14149,6 +14884,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, *insn_idx + insn->off + 1, *insn_idx)) return -EFAULT; + if (env->log.level & BPF_LOG_LEVEL) + print_insn_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -14158,53 +14895,27 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EFAULT; other_branch_regs = other_branch->frame[other_branch->curframe]->regs; - /* detect if we are comparing against a constant value so we can adjust - * our min/max values for our dst register. - * this is only legit if both are scalars (or pointers to the same - * object, I suppose, see the PTR_MAYBE_NULL related if block below), - * because otherwise the different base pointers mean the offsets aren't - * comparable. - */ if (BPF_SRC(insn->code) == BPF_X) { - struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - - if (dst_reg->type == SCALAR_VALUE && - src_reg->type == SCALAR_VALUE) { - if (tnum_is_const(src_reg->var_off) || - (is_jmp32 && - tnum_is_const(tnum_subreg(src_reg->var_off)))) - reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, - src_reg->var_off.value, - tnum_subreg(src_reg->var_off).value, - opcode, is_jmp32); - else if (tnum_is_const(dst_reg->var_off) || - (is_jmp32 && - tnum_is_const(tnum_subreg(dst_reg->var_off)))) - reg_set_min_max_inv(&other_branch_regs[insn->src_reg], - src_reg, - dst_reg->var_off.value, - tnum_subreg(dst_reg->var_off).value, - opcode, is_jmp32); - else if (!is_jmp32 && - (opcode == BPF_JEQ || opcode == BPF_JNE)) - /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch_regs[insn->src_reg], - &other_branch_regs[insn->dst_reg], - src_reg, dst_reg, opcode); - if (src_reg->id && - !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - find_equal_scalars(this_branch, src_reg); - find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); - } - - } - } else if (dst_reg->type == SCALAR_VALUE) { - reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, (u32)insn->imm, - opcode, is_jmp32); + err = reg_set_min_max(env, + &other_branch_regs[insn->dst_reg], + &other_branch_regs[insn->src_reg], + dst_reg, src_reg, opcode, is_jmp32); + } else /* BPF_SRC(insn->code) == BPF_K */ { + err = reg_set_min_max(env, + &other_branch_regs[insn->dst_reg], + src_reg /* fake one */, + dst_reg, src_reg /* same fake one */, + opcode, is_jmp32); } + if (err) + return err; + if (BPF_SRC(insn->code) == BPF_X && + src_reg->type == SCALAR_VALUE && src_reg->id && + !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { + find_equal_scalars(this_branch, src_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { find_equal_scalars(this_branch, dst_reg); @@ -14427,7 +15138,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * gen_ld_abs() may terminate the program at runtime, leading to * reference leak. */ - err = check_reference_leak(env); + err = check_reference_leak(env, false); if (err) { verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n"); return err; @@ -14476,19 +15187,20 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static int check_return_code(struct bpf_verifier_env *env) +static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) { + const char *exit_ctx = "At program exit"; struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; - struct tnum range = tnum_range(0, 1); + struct bpf_retval_range range = retval_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; struct bpf_func_state *frame = env->cur_state->frame[0]; const bool is_subprog = frame->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if (!is_subprog) { + if (!is_subprog || frame->in_exception_callback_fn) { switch (prog_type) { case BPF_PROG_TYPE_LSM: if (prog->expected_attach_type == BPF_LSM_CGROUP) @@ -14510,36 +15222,28 @@ static int check_return_code(struct bpf_verifier_env *env) * of bpf_exit, which means that program wrote * something into it earlier */ - err = check_reg_arg(env, BPF_REG_0, SRC_OP); + err = check_reg_arg(env, regno, SRC_OP); if (err) return err; - if (is_pointer_value(env, BPF_REG_0)) { - verbose(env, "R0 leaks addr as return value\n"); + if (is_pointer_value(env, regno)) { + verbose(env, "R%d leaks addr as return value\n", regno); return -EACCES; } - reg = cur_regs(env) + BPF_REG_0; + reg = cur_regs(env) + regno; if (frame->in_async_callback_fn) { /* enforce return zero from async callbacks like timer */ - if (reg->type != SCALAR_VALUE) { - verbose(env, "In async callback the register R0 is not a known value (%s)\n", - reg_type_str(env, reg->type)); - return -EINVAL; - } - - if (!tnum_in(tnum_const(0), reg->var_off)) { - verbose_invalid_scalar(env, reg, &range, "async callback", "R0"); - return -EINVAL; - } - return 0; + exit_ctx = "At async callback return"; + range = retval_range(0, 0); + goto enforce_retval; } - if (is_subprog) { + if (is_subprog && !frame->in_exception_callback_fn) { if (reg->type != SCALAR_VALUE) { - verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", - reg_type_str(env, reg->type)); + verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n", + regno, reg_type_str(env, reg->type)); return -EINVAL; } return 0; @@ -14549,18 +15253,21 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME || env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) - range = tnum_range(1, 1); + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME) + range = retval_range(1, 1); if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) - range = tnum_range(0, 3); + range = retval_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { - range = tnum_range(0, 3); + range = retval_range(0, 3); enforce_attach_type_range = tnum_range(2, 3); } break; @@ -14573,13 +15280,13 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_RAW_TRACEPOINT: if (!env->prog->aux->attach_btf_id) return 0; - range = tnum_const(0); + range = retval_range(0, 0); break; case BPF_PROG_TYPE_TRACING: switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - range = tnum_const(0); + range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: @@ -14591,7 +15298,7 @@ static int check_return_code(struct bpf_verifier_env *env) } break; case BPF_PROG_TYPE_SK_LOOKUP: - range = tnum_range(SK_DROP, SK_PASS); + range = retval_range(SK_DROP, SK_PASS); break; case BPF_PROG_TYPE_LSM: @@ -14605,12 +15312,12 @@ static int check_return_code(struct bpf_verifier_env *env) /* Make sure programs that attach to void * hooks don't try to modify return value. */ - range = tnum_range(1, 1); + range = retval_range(1, 1); } break; case BPF_PROG_TYPE_NETFILTER: - range = tnum_range(NF_DROP, NF_ACCEPT); + range = retval_range(NF_DROP, NF_ACCEPT); break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value @@ -14620,15 +15327,21 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; } +enforce_retval: if (reg->type != SCALAR_VALUE) { - verbose(env, "At program exit the register R0 is not a known value (%s)\n", - reg_type_str(env, reg->type)); + verbose(env, "%s the register R%d is not a known value (%s)\n", + exit_ctx, regno, reg_type_str(env, reg->type)); return -EINVAL; } - if (!tnum_in(range, reg->var_off)) { - verbose_invalid_scalar(env, reg, &range, "program exit", "R0"); - if (prog->expected_attach_type == BPF_LSM_CGROUP && + err = mark_chain_precision(env, regno); + if (err) + return err; + + if (!retval_range_within(range, reg)) { + verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); + if (!is_subprog && + prog->expected_attach_type == BPF_LSM_CGROUP && prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); @@ -14681,21 +15394,6 @@ enum { BRANCH = 2, }; -static u32 state_htab_size(struct bpf_verifier_env *env) -{ - return env->prog->len; -} - -static struct bpf_verifier_state_list **explored_state( - struct bpf_verifier_env *env, - int idx) -{ - struct bpf_verifier_state *cur = env->cur_state; - struct bpf_func_state *state = cur->frame[cur->curframe]; - - return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; -} - static void mark_prune_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].prune_point = true; @@ -14716,6 +15414,15 @@ static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) return env->insn_aux_data[insn_idx].force_checkpoint; } +static void mark_calls_callback(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].calls_callback = true; +} + +static bool calls_callback(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].calls_callback; +} enum { DONE_EXPLORING = 0, @@ -14727,8 +15434,7 @@ enum { * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, - bool loop_ok) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; @@ -14760,7 +15466,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return KEEP_EXPLORING; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->bpf_capable) + if (env->bpf_capable) return DONE_EXPLORING; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -14780,24 +15486,20 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { - int ret; + int ret, insn_sz; - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); if (ret) return ret; - mark_prune_point(env, t + 1); + mark_prune_point(env, t + insn_sz); /* when we exit from subprog, we need to record non-linear history */ - mark_jmp_point(env, t + 1); + mark_jmp_point(env, t + insn_sz); if (visit_callee) { mark_prune_point(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, - /* It's ok to allow recursion from CFG point of - * view. __check_func_call() will do the actual - * check. - */ - bpf_pseudo_func(insns + t)); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); } return ret; } @@ -14810,15 +15512,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; - int ret, off; + int ret, off, insn_sz; if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insn->code) != BPF_JMP && - BPF_CLASS(insn->code) != BPF_JMP32) - return push_insn(t, t + 1, FALLTHROUGH, env, false); + BPF_CLASS(insn->code) != BPF_JMP32) { + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + return push_insn(t, t + insn_sz, FALLTHROUGH, env); + } switch (BPF_OP(insn->code)) { case BPF_EXIT: @@ -14832,6 +15536,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env) * async state will be pushed for further exploration. */ mark_prune_point(env, t); + /* For functions that invoke callbacks it is not known how many times + * callback would be called. Verifier models callback calling functions + * by repeatedly visiting callback bodies and returning to origin call + * instruction. + * In order to stop such iteration verifier needs to identify when a + * state identical some state from a previous iteration is reached. + * Check below forces creation of checkpoint before callback calling + * instruction to allow search for such identical states. + */ + if (is_sync_callback_calling_insn(insn)) { + mark_calls_callback(env, t); + mark_force_checkpoint(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); + } if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; @@ -14864,8 +15583,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) off = insn->imm; /* unconditional jump with single edge */ - ret = push_insn(t, t + off + 1, FALLTHROUGH, env, - true); + ret = push_insn(t, t + off + 1, FALLTHROUGH, env); if (ret) return ret; @@ -14878,11 +15596,11 @@ static int visit_insn(int t, struct bpf_verifier_env *env) /* conditional jump with two edges */ mark_prune_point(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env, true); + ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret) return ret; - return push_insn(t, t + insn->off + 1, BRANCH, env, true); + return push_insn(t, t + insn->off + 1, BRANCH, env); } } @@ -14893,8 +15611,8 @@ static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; int *insn_stack, *insn_state; - int ret = 0; - int i; + int ex_insn_beg, i, ret = 0; + bool ex_done = false; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -14910,6 +15628,7 @@ static int check_cfg(struct bpf_verifier_env *env) insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; +walk_cfg: while (env->cfg.cur_stack > 0) { int t = insn_stack[env->cfg.cur_stack - 1]; @@ -14936,12 +15655,32 @@ static int check_cfg(struct bpf_verifier_env *env) goto err_free; } + if (env->exception_callback_subprog && !ex_done) { + ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start; + + insn_state[ex_insn_beg] = DISCOVERED; + insn_stack[0] = ex_insn_beg; + env->cfg.cur_stack = 1; + ex_done = true; + goto walk_cfg; + } + for (i = 0; i < insn_cnt; i++) { + struct bpf_insn *insn = &env->prog->insnsi[i]; + if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } + if (bpf_is_ldimm64(insn)) { + if (insn_state[i + 1] != 0) { + verbose(env, "jump into the middle of ldimm64 insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + i++; /* skip second half of ldimm64 */ + } } ret = 0; /* cfg looks good */ @@ -14973,20 +15712,18 @@ static int check_abnormal_return(struct bpf_verifier_env *env) #define MIN_BPF_FUNCINFO_SIZE 8 #define MAX_FUNCINFO_REC_SIZE 252 -static int check_btf_func(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) +static int check_btf_func_early(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) { - const struct btf_type *type, *func_proto, *ret_type; - u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); + const struct btf_type *type, *func_proto; + u32 i, nfuncs, urec_size, min_size; struct bpf_func_info *krecord; - struct bpf_func_info_aux *info_aux = NULL; struct bpf_prog *prog; const struct btf *btf; - bpfptr_t urecord; u32 prev_offset = 0; - bool scalar_return; + bpfptr_t urecord; int ret = -ENOMEM; nfuncs = attr->func_info_cnt; @@ -14996,11 +15733,6 @@ static int check_btf_func(struct bpf_verifier_env *env, return 0; } - if (nfuncs != env->subprog_cnt) { - verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); - return -EINVAL; - } - urec_size = attr->func_info_rec_size; if (urec_size < MIN_BPF_FUNCINFO_SIZE || urec_size > MAX_FUNCINFO_REC_SIZE || @@ -15018,9 +15750,6 @@ static int check_btf_func(struct bpf_verifier_env *env, krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); if (!krecord) return -ENOMEM; - info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); - if (!info_aux) - goto err_free; for (i = 0; i < nfuncs; i++) { ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); @@ -15059,11 +15788,6 @@ static int check_btf_func(struct bpf_verifier_env *env, goto err_free; } - if (env->subprog_info[i].start != krecord[i].insn_off) { - verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); - goto err_free; - } - /* check type_id */ type = btf_type_by_id(btf, krecord[i].type_id); if (!type || !btf_type_is_func(type)) { @@ -15071,12 +15795,77 @@ static int check_btf_func(struct bpf_verifier_env *env, krecord[i].type_id); goto err_free; } - info_aux[i].linkage = BTF_INFO_VLEN(type->info); func_proto = btf_type_by_id(btf, type->type); if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) /* btf_func_check() already verified it during BTF load */ goto err_free; + + prev_offset = krecord[i].insn_off; + bpfptr_add(&urecord, urec_size); + } + + prog->aux->func_info = krecord; + prog->aux->func_info_cnt = nfuncs; + return 0; + +err_free: + kvfree(krecord); + return ret; +} + +static int check_btf_func(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + const struct btf_type *type, *func_proto, *ret_type; + u32 i, nfuncs, urec_size; + struct bpf_func_info *krecord; + struct bpf_func_info_aux *info_aux = NULL; + struct bpf_prog *prog; + const struct btf *btf; + bpfptr_t urecord; + bool scalar_return; + int ret = -ENOMEM; + + nfuncs = attr->func_info_cnt; + if (!nfuncs) { + if (check_abnormal_return(env)) + return -EINVAL; + return 0; + } + if (nfuncs != env->subprog_cnt) { + verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); + return -EINVAL; + } + + urec_size = attr->func_info_rec_size; + + prog = env->prog; + btf = prog->aux->btf; + + urecord = make_bpfptr(attr->func_info, uattr.is_kernel); + + krecord = prog->aux->func_info; + info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); + if (!info_aux) + return -ENOMEM; + + for (i = 0; i < nfuncs; i++) { + /* check insn_off */ + ret = -EINVAL; + + if (env->subprog_info[i].start != krecord[i].insn_off) { + verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); + goto err_free; + } + + /* Already checked type_id */ + type = btf_type_by_id(btf, krecord[i].type_id); + info_aux[i].linkage = BTF_INFO_VLEN(type->info); + /* Already checked func_proto */ + func_proto = btf_type_by_id(btf, type->type); + ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); scalar_return = btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); @@ -15089,17 +15878,13 @@ static int check_btf_func(struct bpf_verifier_env *env, goto err_free; } - prev_offset = krecord[i].insn_off; bpfptr_add(&urecord, urec_size); } - prog->aux->func_info = krecord; - prog->aux->func_info_cnt = nfuncs; prog->aux->func_info_aux = info_aux; return 0; err_free: - kvfree(krecord); kfree(info_aux); return ret; } @@ -15112,7 +15897,8 @@ static void adjust_btf_func(struct bpf_verifier_env *env) if (!aux->func_info) return; - for (i = 0; i < env->subprog_cnt; i++) + /* func_info is not available for hidden subprogs */ + for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++) aux->func_info[i].insn_off = env->subprog_info[i].start; } @@ -15316,9 +16102,9 @@ static int check_core_relo(struct bpf_verifier_env *env, return err; } -static int check_btf_info(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) +static int check_btf_info_early(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) { struct btf *btf; int err; @@ -15338,6 +16124,24 @@ static int check_btf_info(struct bpf_verifier_env *env, } env->prog->aux->btf = btf; + err = check_btf_func_early(env, attr, uattr); + if (err) + return err; + return 0; +} + +static int check_btf_info(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + int err; + + if (!attr->func_info_cnt && !attr->line_info_cnt) { + if (check_abnormal_return(env)) + return -EINVAL; + return 0; + } + err = check_btf_func(env, attr, uattr); if (err) return err; @@ -15496,18 +16300,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { struct bpf_verifier_state_list *sl; - int i; sl = *explored_state(env, insn); while (sl) { if (sl->state.branches) goto next; if (sl->state.insn_idx != insn || - sl->state.curframe != cur->curframe) + !same_callsites(&sl->state, cur)) goto next; - for (i = 0; i <= cur->curframe; i++) - if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) - goto next; clean_verifier_state(env, &sl->state); next: sl = sl->next; @@ -15525,8 +16325,11 @@ static bool regs_exact(const struct bpf_reg_state *rold, /* Returns true if (rold safe implies rcur safe) */ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, struct bpf_idmap *idmap) + struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact) { + if (exact) + return regs_exact(rold, rcur, idmap); + if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ return true; @@ -15643,7 +16446,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, struct bpf_idmap *idmap) + struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact) { int i, spi; @@ -15656,7 +16459,12 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; - if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) { + if (exact && + old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + return false; + + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) { i += BPF_REG_SIZE - 1; /* explored state didn't use this */ continue; @@ -15706,7 +16514,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, * return false to continue verification of this path */ if (!regsafe(env, &old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, idmap)) + &cur->stack[spi].spilled_ptr, idmap, exact)) return false; break; case STACK_DYNPTR: @@ -15788,16 +16596,16 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, * the current state will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur) + struct bpf_func_state *cur, bool exact) { int i; for (i = 0; i < MAX_BPF_REG; i++) if (!regsafe(env, &old->regs[i], &cur->regs[i], - &env->idmap_scratch)) + &env->idmap_scratch, exact)) return false; - if (!stacksafe(env, old, cur, &env->idmap_scratch)) + if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) return false; if (!refsafe(old, cur, &env->idmap_scratch)) @@ -15806,17 +16614,23 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat return true; } +static void reset_idmap_scratch(struct bpf_verifier_env *env) +{ + env->idmap_scratch.tmp_id_gen = env->id_gen; + memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); +} + static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) + struct bpf_verifier_state *cur, + bool exact) { int i; if (old->curframe != cur->curframe) return false; - env->idmap_scratch.tmp_id_gen = env->id_gen; - memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); + reset_idmap_scratch(env); /* Verification state from speculative execution simulation * must never prune a non-speculative execution one. @@ -15846,7 +16660,7 @@ static bool states_equal(struct bpf_verifier_env *env, for (i = 0; i <= old->curframe; i++) { if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(env, old->frame[i], cur->frame[i])) + if (!func_states_equal(env, old->frame[i], cur->frame[i], exact)) return false; } return true; @@ -16100,10 +16914,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl, **pprev; - struct bpf_verifier_state *cur = env->cur_state, *new; - int i, j, err, states_cnt = 0; + struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; + int i, j, n, err, states_cnt = 0; bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); bool add_new_state = force_new_state; + bool force_exact; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -16156,9 +16971,33 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * It's safe to assume that iterator loop will finish, taking into * account iter_next() contract of eventually returning * sticky NULL result. + * + * Note, that states have to be compared exactly in this case because + * read and precision marks might not be finalized inside the loop. + * E.g. as in the program below: + * + * 1. r7 = -16 + * 2. r6 = bpf_get_prandom_u32() + * 3. while (bpf_iter_num_next(&fp[-8])) { + * 4. if (r6 != 42) { + * 5. r7 = -32 + * 6. r6 = bpf_get_prandom_u32() + * 7. continue + * 8. } + * 9. r0 = r10 + * 10. r0 += r7 + * 11. r8 = *(u64 *)(r0 + 0) + * 12. r6 = bpf_get_prandom_u32() + * 13. } + * + * Here verifier would first visit path 1-3, create a checkpoint at 3 + * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does + * not have read or precision mark for r7 yet, thus inexact states + * comparison would discard current state with r7=-32 + * => unsafe memory access at 11 would not be caught. */ if (is_iter_next_insn(env, insn_idx)) { - if (states_equal(env, &sl->state, cur)) { + if (states_equal(env, &sl->state, cur, true)) { struct bpf_func_state *cur_frame; struct bpf_reg_state *iter_state, *iter_reg; int spi; @@ -16174,17 +17013,29 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) */ spi = __get_spi(iter_reg->off + iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; - if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) + if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { + update_loop_entry(cur, &sl->state); goto hit; + } } goto skip_inf_loop_check; } + if (calls_callback(env, insn_idx)) { + if (states_equal(env, &sl->state, cur, true)) + goto hit; + goto skip_inf_loop_check; + } /* attempt to detect infinite loop to avoid unnecessary doomed work */ if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur) && - !iter_active_depths_differ(&sl->state, cur)) { + states_equal(env, &sl->state, cur, false) && + !iter_active_depths_differ(&sl->state, cur) && + sl->state.callback_unroll_depth == cur->callback_unroll_depth) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); + verbose(env, "cur state:"); + print_verifier_state(env, cur->frame[cur->curframe], true); + verbose(env, "old state:"); + print_verifier_state(env, sl->state.frame[cur->curframe], true); return -EINVAL; } /* if the verifier is processing a loop, avoid adding new state @@ -16206,7 +17057,36 @@ skip_inf_loop_check: add_new_state = false; goto miss; } - if (states_equal(env, &sl->state, cur)) { + /* If sl->state is a part of a loop and this loop's entry is a part of + * current verification path then states have to be compared exactly. + * 'force_exact' is needed to catch the following case: + * + * initial Here state 'succ' was processed first, + * | it was eventually tracked to produce a + * V state identical to 'hdr'. + * .---------> hdr All branches from 'succ' had been explored + * | | and thus 'succ' has its .branches == 0. + * | V + * | .------... Suppose states 'cur' and 'succ' correspond + * | | | to the same instruction + callsites. + * | V V In such case it is necessary to check + * | ... ... if 'succ' and 'cur' are states_equal(). + * | | | If 'succ' and 'cur' are a part of the + * | V V same loop exact flag has to be set. + * | succ <- cur To check if that is the case, verify + * | | if loop entry of 'succ' is in current + * | V DFS path. + * | ... + * | | + * '----' + * + * Additional details are in the comment before get_loop_entry(). + */ + loop_entry = get_loop_entry(&sl->state); + force_exact = loop_entry && loop_entry->branches > 0; + if (states_equal(env, &sl->state, cur, force_exact)) { + if (force_exact) + update_loop_entry(cur, loop_entry); hit: sl->hit_cnt++; /* reached equivalent register/stack state, @@ -16226,7 +17106,8 @@ hit: * the precision needs to be propagated back in * the current state. */ - err = err ? : push_jmp_history(env, cur); + if (is_jmp_point(env, env->insn_idx)) + err = err ? : push_jmp_history(env, cur, 0); err = err ? : propagate_precision(env, &sl->state); if (err) return err; @@ -16245,13 +17126,18 @@ miss: * to keep checking from state equivalence point of view. * Higher numbers increase max_states_per_insn and verification time, * but do not meaningfully decrease insn_processed. + * 'n' controls how many times state could miss before eviction. + * Use bigger 'n' for checkpoints because evicting checkpoint states + * too early would hinder iterator convergence. */ - if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { + n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3; + if (sl->miss_cnt > sl->hit_cnt * n + n) { /* the state is unlikely to be useful. Remove it to * speed up verification */ *pprev = sl->next; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE && + !sl->state.used_as_loop_entry) { u32 br = sl->state.branches; WARN_ONCE(br, @@ -16320,6 +17206,7 @@ next: cur->parent = new; cur->first_insn_idx = insn_idx; + cur->dfs_depth = new->dfs_depth + 1; clear_jmp_history(cur); new_sl->next = *explored_state(env, insn_idx); *explored_state(env, insn_idx) = new_sl; @@ -16440,10 +17327,14 @@ static int do_check(struct bpf_verifier_env *env) int prev_insn_idx = -1; for (;;) { + bool exception_exit = false; struct bpf_insn *insn; u8 class; int err; + /* reset current history entry on each new instruction */ + env->cur_hist_ent = NULL; + env->prev_insn_idx = prev_insn_idx; if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", @@ -16483,7 +17374,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_jmp_history(env, state); + err = push_jmp_history(env, state, 0); if (err) return err; } @@ -16560,10 +17451,8 @@ static int do_check(struct bpf_verifier_env *env) insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, false, BPF_MODE(insn->code) == BPF_MEMSX); - if (err) - return err; - - err = save_aux_ptr_type(env, src_reg_type, true); + err = err ?: save_aux_ptr_type(env, src_reg_type, true); + err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], "ldx"); if (err) return err; } else if (class == BPF_STX) { @@ -16654,12 +17543,17 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } } - if (insn->src_reg == BPF_PSEUDO_CALL) + if (insn->src_reg == BPF_PSEUDO_CALL) { err = check_func_call(env, insn, &env->insn_idx); - else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { err = check_kfunc_call(env, insn, &env->insn_idx); - else + if (!err && is_bpf_throw_kfunc(insn)) { + exception_exit = true; + goto process_bpf_exit_full; + } + } else { err = check_helper_call(env, insn, &env->insn_idx); + } if (err) return err; @@ -16689,7 +17583,7 @@ static int do_check(struct bpf_verifier_env *env) verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } - +process_bpf_exit_full: if (env->cur_state->active_lock.ptr && !in_rbtree_lock_required_cb(env)) { verbose(env, "bpf_spin_unlock is missing\n"); @@ -16708,10 +17602,23 @@ static int do_check(struct bpf_verifier_env *env) * function, for which reference_state must * match caller reference state when it exits. */ - err = check_reference_leak(env); + err = check_reference_leak(env, exception_exit); if (err) return err; + /* The side effect of the prepare_func_exit + * which is being skipped is that it frees + * bpf_func_state. Typically, process_bpf_exit + * will only be hit with outermost exit. + * copy_verifier_state in pop_stack will handle + * freeing of any extra bpf_func_state left over + * from not processing all nested function + * exits. We also skip return code checks as + * they are not needed for exceptional exits. + */ + if (exception_exit) + goto process_bpf_exit; + if (state->curframe) { /* exit from nested function */ err = prepare_func_exit(env, &env->insn_idx); @@ -16721,7 +17628,7 @@ static int do_check(struct bpf_verifier_env *env) continue; } - err = check_return_code(env); + err = check_return_code(env, BPF_REG_0, "R0"); if (err) return err; process_bpf_exit: @@ -17182,10 +18089,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -E2BIG; } + if (env->prog->aux->sleepable) + atomic64_inc(&map->sleepable_refcnt); /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded - * and all maps are released in free_used_maps() + * and all maps are released in bpf_free_used_maps() */ bpf_map_inc(map); @@ -18014,6 +18923,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) } func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; + func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; + if (!i) + func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -18053,7 +18965,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) * the call instruction, as an index for this list */ func[i]->aux->func = func; - func[i]->aux->func_cnt = env->subprog_cnt; + func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; + func[i]->aux->real_func_cnt = env->subprog_cnt; } for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func; @@ -18099,7 +19012,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->aux->extable = func[0]->aux->extable; prog->aux->num_exentries = func[0]->aux->num_exentries; prog->aux->func = func; - prog->aux->func_cnt = env->subprog_cnt; + prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; + prog->aux->real_func_cnt = env->subprog_cnt; + prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func; + prog->aux->exception_boundary = func[0]->aux->exception_boundary; bpf_prog_jit_attempt_done(prog); return 0; out_free: @@ -18266,21 +19182,35 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn->imm = BPF_CALL_IMM(desc->addr); if (insn->off) return 0; - if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; + if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) { + verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", + insn_idx); + return -EFAULT; + } + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size); insn_buf[1] = addr[0]; insn_buf[2] = addr[1]; insn_buf[3] = *insn; *cnt = 4; } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || + desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] || desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; + if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) { + verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", + insn_idx); + return -EFAULT; + } + if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && !kptr_struct_meta) { verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", @@ -18321,6 +19251,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } +/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */ +static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len) +{ + struct bpf_subprog_info *info = env->subprog_info; + int cnt = env->subprog_cnt; + struct bpf_prog *prog; + + /* We only reserve one slot for hidden subprogs in subprog_info. */ + if (env->hidden_subprog_cnt) { + verbose(env, "verifier internal error: only one hidden subprog supported\n"); + return -EFAULT; + } + /* We're not patching any existing instruction, just appending the new + * ones for the hidden subprog. Hence all of the adjustment operations + * in bpf_patch_insn_data are no-ops. + */ + prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len); + if (!prog) + return -ENOMEM; + env->prog = prog; + info[cnt + 1].start = info[cnt].start; + info[cnt].start = prog->len - len + 1; + env->subprog_cnt++; + env->hidden_subprog_cnt++; + return 0; +} + /* Do various post-verification rewrites in a single program pass. * These rewrites simplify JIT and interpreter implementations. */ @@ -18339,6 +19296,24 @@ static int do_misc_fixups(struct bpf_verifier_env *env) struct bpf_map *map_ptr; int i, ret, cnt, delta = 0; + if (env->seen_exception && !env->exception_callback_subprog) { + struct bpf_insn patch[] = { + env->prog->insnsi[insn_cnt - 1], + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }; + + ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch)); + if (ret < 0) + return ret; + prog = env->prog; + insn = prog->insnsi; + + env->exception_callback_subprog = env->subprog_cnt - 1; + /* Don't update insn_cnt, as add_hidden_subprog always appends insns */ + mark_subprog_exc_cb(env, env->exception_callback_subprog); + } + for (i = 0; i < insn_cnt; i++, insn++) { /* Make divide-by-zero exceptions impossible. */ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || @@ -18608,6 +19583,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } + /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */ + if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) { + /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data, + * bpf_mem_alloc() returns a ptr to the percpu data ptr. + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + insn_buf[1] = *insn; + cnt = 2; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. @@ -19020,6 +20014,7 @@ static void free_states(struct bpf_verifier_env *env) static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); + struct bpf_subprog_info *sub = subprog_info(env, subprog); struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -19046,40 +20041,71 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) state->first_insn_idx = env->subprog_info[subprog].start; state->last_insn_idx = -1; + regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { - ret = btf_prepare_func_args(env, subprog, regs); + const char *sub_name = subprog_name(env, subprog); + struct bpf_subprog_arg_info *arg; + struct bpf_reg_state *reg; + + verbose(env, "Validating %s() func#%d...\n", sub_name, subprog); + ret = btf_prepare_func_args(env, subprog); if (ret) goto out; - for (i = BPF_REG_1; i <= BPF_REG_5; i++) { - if (regs[i].type == PTR_TO_CTX) + + if (subprog_is_exc_cb(env, subprog)) { + state->frame[0]->in_exception_callback_fn = true; + /* We have already ensured that the callback returns an integer, just + * like all global subprogs. We need to determine it only has a single + * scalar argument. + */ + if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) { + verbose(env, "exception cb only supports single integer argument\n"); + ret = -EINVAL; + goto out; + } + } + for (i = BPF_REG_1; i <= sub->arg_cnt; i++) { + arg = &sub->args[i - BPF_REG_1]; + reg = ®s[i]; + + if (arg->arg_type == ARG_PTR_TO_CTX) { + reg->type = PTR_TO_CTX; mark_reg_known_zero(env, regs, i); - else if (regs[i].type == SCALAR_VALUE) + } else if (arg->arg_type == ARG_ANYTHING) { + reg->type = SCALAR_VALUE; mark_reg_unknown(env, regs, i); - else if (base_type(regs[i].type) == PTR_TO_MEM) { - const u32 mem_size = regs[i].mem_size; - + } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { + /* assume unspecial LOCAL dynptr type */ + __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); + } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { + reg->type = PTR_TO_MEM; + if (arg->arg_type & PTR_MAYBE_NULL) + reg->type |= PTR_MAYBE_NULL; mark_reg_known_zero(env, regs, i); - regs[i].mem_size = mem_size; - regs[i].id = ++env->id_gen; + reg->mem_size = arg->mem_size; + reg->id = ++env->id_gen; + } else { + WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n", + i - BPF_REG_1, arg->arg_type); + ret = -EFAULT; + goto out; } } } else { + /* if main BPF program has associated BTF info, validate that + * it's matching expected signature, and otherwise mark BTF + * info for main program as unreliable + */ + if (env->prog->aux->func_info_aux) { + ret = btf_prepare_func_args(env, 0); + if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) + env->prog->aux->func_info_aux[0].unreliable = true; + } + /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; mark_reg_known_zero(env, regs, BPF_REG_1); - ret = btf_check_subprog_arg_match(env, subprog, regs); - if (ret == -EFAULT) - /* unlikely verifier bug. abort. - * ret == 0 and ret < 0 are sadly acceptable for - * main() function due to backward compatibility. - * Like socket filter program may be written as: - * int bpf_prog(struct pt_regs *ctx) - * and never dereference that ctx in the program. - * 'struct pt_regs' is a type mismatch for socket - * filter that should be using 'struct __sk_buff'. - */ - goto out; } ret = do_check(env); @@ -19098,8 +20124,11 @@ out: return ret; } -/* Verify all global functions in a BPF program one by one based on their BTF. - * All global functions must pass verification. Otherwise the whole program is rejected. +/* Lazily verify all global functions based on their BTF, if they are called + * from main BPF program or any of subprograms transitively. + * BPF global subprogs called from dead code are not validated. + * All callable global functions must pass verification. + * Otherwise the whole program is rejected. * Consider: * int bar(int); * int foo(int f) @@ -19118,25 +20147,50 @@ out: static int do_check_subprogs(struct bpf_verifier_env *env) { struct bpf_prog_aux *aux = env->prog->aux; - int i, ret; + struct bpf_func_info_aux *sub_aux; + int i, ret, new_cnt; if (!aux->func_info) return 0; + /* exception callback is presumed to be always called */ + if (env->exception_callback_subprog) + subprog_aux(env, env->exception_callback_subprog)->called = true; + +again: + new_cnt = 0; for (i = 1; i < env->subprog_cnt; i++) { - if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL) + if (!subprog_is_global(env, i)) + continue; + + sub_aux = subprog_aux(env, i); + if (!sub_aux->called || sub_aux->verified) continue; + env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); ret = do_check_common(env, i); if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { - verbose(env, - "Func#%d is safe for any args that match its prototype\n", - i); + verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n", + i, subprog_name(env, i)); } + + /* We verified new global subprog, it might have called some + * more global subprogs that we haven't verified yet, so we + * need to do another pass over subprogs to verify those. + */ + sub_aux->verified = true; + new_cnt++; } + + /* We can't loop forever as we verify at least one global subprog on + * each pass. + */ + if (new_cnt) + goto again; + return 0; } @@ -19267,6 +20321,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info) { bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; + bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING; const char prefix[] = "btf_trace_"; int ret = 0, subprog = -1, i; const struct btf_type *t; @@ -19314,6 +20369,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "Subprog %s doesn't exist\n", tname); return -EINVAL; } + if (aux->func && aux->func[subprog]->aux->exception_cb) { + bpf_log(log, + "%s programs cannot attach to exception callback\n", + prog_extension ? "Extension" : "FENTRY/FEXIT"); + return -EINVAL; + } conservative = aux->func_info_aux[subprog].unreliable; if (prog_extension) { if (conservative) { @@ -19331,10 +20392,21 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "Can attach to only JITed progs\n"); return -EINVAL; } - if (tgt_prog->type == prog->type) { - /* Cannot fentry/fexit another fentry/fexit program. - * Cannot attach program extension to another extension. - * It's ok to attach fentry/fexit to extension program. + if (prog_tracing) { + if (aux->attach_tracing_prog) { + /* + * Target program is an fentry/fexit which is already attached + * to another tracing program. More levels of nesting + * attachment are not allowed. + */ + bpf_log(log, "Cannot nest tracing program attach more than once\n"); + return -EINVAL; + } + } else if (tgt_prog->type == prog->type) { + /* + * To avoid potential call chain cycles, prevent attaching of a + * program extension to another extension. It's ok to attach + * fentry/fexit to extension program. */ bpf_log(log, "Cannot recursively attach\n"); return -EINVAL; @@ -19347,16 +20419,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance * analysis, stats and can be attached to any program - * type except themselves. When extension program is - * replacing XDP function it is necessary to allow - * performance analysis of all functions. Both original - * XDP program and its program extension. Hence - * attaching fentry/fexit to BPF_PROG_TYPE_EXT is - * allowed. If extending of fentry/fexit was allowed it - * would be possible to create long call chain - * fentry->extension->fentry->extension beyond - * reasonable stack size. Hence extending fentry is not - * allowed. + * type. When extension program is replacing XDP function + * it is necessary to allow performance analysis of all + * functions. Both original XDP program and its program + * extension. Hence attaching fentry/fexit to + * BPF_PROG_TYPE_EXT is allowed. If extending of + * fentry/fexit was allowed it would be possible to create + * long call chain fentry->extension->fentry->extension + * beyond reasonable stack size. Hence extending fentry + * is not allowed. */ bpf_log(log, "Cannot extend fentry/fexit\n"); return -EINVAL; @@ -19643,6 +20714,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!tr) return -ENOMEM; + if (tgt_prog && tgt_prog->aux->tail_call_reachable) + tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX; + prog->aux->dst_trampoline = tr; return 0; } @@ -19730,6 +20804,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; + env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), @@ -19738,6 +20813,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (!env->explored_states) goto skip_full_check; + ret = check_btf_info_early(env, attr, uattr); + if (ret < 0) + goto skip_full_check; + ret = add_subprog_and_kfunc(env); if (ret < 0) goto skip_full_check; @@ -19768,8 +20847,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = do_check_subprogs(env); - ret = ret ?: do_check_main(env); + ret = do_check_main(env); + ret = ret ?: do_check_subprogs(env); if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux)) ret = bpf_prog_offload_finalize(env); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c56071f150f2..520b90dd97ec 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -164,13 +164,13 @@ struct cgroup_mgctx { #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) -extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; /* iterate across the hierarchies */ #define for_each_root(root) \ - list_for_each_entry((root), &cgroup_roots, root_list) + list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ + lockdep_is_held(&cgroup_mutex)) /** * for_each_subsys - iterate all enabled cgroup subsystems diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c487ffef6652..520a11cb12f4 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -360,10 +360,9 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, } css_task_iter_end(&it); length = n; - /* now sort & (if procs) strip out duplicates */ + /* now sort & strip out duplicates (tgids or recycled thread PIDs) */ sort(array, length, sizeof(pid_t), cmppid, NULL); - if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(array, length); + length = pidlist_uniq(array, length); l = cgroup_pidlist_find_create(cgrp, type); if (!l) { @@ -803,7 +802,7 @@ void cgroup1_release_agent(struct work_struct *work) goto out_free; ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - if (ret < 0 || ret >= PATH_MAX) + if (ret < 0) goto out_free; argv[0] = agentbuf; @@ -1263,6 +1262,40 @@ int cgroup1_get_tree(struct fs_context *fc) return ret; } +/** + * task_get_cgroup1 - Acquires the associated cgroup of a task within a + * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its + * hierarchy ID. + * @tsk: The target task + * @hierarchy_id: The ID of a cgroup1 hierarchy + * + * On success, the cgroup is returned. On failure, ERR_PTR is returned. + * We limit it to cgroup1 only. + */ +struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id) +{ + struct cgroup *cgrp = ERR_PTR(-ENOENT); + struct cgroup_root *root; + unsigned long flags; + + rcu_read_lock(); + for_each_root(root) { + /* cgroup1 only*/ + if (root == &cgrp_dfl_root) + continue; + if (root->hierarchy_id != hierarchy_id) + continue; + spin_lock_irqsave(&css_set_lock, flags); + cgrp = task_cgroup_from_root(tsk, root); + if (!cgrp || !cgroup_tryget(cgrp)) + cgrp = ERR_PTR(-ENOENT); + spin_unlock_irqrestore(&css_set_lock, flags); + break; + } + rcu_read_unlock(); + return cgrp; +} + static int __init cgroup1_wq_init(void) { /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1fb7f562289d..a66c088c851c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -207,6 +207,8 @@ static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; +static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.count = REFCOUNT_INIT(2), @@ -1313,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - kfree(root); + kfree_rcu(root, rcu); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1345,12 +1347,13 @@ static void cgroup_destroy_root(struct cgroup_root *root) spin_unlock_irq(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; - } + WARN_ON_ONCE(list_empty(&root->root_list)); + list_del_rcu(&root->root_list); + cgroup_root_count--; + + if (!have_favordynmods) + cgroup_favor_dynmods(root, false); - cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); cgroup_unlock(); @@ -1386,7 +1389,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } } - BUG_ON(!res_cgroup); + /* + * If cgroup_mutex is not held, the cgrp_cset_link will be freed + * before we remove the cgroup root from the root_list. Consequently, + * when accessing a cgroup root, the cset_link may have already been + * freed, resulting in a NULL res_cgroup. However, by holding the + * cgroup_mutex, we ensure that res_cgroup can't be NULL. + * If we don't hold cgroup_mutex in the caller, we must do the NULL + * check. + */ return res_cgroup; } @@ -1409,6 +1420,11 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) rcu_read_unlock(); + /* + * The namespace_sem is held by current, so the root cgroup can't + * be umounted. Therefore, we can ensure that the res is non-NULL. + */ + WARN_ON_ONCE(!res); return res; } @@ -1445,7 +1461,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void) static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { - lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); return __cset_cgroup_from_root(cset, root); @@ -1453,7 +1468,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_lock held. + * called with css_set_lock held to prevent task's groups from being modified. + * Must be called with either cgroup_mutex or rcu read lock to prevent the + * cgroup root from being destroyed. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) @@ -1719,20 +1736,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (!css->ss) { if (cgroup_on_dfl(cgrp)) { - ret = cgroup_addrm_files(&cgrp->self, cgrp, + ret = cgroup_addrm_files(css, cgrp, cgroup_base_files, true); if (ret < 0) return ret; if (cgroup_psi_enabled()) { - ret = cgroup_addrm_files(&cgrp->self, cgrp, + ret = cgroup_addrm_files(css, cgrp, cgroup_psi_files, true); if (ret < 0) return ret; } } else { - cgroup_addrm_files(css, cgrp, - cgroup1_base_files, true); + ret = cgroup_addrm_files(css, cgrp, + cgroup1_base_files, true); + if (ret < 0) + return ret; } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { @@ -1887,7 +1906,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); spin_unlock_irq(&css_set_lock); - if (len >= PATH_MAX) + if (len == -E2BIG) len = -ERANGE; else if (len > 0) { seq_escape(sf, buf, " \t\n\\"); @@ -1902,6 +1921,7 @@ enum cgroup2_param { Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, + Opt_memory_hugetlb_accounting, nr__cgroup2_params }; @@ -1910,6 +1930,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), + fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), {} }; @@ -1936,6 +1957,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; + case Opt_memory_hugetlb_accounting: + ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + return 0; } return -EINVAL; } @@ -1960,6 +1984,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; + + if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; } } @@ -1973,6 +2002,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + seq_puts(seq, ",memory_hugetlb_accounting"); return 0; } @@ -2014,7 +2045,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; - INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD_RCU(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); @@ -2097,7 +2128,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ - list_add(&root->root_list, &cgroup_roots); + list_add_rcu(&root->root_list, &cgroup_roots); cgroup_root_count++; /* @@ -2243,9 +2274,9 @@ static int cgroup_init_fs_context(struct fs_context *fc) fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; -#ifdef CONFIG_CGROUP_FAVOR_DYNMODS - ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; -#endif + if (have_favordynmods) + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + return 0; } @@ -3867,14 +3898,6 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } -static int cgroup_pressure_open(struct kernfs_open_file *of) -{ - if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return 0; -} - static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; @@ -4159,20 +4182,6 @@ static struct kernfs_ops cgroup_kf_ops = { .seq_show = cgroup_seqfile_show, }; -/* set uid and gid of cgroup dirs and files to that of the creator */ -static int cgroup_kn_set_ugid(struct kernfs_node *kn) -{ - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = current_fsuid(), - .ia_gid = current_fsgid(), }; - - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) - return 0; - - return kernfs_setattr(kn, &iattr); -} - static void cgroup_file_notify_timer(struct timer_list *timer) { cgroup_file_notify(container_of(timer, struct cgroup_file, @@ -4185,25 +4194,18 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; - int ret; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + current_fsuid(), current_fsgid(), 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); - ret = cgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; @@ -4917,9 +4919,11 @@ repeat: void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { + unsigned long irqflags; + memset(it, 0, sizeof(*it)); - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); it->ss = css->ss; it->flags = flags; @@ -4933,7 +4937,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, css_task_iter_advance(it); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); } /** @@ -4946,12 +4950,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { + unsigned long irqflags; + if (it->cur_task) { put_task_struct(it->cur_task); it->cur_task = NULL; } - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); /* @it may be half-advanced by skips, finish advancing */ if (it->flags & CSS_TASK_ITER_SKIPPED) @@ -4964,7 +4970,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) css_task_iter_advance(it); } - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); return it->cur_task; } @@ -4977,11 +4983,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) */ void css_task_iter_end(struct css_task_iter *it) { + unsigned long irqflags; + if (it->cur_cset) { - spin_lock_irq(&css_set_lock); + spin_lock_irqsave(&css_set_lock, irqflags); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); - spin_unlock_irq(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, irqflags); } if (it->cur_dcset) @@ -5275,7 +5283,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), - .open = cgroup_pressure_open, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5284,7 +5291,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), - .open = cgroup_pressure_open, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5293,7 +5299,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), - .open = cgroup_pressure_open, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, @@ -5303,7 +5308,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), - .open = cgroup_pressure_open, .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, @@ -5604,7 +5608,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, goto out_cancel_ref; /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + kn = kernfs_create_dir_ns(parent->kn, name, mode, + current_fsuid(), current_fsgid(), + cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); goto out_stat_exit; @@ -5749,10 +5755,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) */ kernfs_get(cgrp->kn); - ret = cgroup_kn_set_ugid(cgrp->kn); - if (ret) - goto out_destroy; - ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; @@ -6121,7 +6123,7 @@ int __init cgroup_init(void) if (cgroup1_ssid_disabled(ssid)) pr_info("Disabling %s control group subsystem in v1 mounts\n", - ss->name); + ss->legacy_name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; @@ -6253,7 +6255,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - cgroup_lock(); + rcu_read_lock(); spin_lock_irq(&css_set_lock); for_each_root(root) { @@ -6264,6 +6266,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) continue; + cgrp = task_cgroup_from_root(tsk, root); + /* The root has already been unmounted. */ + if (!cgrp) + continue; + seq_printf(m, "%d:", root->hierarchy_id); if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) @@ -6274,9 +6281,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); - - cgrp = task_cgroup_from_root(tsk, root); - /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, @@ -6289,7 +6293,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (retval >= PATH_MAX) + if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) goto out_unlock; @@ -6308,7 +6312,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); - cgroup_unlock(); + rcu_read_unlock(); kfree(buf); out: return retval; @@ -6764,6 +6768,12 @@ static int __init enable_cgroup_debug(char *str) } __setup("cgroup_debug", enable_cgroup_debug); +static int __init cgroup_favordynmods_setup(char *str) +{ + return (kstrtobool(str, &have_favordynmods) == 0); +} +__setup("cgroup_favordynmods=", cgroup_favordynmods_setup); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -7050,7 +7060,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, "nsdelegate\n" "favordynmods\n" "memory_localevents\n" - "memory_recursiveprot\n"); + "memory_recursiveprot\n" + "memory_hugetlb_accounting\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58ec88efa4f8..ba36c073304a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -25,6 +25,7 @@ #include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/cpuset.h> +#include <linux/delay.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> @@ -43,6 +44,7 @@ #include <linux/sched/isolation.h> #include <linux/cgroup.h> #include <linux/wait.h> +#include <linux/workqueue.h> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -75,16 +77,18 @@ enum prs_errcode { PERR_NOCPUS, PERR_HOTPLUG, PERR_CPUSEMPTY, + PERR_HKEEPING, }; static const char * const perr_strings[] = { - [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus", + [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", [PERR_INVPARENT] = "Parent is an invalid partition root", [PERR_NOTPART] = "Parent is not a partition root", [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", [PERR_HOTPLUG] = "No cpu available due to hotplug", [PERR_CPUSEMPTY] = "cpuset.cpus is empty", + [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", }; struct cpuset { @@ -121,14 +125,23 @@ struct cpuset { nodemask_t effective_mems; /* - * CPUs allocated to child sub-partitions (default hierarchy only) - * - CPUs granted by the parent = effective_cpus U subparts_cpus - * - effective_cpus and subparts_cpus are mutually exclusive. + * Exclusive CPUs dedicated to current cgroup (default hierarchy only) * - * effective_cpus contains only onlined CPUs, but subparts_cpus - * may have offlined ones. + * This exclusive CPUs must be a subset of cpus_allowed. A parent + * cgroup can only grant exclusive CPUs to one of its children. + * + * When the cgroup becomes a valid partition root, effective_xcpus + * defaults to cpus_allowed if not set. The effective_cpus of a valid + * partition root comes solely from its effective_xcpus and some of the + * effective_xcpus may be distributed to sub-partitions below & hence + * excluded from its effective_cpus. + */ + cpumask_var_t effective_xcpus; + + /* + * Exclusive CPUs as requested by the user (default hierarchy only) */ - cpumask_var_t subparts_cpus; + cpumask_var_t exclusive_cpus; /* * This is old Memory Nodes tasks took on. @@ -156,8 +169,8 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* number of CPUs in subparts_cpus */ - int nr_subparts_cpus; + /* number of valid sub-partitions */ + int nr_subparts; /* partition root state */ int partition_root_state; @@ -183,9 +196,25 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; }; /* + * Exclusive CPUs distributed out to sub-partitions of top_cpuset + */ +static cpumask_var_t subpartitions_cpus; + +/* + * Exclusive CPUs in isolated partitions + */ +static cpumask_var_t isolated_cpus; + +/* List of remote partition root children */ +static struct list_head remote_children; + +/* * Partition root states: * * 0 - member (not a partition root) @@ -312,7 +341,7 @@ static inline int is_partition_invalid(const struct cpuset *cs) */ static inline void make_partition_invalid(struct cpuset *cs) { - if (is_partition_valid(cs)) + if (cs->partition_root_state > 0) cs->partition_root_state = -cs->partition_root_state; } @@ -334,6 +363,7 @@ static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), .partition_root_state = PRS_ROOT, + .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; /** @@ -469,7 +499,7 @@ static inline bool partition_is_populated(struct cpuset *cs, if (cs->css.cgroup->nr_populated_csets) return true; - if (!excluded_child && !cs->nr_subparts_cpus) + if (!excluded_child && !cs->nr_subparts) return cgroup_is_populated(cs->css.cgroup); rcu_read_lock(); @@ -596,16 +626,18 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) */ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { - cpumask_var_t *pmask1, *pmask2, *pmask3; + cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; if (cs) { pmask1 = &cs->cpus_allowed; pmask2 = &cs->effective_cpus; - pmask3 = &cs->subparts_cpus; + pmask3 = &cs->effective_xcpus; + pmask4 = &cs->exclusive_cpus; } else { pmask1 = &tmp->new_cpus; pmask2 = &tmp->addmask; pmask3 = &tmp->delmask; + pmask4 = NULL; } if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) @@ -617,8 +649,14 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) goto free_two; + if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) + goto free_three; + + return 0; +free_three: + free_cpumask_var(*pmask3); free_two: free_cpumask_var(*pmask2); free_one: @@ -636,7 +674,8 @@ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (cs) { free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->effective_cpus); - free_cpumask_var(cs->subparts_cpus); + free_cpumask_var(cs->effective_xcpus); + free_cpumask_var(cs->exclusive_cpus); } if (tmp) { free_cpumask_var(tmp->new_cpus); @@ -664,6 +703,8 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); cpumask_copy(trial->effective_cpus, cs->effective_cpus); + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); return trial; } @@ -677,6 +718,28 @@ static inline void free_cpuset(struct cpuset *cs) kfree(cs); } +static inline struct cpumask *fetch_xcpus(struct cpuset *cs) +{ + return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : + cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed + : cs->effective_xcpus; +} + +/* + * cpusets_are_exclusive() - check if two cpusets are exclusive + * + * Return true if exclusive, false if not + */ +static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) +{ + struct cpumask *xcpus1 = fetch_xcpus(cs1); + struct cpumask *xcpus2 = fetch_xcpus(cs2); + + if (cpumask_intersects(xcpus1, xcpus2)) + return false; + return true; +} + /* * validate_change_legacy() - Validate conditions specific to legacy (v1) * behavior. @@ -776,9 +839,10 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) ret = -EINVAL; cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - goto out; + c != cur) { + if (!cpusets_are_exclusive(trial, c)) + goto out; + } if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) @@ -908,7 +972,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && !top_cpuset.nr_subparts_cpus) { + if (root_load_balance && !top_cpuset.nr_subparts) { ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -1159,7 +1223,7 @@ static void rebuild_sched_domains_locked(void) * should be the same as the active CPUs, so checking only top_cpuset * is enough to detect racing CPU offlines. */ - if (!top_cpuset.nr_subparts_cpus && + if (cpumask_empty(subpartitions_cpus) && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) return; @@ -1168,7 +1232,7 @@ static void rebuild_sched_domains_locked(void) * root should be only a subset of the active CPUs. Since a CPU in any * partition root could be offlined, all must be checked. */ - if (top_cpuset.nr_subparts_cpus) { + if (top_cpuset.nr_subparts) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!is_partition_valid(cs)) { @@ -1232,7 +1296,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) */ if (kthread_is_per_cpu(task)) continue; - cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus); + cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { cpumask_and(new_cpus, possible_mask, cs->effective_cpus); } @@ -1247,32 +1311,23 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) * @cs: the cpuset the need to recompute the new effective_cpus mask * @parent: the parent cpuset * - * If the parent has subpartition CPUs, include them in the list of - * allowable CPUs in computing the new effective_cpus mask. Since offlined - * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask - * to mask those out. + * The result is valid only if the given cpuset isn't a partition root. */ static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) { - if (parent->nr_subparts_cpus && is_partition_valid(cs)) { - cpumask_or(new_cpus, parent->effective_cpus, - parent->subparts_cpus); - cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); - cpumask_and(new_cpus, new_cpus, cpu_active_mask); - } else { - cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); - } + cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); } /* - * Commands for update_parent_subparts_cpumask + * Commands for update_parent_effective_cpumask */ -enum subparts_cmd { - partcmd_enable, /* Enable partition root */ - partcmd_disable, /* Disable partition root */ - partcmd_update, /* Update parent's subparts_cpus */ - partcmd_invalidate, /* Make partition invalid */ +enum partition_cmd { + partcmd_enable, /* Enable partition root */ + partcmd_enablei, /* Enable isolated partition root */ + partcmd_disable, /* Disable partition root */ + partcmd_update, /* Update parent's effective_cpus */ + partcmd_invalidate, /* Make partition invalid */ }; static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1304,13 +1359,23 @@ static int update_partition_exclusive(struct cpuset *cs, int new_prs) * * Changing load balance flag will automatically call * rebuild_sched_domains_locked(). + * This function is for cgroup v2 only. */ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) { int new_prs = cs->partition_root_state; - bool new_lb = (new_prs != PRS_ISOLATED); bool rebuild_domains = (new_prs > 0) || (old_prs > 0); + bool new_lb; + /* + * If cs is not a valid partition root, the load balance state + * will follow its parent. + */ + if (new_prs > 0) { + new_lb = (new_prs != PRS_ISOLATED); + } else { + new_lb = is_sched_load_balance(parent_cs(cs)); + } if (new_lb != !!is_sched_load_balance(cs)) { rebuild_domains = true; if (new_lb) @@ -1323,35 +1388,417 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) rebuild_sched_domains_locked(); } +/* + * tasks_nocpu_error - Return true if tasks will have no effective_cpus + */ +static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, + struct cpumask *xcpus) +{ + /* + * A populated partition (cs or parent) can't have empty effective_cpus + */ + return (cpumask_subset(parent->effective_cpus, xcpus) && + partition_is_populated(parent, cs)) || + (!cpumask_intersects(xcpus, cpu_active_mask) && + partition_is_populated(cs, NULL)); +} + +static void reset_partition_data(struct cpuset *cs) +{ + struct cpuset *parent = parent_cs(cs); + + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + return; + + lockdep_assert_held(&callback_lock); + + cs->nr_subparts = 0; + if (cpumask_empty(cs->exclusive_cpus)) { + cpumask_clear(cs->effective_xcpus); + if (is_cpu_exclusive(cs)) + clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); + } + if (!cpumask_and(cs->effective_cpus, + parent->effective_cpus, cs->cpus_allowed)) { + cs->use_parent_ecpus = true; + parent->child_ecpus_count++; + cpumask_copy(cs->effective_cpus, parent->effective_cpus); + } +} + +/* + * partition_xcpus_newstate - Exclusive CPUs state change + * @old_prs: old partition_root_state + * @new_prs: new partition_root_state + * @xcpus: exclusive CPUs with state change + */ +static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus) +{ + WARN_ON_ONCE(old_prs == new_prs); + if (new_prs == PRS_ISOLATED) + cpumask_or(isolated_cpus, isolated_cpus, xcpus); + else + cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); +} + +/* + * partition_xcpus_add - Add new exclusive CPUs to partition + * @new_prs: new partition_root_state + * @parent: parent cpuset + * @xcpus: exclusive CPUs to be added + * Return: true if isolated_cpus modified, false otherwise + * + * Remote partition if parent == NULL + */ +static bool partition_xcpus_add(int new_prs, struct cpuset *parent, + struct cpumask *xcpus) +{ + bool isolcpus_updated; + + WARN_ON_ONCE(new_prs < 0); + lockdep_assert_held(&callback_lock); + if (!parent) + parent = &top_cpuset; + + + if (parent == &top_cpuset) + cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); + + isolcpus_updated = (new_prs != parent->partition_root_state); + if (isolcpus_updated) + partition_xcpus_newstate(parent->partition_root_state, new_prs, + xcpus); + + cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); + return isolcpus_updated; +} + +/* + * partition_xcpus_del - Remove exclusive CPUs from partition + * @old_prs: old partition_root_state + * @parent: parent cpuset + * @xcpus: exclusive CPUs to be removed + * Return: true if isolated_cpus modified, false otherwise + * + * Remote partition if parent == NULL + */ +static bool partition_xcpus_del(int old_prs, struct cpuset *parent, + struct cpumask *xcpus) +{ + bool isolcpus_updated; + + WARN_ON_ONCE(old_prs < 0); + lockdep_assert_held(&callback_lock); + if (!parent) + parent = &top_cpuset; + + if (parent == &top_cpuset) + cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); + + isolcpus_updated = (old_prs != parent->partition_root_state); + if (isolcpus_updated) + partition_xcpus_newstate(old_prs, parent->partition_root_state, + xcpus); + + cpumask_and(xcpus, xcpus, cpu_active_mask); + cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); + return isolcpus_updated; +} + +static void update_unbound_workqueue_cpumask(bool isolcpus_updated) +{ + int ret; + + lockdep_assert_cpus_held(); + + if (!isolcpus_updated) + return; + + ret = workqueue_unbound_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); +} + +/** + * cpuset_cpu_is_isolated - Check if the given CPU is isolated + * @cpu: the CPU number to be checked + * Return: true if CPU is used in an isolated partition, false otherwise + */ +bool cpuset_cpu_is_isolated(int cpu) +{ + return cpumask_test_cpu(cpu, isolated_cpus); +} +EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); + +/* + * compute_effective_exclusive_cpumask - compute effective exclusive CPUs + * @cs: cpuset + * @xcpus: effective exclusive CPUs value to be set + * Return: true if xcpus is not empty, false otherwise. + * + * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), + * it must be a subset of cpus_allowed and parent's effective_xcpus. + */ +static bool compute_effective_exclusive_cpumask(struct cpuset *cs, + struct cpumask *xcpus) +{ + struct cpuset *parent = parent_cs(cs); + + if (!xcpus) + xcpus = cs->effective_xcpus; + + if (!cpumask_empty(cs->exclusive_cpus)) + cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed); + else + cpumask_copy(xcpus, cs->cpus_allowed); + + return cpumask_and(xcpus, xcpus, parent->effective_xcpus); +} + +static inline bool is_remote_partition(struct cpuset *cs) +{ + return !list_empty(&cs->remote_sibling); +} + +static inline bool is_local_partition(struct cpuset *cs) +{ + return is_partition_valid(cs) && !is_remote_partition(cs); +} + +/* + * remote_partition_enable - Enable current cpuset as a remote partition root + * @cs: the cpuset to update + * @new_prs: new partition_root_state + * @tmp: temparary masks + * Return: 1 if successful, 0 if error + * + * Enable the current cpuset to become a remote partition root taking CPUs + * directly from the top cpuset. cpuset_mutex must be held by the caller. + */ +static int remote_partition_enable(struct cpuset *cs, int new_prs, + struct tmpmasks *tmp) +{ + bool isolcpus_updated; + + /* + * The user must have sysadmin privilege. + */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + /* + * The requested exclusive_cpus must not be allocated to other + * partitions and it can't use up all the root's effective_cpus. + * + * Note that if there is any local partition root above it or + * remote partition root underneath it, its exclusive_cpus must + * have overlapped with subpartitions_cpus. + */ + compute_effective_exclusive_cpumask(cs, tmp->new_cpus); + if (cpumask_empty(tmp->new_cpus) || + cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) + return 0; + + spin_lock_irq(&callback_lock); + isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + list_add(&cs->remote_sibling, &remote_children); + if (cs->use_parent_ecpus) { + struct cpuset *parent = parent_cs(cs); + + cs->use_parent_ecpus = false; + parent->child_ecpus_count--; + } + spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + return 1; +} + +/* + * remote_partition_disable - Remove current cpuset from remote partition list + * @cs: the cpuset to update + * @tmp: temparary masks + * + * The effective_cpus is also updated. + * + * cpuset_mutex must be held by the caller. + */ +static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) +{ + bool isolcpus_updated; + + compute_effective_exclusive_cpumask(cs, tmp->new_cpus); + WARN_ON_ONCE(!is_remote_partition(cs)); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); + + spin_lock_irq(&callback_lock); + list_del_init(&cs->remote_sibling); + isolcpus_updated = partition_xcpus_del(cs->partition_root_state, + NULL, tmp->new_cpus); + cs->partition_root_state = -cs->partition_root_state; + if (!cs->prs_err) + cs->prs_err = PERR_INVCPUS; + reset_partition_data(cs); + spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); +} + +/* + * remote_cpus_update - cpus_exclusive change of remote partition + * @cs: the cpuset to be updated + * @newmask: the new effective_xcpus mask + * @tmp: temparary masks + * + * top_cpuset and subpartitions_cpus will be updated or partition can be + * invalidated. + */ +static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, + struct tmpmasks *tmp) +{ + bool adding, deleting; + int prs = cs->partition_root_state; + int isolcpus_updated = 0; + + if (WARN_ON_ONCE(!is_remote_partition(cs))) + return; + + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); + + if (cpumask_empty(newmask)) + goto invalidate; + + adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus); + deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask); + + /* + * Additions of remote CPUs is only allowed if those CPUs are + * not allocated to other partitions and there are effective_cpus + * left in the top cpuset. + */ + if (adding && (!capable(CAP_SYS_ADMIN) || + cpumask_intersects(tmp->addmask, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) + goto invalidate; + + spin_lock_irq(&callback_lock); + if (adding) + isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); + if (deleting) + isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); + spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + return; + +invalidate: + remote_partition_disable(cs, tmp); +} + +/* + * remote_partition_check - check if a child remote partition needs update + * @cs: the cpuset to be updated + * @newmask: the new effective_xcpus mask + * @delmask: temporary mask for deletion (not in tmp) + * @tmp: temparary masks + * + * This should be called before the given cs has updated its cpus_allowed + * and/or effective_xcpus. + */ +static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, + struct cpumask *delmask, struct tmpmasks *tmp) +{ + struct cpuset *child, *next; + int disable_cnt = 0; + + /* + * Compute the effective exclusive CPUs that will be deleted. + */ + if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) || + !cpumask_intersects(delmask, subpartitions_cpus)) + return; /* No deletion of exclusive CPUs in partitions */ + + /* + * Searching the remote children list to look for those that will + * be impacted by the deletion of exclusive CPUs. + * + * Since a cpuset must be removed from the remote children list + * before it can go offline and holding cpuset_mutex will prevent + * any change in cpuset status. RCU read lock isn't needed. + */ + lockdep_assert_held(&cpuset_mutex); + list_for_each_entry_safe(child, next, &remote_children, remote_sibling) + if (cpumask_intersects(child->effective_cpus, delmask)) { + remote_partition_disable(child, tmp); + disable_cnt++; + } + if (disable_cnt) + rebuild_sched_domains_locked(); +} + +/* + * prstate_housekeeping_conflict - check for partition & housekeeping conflicts + * @prstate: partition root state to be checked + * @new_cpus: cpu mask + * Return: true if there is conflict, false otherwise + * + * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in + * an isolated partition. + */ +static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) +{ + const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN); + bool all_in_hk = cpumask_subset(new_cpus, hk_domain); + + if (!all_in_hk && (prstate != PRS_ISOLATED)) + return true; + + return false; +} + /** - * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset + * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state * @cmd: Partition root state change command * @newmask: Optional new cpumask for partcmd_update * @tmp: Temporary addmask and delmask * Return: 0 or a partition root state error code * - * For partcmd_enable, the cpuset is being transformed from a non-partition - * root to a partition root. The cpus_allowed mask of the given cpuset will - * be put into parent's subparts_cpus and taken away from parent's - * effective_cpus. The function will return 0 if all the CPUs listed in - * cpus_allowed can be granted or an error code will be returned. + * For partcmd_enable*, the cpuset is being transformed from a non-partition + * root to a partition root. The effective_xcpus (cpus_allowed if + * effective_xcpus not set) mask of the given cpuset will be taken away from + * parent's effective_cpus. The function will return 0 if all the CPUs listed + * in effective_xcpus can be granted or an error code will be returned. * * For partcmd_disable, the cpuset is being transformed from a partition - * root back to a non-partition root. Any CPUs in cpus_allowed that are in - * parent's subparts_cpus will be taken away from that cpumask and put back - * into parent's effective_cpus. 0 will always be returned. + * root back to a non-partition root. Any CPUs in effective_xcpus will be + * given back to parent's effective_cpus. 0 will always be returned. * * For partcmd_update, if the optional newmask is specified, the cpu list is - * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is + * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is * assumed to remain the same. The cpuset should either be a valid or invalid * partition root. The partition root state may change from valid to invalid - * or vice versa. An error code will only be returned if transitioning from + * or vice versa. An error code will be returned if transitioning from * invalid to valid violates the exclusivity rule. * * For partcmd_invalidate, the current partition will be made invalid. * - * The partcmd_enable and partcmd_disable commands are used by + * The partcmd_enable* and partcmd_disable commands are used by * update_prstate(). An error code may be returned and the caller will check * for error. * @@ -1361,19 +1808,49 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs) * check for error and so partition_root_state and prs_error will be updated * directly. */ -static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, - struct cpumask *newmask, - struct tmpmasks *tmp) +static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, + struct cpumask *newmask, + struct tmpmasks *tmp) { struct cpuset *parent = parent_cs(cs); - int adding; /* Moving cpus from effective_cpus to subparts_cpus */ - int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ + int adding; /* Adding cpus to parent's effective_cpus */ + int deleting; /* Deleting cpus from parent's effective_cpus */ int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ + int subparts_delta = 0; + struct cpumask *xcpus; /* cs effective_xcpus */ + int isolcpus_updated = 0; + bool nocpu; lockdep_assert_held(&cpuset_mutex); /* + * new_prs will only be changed for the partcmd_update and + * partcmd_invalidate commands. + */ + adding = deleting = false; + old_prs = new_prs = cs->partition_root_state; + xcpus = !cpumask_empty(cs->exclusive_cpus) + ? cs->effective_xcpus : cs->cpus_allowed; + + if (cmd == partcmd_invalidate) { + if (is_prs_invalid(old_prs)) + return 0; + + /* + * Make the current partition invalid. + */ + if (is_partition_valid(parent)) + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); + if (old_prs > 0) { + new_prs = -old_prs; + subparts_delta--; + } + goto write_error; + } + + /* * The parent must be a partition root. * The new cpumask, if present, or the current cpus_allowed must * not be empty. @@ -1385,124 +1862,140 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, if (!newmask && cpumask_empty(cs->cpus_allowed)) return PERR_CPUSEMPTY; - /* - * new_prs will only be changed for the partcmd_update and - * partcmd_invalidate commands. - */ - adding = deleting = false; - old_prs = new_prs = cs->partition_root_state; - if (cmd == partcmd_enable) { + nocpu = tasks_nocpu_error(parent, cs, xcpus); + + if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* - * Enabling partition root is not allowed if cpus_allowed - * doesn't overlap parent's cpus_allowed. + * Enabling partition root is not allowed if its + * effective_xcpus is empty or doesn't overlap with + * parent's effective_xcpus. */ - if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed)) + if (cpumask_empty(xcpus) || + !cpumask_intersects(xcpus, parent->effective_xcpus)) return PERR_INVCPUS; + if (prstate_housekeeping_conflict(new_prs, xcpus)) + return PERR_HKEEPING; + /* * A parent can be left with no CPU as long as there is no * task directly associated with the parent partition. */ - if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) && - partition_is_populated(parent, cs)) + if (nocpu) return PERR_NOCPUS; - cpumask_copy(tmp->addmask, cs->cpus_allowed); - adding = true; + cpumask_copy(tmp->delmask, xcpus); + deleting = true; + subparts_delta++; + new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* - * Need to remove cpus from parent's subparts_cpus for valid - * partition root. + * May need to add cpus to parent's effective_cpus for + * valid partition root. */ - deleting = !is_prs_invalid(old_prs) && - cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); - } else if (cmd == partcmd_invalidate) { - if (is_prs_invalid(old_prs)) - return 0; - + adding = !is_prs_invalid(old_prs) && + cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); + if (adding) + subparts_delta--; + new_prs = PRS_MEMBER; + } else if (newmask) { /* - * Make the current partition invalid. It is assumed that - * invalidation is caused by violating cpu exclusivity rule. + * Empty cpumask is not allowed */ - deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); - if (old_prs > 0) { - new_prs = -old_prs; - part_error = PERR_NOTEXCL; + if (cpumask_empty(newmask)) { + part_error = PERR_CPUSEMPTY; + goto write_error; } - } else if (newmask) { + /* * partcmd_update with newmask: * - * Compute add/delete mask to/from subparts_cpus + * Compute add/delete mask to/from effective_cpus * - * delmask = cpus_allowed & ~newmask & parent->subparts_cpus - * addmask = newmask & parent->cpus_allowed - * & ~parent->subparts_cpus + * For valid partition: + * addmask = exclusive_cpus & ~newmask + * & parent->effective_xcpus + * delmask = newmask & ~exclusive_cpus + * & parent->effective_xcpus + * + * For invalid partition: + * delmask = newmask & parent->effective_xcpus */ - cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask); - deleting = cpumask_and(tmp->delmask, tmp->delmask, - parent->subparts_cpus); + if (is_prs_invalid(old_prs)) { + adding = false; + deleting = cpumask_and(tmp->delmask, + newmask, parent->effective_xcpus); + } else { + cpumask_andnot(tmp->addmask, xcpus, newmask); + adding = cpumask_and(tmp->addmask, tmp->addmask, + parent->effective_xcpus); - cpumask_and(tmp->addmask, newmask, parent->cpus_allowed); - adding = cpumask_andnot(tmp->addmask, tmp->addmask, - parent->subparts_cpus); - /* - * Empty cpumask is not allowed - */ - if (cpumask_empty(newmask)) { - part_error = PERR_CPUSEMPTY; + cpumask_andnot(tmp->delmask, newmask, xcpus); + deleting = cpumask_and(tmp->delmask, tmp->delmask, + parent->effective_xcpus); + } /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. */ - } else if (adding && - cpumask_subset(parent->effective_cpus, tmp->addmask) && - !cpumask_intersects(tmp->delmask, cpu_active_mask) && - partition_is_populated(parent, cs)) { + if (nocpu && (!adding || + !cpumask_intersects(tmp->addmask, cpu_active_mask))) { part_error = PERR_NOCPUS; - adding = false; - deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); + deleting = false; + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); } } else { /* - * partcmd_update w/o newmask: + * partcmd_update w/o newmask + * + * delmask = effective_xcpus & parent->effective_cpus * - * delmask = cpus_allowed & parent->subparts_cpus - * addmask = cpus_allowed & parent->cpus_allowed - * & ~parent->subparts_cpus + * This can be called from: + * 1) update_cpumasks_hier() + * 2) cpuset_hotplug_update_tasks() * - * This gets invoked either due to a hotplug event or from - * update_cpumasks_hier(). This can cause the state of a - * partition root to transition from valid to invalid or vice - * versa. So we still need to compute the addmask and delmask. - - * A partition error happens when: - * 1) Cpuset is valid partition, but parent does not distribute - * out any CPUs. - * 2) Parent has tasks and all its effective CPUs will have - * to be distributed out. + * Check to see if it can be transitioned from valid to + * invalid partition or vice versa. + * + * A partition error happens when parent has tasks and all + * its effective CPUs will have to be distributed out. */ - cpumask_and(tmp->addmask, cs->cpus_allowed, - parent->cpus_allowed); - adding = cpumask_andnot(tmp->addmask, tmp->addmask, - parent->subparts_cpus); - - if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) || - (adding && - cpumask_subset(parent->effective_cpus, tmp->addmask) && - partition_is_populated(parent, cs))) { + WARN_ON_ONCE(!is_partition_valid(parent)); + if (nocpu) { part_error = PERR_NOCPUS; - adding = false; - } + if (is_partition_valid(cs)) + adding = cpumask_and(tmp->addmask, + xcpus, parent->effective_xcpus); + } else if (is_partition_invalid(cs) && + cpumask_subset(xcpus, parent->effective_xcpus)) { + struct cgroup_subsys_state *css; + struct cpuset *child; + bool exclusive = true; - if (part_error && is_partition_valid(cs) && - parent->nr_subparts_cpus) - deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, - parent->subparts_cpus); + /* + * Convert invalid partition to valid has to + * pass the cpu exclusivity test. + */ + rcu_read_lock(); + cpuset_for_each_child(child, css, parent) { + if (child == cs) + continue; + if (!cpusets_are_exclusive(cs, child)) { + exclusive = false; + break; + } + } + rcu_read_unlock(); + if (exclusive) + deleting = cpumask_and(tmp->delmask, + xcpus, parent->effective_cpus); + else + part_error = PERR_NOTEXCL; + } } + +write_error: if (part_error) WRITE_ONCE(cs->prs_err, part_error); @@ -1514,13 +2007,17 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, switch (cs->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: - if (part_error) + if (part_error) { new_prs = -old_prs; + subparts_delta--; + } break; case PRS_INVALID_ROOT: case PRS_INVALID_ISOLATED: - if (!part_error) + if (!part_error) { new_prs = -old_prs; + subparts_delta++; + } break; } } @@ -1530,9 +2027,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, /* * Transitioning between invalid to valid or vice versa may require - * changing CS_CPU_EXCLUSIVE. + * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, + * validate_change() has already been successfully called and + * CPU lists in cs haven't been updated yet. So defer it to later. */ - if (old_prs != new_prs) { + if ((old_prs != new_prs) && (cmd != partcmd_update)) { int err = update_partition_exclusive(cs, new_prs); if (err) @@ -1540,39 +2039,42 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, } /* - * Change the parent's subparts_cpus. + * Change the parent's effective_cpus & effective_xcpus (top cpuset + * only). + * * Newly added CPUs will be removed from effective_cpus and * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); - if (adding) { - cpumask_or(parent->subparts_cpus, - parent->subparts_cpus, tmp->addmask); - cpumask_andnot(parent->effective_cpus, - parent->effective_cpus, tmp->addmask); + if (old_prs != new_prs) { + cs->partition_root_state = new_prs; + if (new_prs <= 0) + cs->nr_subparts = 0; } - if (deleting) { - cpumask_andnot(parent->subparts_cpus, - parent->subparts_cpus, tmp->delmask); - /* - * Some of the CPUs in subparts_cpus might have been offlined. - */ - cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask); - cpumask_or(parent->effective_cpus, - parent->effective_cpus, tmp->delmask); + /* + * Adding to parent's effective_cpus means deletion CPUs from cs + * and vice versa. + */ + if (adding) + isolcpus_updated += partition_xcpus_del(old_prs, parent, + tmp->addmask); + if (deleting) + isolcpus_updated += partition_xcpus_add(new_prs, parent, + tmp->delmask); + + if (is_partition_valid(parent)) { + parent->nr_subparts += subparts_delta; + WARN_ON_ONCE(parent->nr_subparts < 0); } - - parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); - - if (old_prs != new_prs) - cs->partition_root_state = new_prs; - spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(isolcpus_updated); + + if ((old_prs != new_prs) && (cmd == partcmd_update)) + update_partition_exclusive(cs, new_prs); if (adding || deleting) { update_tasks_cpumask(parent, tmp->addmask); - if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, tmp); + update_sibling_cpumasks(parent, cs, tmp); } /* @@ -1590,6 +2092,73 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, return 0; } +/** + * compute_partition_effective_cpumask - compute effective_cpus for partition + * @cs: partition root cpuset + * @new_ecpus: previously computed effective_cpus to be updated + * + * Compute the effective_cpus of a partition root by scanning effective_xcpus + * of child partition roots and excluding their effective_xcpus. + * + * This has the side effect of invalidating valid child partition roots, + * if necessary. Since it is called from either cpuset_hotplug_update_tasks() + * or update_cpumasks_hier() where parent and children are modified + * successively, we don't need to call update_parent_effective_cpumask() + * and the child's effective_cpus will be updated in later iterations. + * + * Note that rcu_read_lock() is assumed to be held. + */ +static void compute_partition_effective_cpumask(struct cpuset *cs, + struct cpumask *new_ecpus) +{ + struct cgroup_subsys_state *css; + struct cpuset *child; + bool populated = partition_is_populated(cs, NULL); + + /* + * Check child partition roots to see if they should be + * invalidated when + * 1) child effective_xcpus not a subset of new + * excluisve_cpus + * 2) All the effective_cpus will be used up and cp + * has tasks + */ + compute_effective_exclusive_cpumask(cs, new_ecpus); + cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); + + rcu_read_lock(); + cpuset_for_each_child(child, css, cs) { + if (!is_partition_valid(child)) + continue; + + child->prs_err = 0; + if (!cpumask_subset(child->effective_xcpus, + cs->effective_xcpus)) + child->prs_err = PERR_INVCPUS; + else if (populated && + cpumask_subset(new_ecpus, child->effective_xcpus)) + child->prs_err = PERR_NOCPUS; + + if (child->prs_err) { + int old_prs = child->partition_root_state; + + /* + * Invalidate child partition + */ + spin_lock_irq(&callback_lock); + make_partition_invalid(child); + cs->nr_subparts--; + child->nr_subparts = 0; + spin_unlock_irq(&callback_lock); + notify_partition_change(child, old_prs); + continue; + } + cpumask_andnot(new_ecpus, new_ecpus, + child->effective_xcpus); + } + rcu_read_unlock(); +} + /* * update_cpumasks_hier() flags */ @@ -1620,9 +2189,44 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); + bool remote = is_remote_partition(cp); bool update_parent = false; - compute_effective_cpumask(tmp->new_cpus, cp, parent); + /* + * Skip descendent remote partition that acquires CPUs + * directly from top cpuset unless it is cs. + */ + if (remote && (cp != cs)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + /* + * Update effective_xcpus if exclusive_cpus set. + * The case when exclusive_cpus isn't set is handled later. + */ + if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) { + spin_lock_irq(&callback_lock); + compute_effective_exclusive_cpumask(cp, NULL); + spin_unlock_irq(&callback_lock); + } + + old_prs = new_prs = cp->partition_root_state; + if (remote || (is_partition_valid(parent) && + is_partition_valid(cp))) + compute_partition_effective_cpumask(cp, tmp->new_cpus); + else + compute_effective_cpumask(tmp->new_cpus, cp, parent); + + /* + * A partition with no effective_cpus is allowed as long as + * there is no task associated with it. Call + * update_parent_effective_cpumask() to check it. + */ + if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) { + update_parent = true; + goto update_parent_effective; + } /* * If it becomes empty, inherit the effective mask of the @@ -1630,11 +2234,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * it is a partition root that has explicitly distributed * out all its CPUs. */ - if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) { - if (is_partition_valid(cp) && - cpumask_equal(cp->cpus_allowed, cp->subparts_cpus)) - goto update_parent_subparts; - + if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) { cpumask_copy(tmp->new_cpus, parent->effective_cpus); if (!cp->use_parent_ecpus) { cp->use_parent_ecpus = true; @@ -1646,6 +2246,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, parent->child_ecpus_count--; } + if (remote) + goto get_css; + /* * Skip the whole subtree if * 1) the cpumask remains the same, @@ -1661,14 +2264,13 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, continue; } -update_parent_subparts: +update_parent_effective: /* - * update_parent_subparts_cpumask() should have been called + * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call * update_tasks_cpumask() again for tasks in the parent - * cpuset if the parent's subparts_cpus changes. + * cpuset if the parent's effective_cpus changes. */ - old_prs = new_prs = cp->partition_root_state; if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { case PRS_ROOT: @@ -1690,14 +2292,13 @@ update_parent_subparts: break; } } - +get_css: if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); if (update_parent) { - update_parent_subparts_cpumask(cp, partcmd_update, NULL, - tmp); + update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp); /* * The cpuset partition_root_state may become * invalid. Capture it. @@ -1706,30 +2307,17 @@ update_parent_subparts: } spin_lock_irq(&callback_lock); - - if (cp->nr_subparts_cpus && !is_partition_valid(cp)) { - /* - * Put all active subparts_cpus back to effective_cpus. - */ - cpumask_or(tmp->new_cpus, tmp->new_cpus, - cp->subparts_cpus); - cpumask_and(tmp->new_cpus, tmp->new_cpus, - cpu_active_mask); - cp->nr_subparts_cpus = 0; - cpumask_clear(cp->subparts_cpus); - } - cpumask_copy(cp->effective_cpus, tmp->new_cpus); - if (cp->nr_subparts_cpus) { - /* - * Make sure that effective_cpus & subparts_cpus - * are mutually exclusive. - */ - cpumask_andnot(cp->effective_cpus, cp->effective_cpus, - cp->subparts_cpus); - } - cp->partition_root_state = new_prs; + /* + * Make sure effective_xcpus is properly set for a valid + * partition root. + */ + if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) + cpumask_and(cp->effective_xcpus, + cp->cpus_allowed, parent->effective_xcpus); + else if (new_prs < 0) + reset_partition_data(cp); spin_unlock_irq(&callback_lock); notify_partition_change(cp, old_prs); @@ -1737,7 +2325,7 @@ update_parent_subparts: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp, tmp->new_cpus); + update_tasks_cpumask(cp, cp->effective_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE @@ -1790,8 +2378,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, /* * Check all its siblings and call update_cpumasks_hier() - * if their use_parent_ecpus flag is set in order for them - * to use the right effective_cpus value. + * if their effective_cpus will need to be changed. + * + * With the addition of effective_xcpus which is a subset of + * cpus_allowed. It is possible a change in parent's effective_cpus + * due to a change in a child partition's effective_xcpus will impact + * its siblings even if they do not inherit parent's effective_cpus + * directly. * * The update_cpumasks_hier() function may sleep. So we have to * release the RCU read lock before calling it. HIER_NO_SD_REBUILD @@ -1802,8 +2395,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs) continue; - if (!sibling->use_parent_ecpus) - continue; + if (!sibling->use_parent_ecpus && + !is_partition_valid(sibling)) { + compute_effective_cpumask(tmp->new_cpus, sibling, + parent); + if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) + continue; + } if (!css_tryget_online(&sibling->css)) continue; @@ -1826,7 +2424,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; + struct cpuset *parent = parent_cs(cs); bool invalidate = false; + int hier_flags = 0; int old_prs = cs->partition_root_state; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ @@ -1841,6 +2441,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ if (!*buf) { cpumask_clear(trialcs->cpus_allowed); + cpumask_clear(trialcs->effective_xcpus); } else { retval = cpulist_parse(buf, trialcs->cpus_allowed); if (retval < 0) @@ -1849,6 +2450,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!cpumask_subset(trialcs->cpus_allowed, top_cpuset.cpus_allowed)) return -EINVAL; + + /* + * When exclusive_cpus isn't explicitly set, it is constrainted + * by cpus_allowed and parent's effective_xcpus. Otherwise, + * trialcs->effective_xcpus is used as a temporary cpumask + * for checking validity of the partition root. + */ + if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) + compute_effective_exclusive_cpumask(trialcs, NULL); } /* Nothing to do if the cpus didn't change */ @@ -1858,11 +2468,32 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (alloc_cpumasks(NULL, &tmp)) return -ENOMEM; + if (old_prs) { + if (is_partition_valid(cs) && + cpumask_empty(trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_INVCPUS; + } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_HKEEPING; + } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_NOCPUS; + } + } + + /* + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. + */ + if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) + hier_flags = HIER_CHECKALL; + retval = validate_change(cs, trialcs); if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { - struct cpuset *cp, *parent; struct cgroup_subsys_state *css; + struct cpuset *cp; /* * The -EINVAL error code indicates that partition sibling @@ -1873,70 +2504,168 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ invalidate = true; rcu_read_lock(); - parent = parent_cs(cs); - cpuset_for_each_child(cp, css, parent) + cpuset_for_each_child(cp, css, parent) { + struct cpumask *xcpus = fetch_xcpus(trialcs); + if (is_partition_valid(cp) && - cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) { + cpumask_intersects(xcpus, cp->effective_xcpus)) { rcu_read_unlock(); - update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp); + update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); rcu_read_lock(); } + } rcu_read_unlock(); retval = 0; } + if (retval < 0) goto out_free; - if (cs->partition_root_state) { - if (invalidate) - update_parent_subparts_cpumask(cs, partcmd_invalidate, - NULL, &tmp); + if (is_partition_valid(cs) || + (is_partition_invalid(cs) && !invalidate)) { + struct cpumask *xcpus = trialcs->effective_xcpus; + + if (cpumask_empty(xcpus) && is_partition_invalid(cs)) + xcpus = trialcs->cpus_allowed; + + /* + * Call remote_cpus_update() to handle valid remote partition + */ + if (is_remote_partition(cs)) + remote_cpus_update(cs, xcpus, &tmp); + else if (invalidate) + update_parent_effective_cpumask(cs, partcmd_invalidate, + NULL, &tmp); else - update_parent_subparts_cpumask(cs, partcmd_update, - trialcs->cpus_allowed, &tmp); + update_parent_effective_cpumask(cs, partcmd_update, + xcpus, &tmp); + } else if (!cpumask_empty(cs->exclusive_cpus)) { + /* + * Use trialcs->effective_cpus as a temp cpumask + */ + remote_partition_check(cs, trialcs->effective_xcpus, + trialcs->effective_cpus, &tmp); } - compute_effective_cpumask(trialcs->effective_cpus, trialcs, - parent_cs(cs)); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); + if ((old_prs > 0) && !is_partition_valid(cs)) + reset_partition_data(cs); + spin_unlock_irq(&callback_lock); + + /* effective_cpus/effective_xcpus will be updated here */ + update_cpumasks_hier(cs, &tmp, hier_flags); + + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ + if (cs->partition_root_state) + update_partition_sd_lb(cs, old_prs); +out_free: + free_cpumasks(NULL, &tmp); + return 0; +} + +/** + * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset + * @cs: the cpuset to consider + * @trialcs: trial cpuset + * @buf: buffer of cpu numbers written to this cpuset + * + * The tasks' cpumask will be updated if cs is a valid partition root. + */ +static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + int retval; + struct tmpmasks tmp; + struct cpuset *parent = parent_cs(cs); + bool invalidate = false; + int hier_flags = 0; + int old_prs = cs->partition_root_state; + + if (!*buf) { + cpumask_clear(trialcs->exclusive_cpus); + cpumask_clear(trialcs->effective_xcpus); + } else { + retval = cpulist_parse(buf, trialcs->exclusive_cpus); + if (retval < 0) + return retval; + if (!is_cpu_exclusive(cs)) + set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags); + } + + /* Nothing to do if the CPUs didn't change */ + if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) + return 0; + + if (alloc_cpumasks(NULL, &tmp)) + return -ENOMEM; + + if (*buf) + compute_effective_exclusive_cpumask(trialcs, NULL); /* - * Make sure that subparts_cpus, if not empty, is a subset of - * cpus_allowed. Clear subparts_cpus if partition not valid or - * empty effective cpus with tasks. + * Check all the descendants in update_cpumasks_hier() if + * effective_xcpus is to be changed. */ - if (cs->nr_subparts_cpus) { - if (!is_partition_valid(cs) || - (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) && - partition_is_populated(cs, NULL))) { - cs->nr_subparts_cpus = 0; - cpumask_clear(cs->subparts_cpus); + if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus)) + hier_flags = HIER_CHECKALL; + + retval = validate_change(cs, trialcs); + if (retval) + return retval; + + if (old_prs) { + if (cpumask_empty(trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_INVCPUS; + } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_HKEEPING; + } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { + invalidate = true; + cs->prs_err = PERR_NOCPUS; + } + + if (is_remote_partition(cs)) { + if (invalidate) + remote_partition_disable(cs, &tmp); + else + remote_cpus_update(cs, trialcs->effective_xcpus, + &tmp); + } else if (invalidate) { + update_parent_effective_cpumask(cs, partcmd_invalidate, + NULL, &tmp); } else { - cpumask_and(cs->subparts_cpus, cs->subparts_cpus, - cs->cpus_allowed); - cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); + update_parent_effective_cpumask(cs, partcmd_update, + trialcs->effective_xcpus, &tmp); } + } else if (!cpumask_empty(trialcs->exclusive_cpus)) { + /* + * Use trialcs->effective_cpus as a temp cpumask + */ + remote_partition_check(cs, trialcs->effective_xcpus, + trialcs->effective_cpus, &tmp); } + spin_lock_irq(&callback_lock); + cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); + cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); + if ((old_prs > 0) && !is_partition_valid(cs)) + reset_partition_data(cs); spin_unlock_irq(&callback_lock); - /* effective_cpus will be updated here */ - update_cpumasks_hier(cs, &tmp, 0); - - if (cs->partition_root_state) { - struct cpuset *parent = parent_cs(cs); - - /* - * For partition root, update the cpumasks of sibling - * cpusets if they use parent's effective_cpus. - */ - if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmp); + /* + * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus + * of the subtree when it is a valid partition root or effective_xcpus + * is updated. + */ + if (is_partition_valid(cs) || hier_flags) + update_cpumasks_hier(cs, &tmp, hier_flags); - /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */ + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ + if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); - } -out_free: + free_cpumasks(NULL, &tmp); return 0; } @@ -2315,27 +3044,39 @@ static int update_prstate(struct cpuset *cs, int new_prs) int err = PERR_NONE, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; + bool new_xcpus_state = false; if (old_prs == new_prs) return 0; /* - * For a previously invalid partition root, leave it at being - * invalid if new_prs is not "member". + * Treat a previously invalid partition root as if it is a "member". */ - if (new_prs && is_prs_invalid(old_prs)) { - cs->partition_root_state = -new_prs; - return 0; - } + if (new_prs && is_prs_invalid(old_prs)) + old_prs = PRS_MEMBER; if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; + /* + * Setup effective_xcpus if not properly set yet, it will be cleared + * later if partition becomes invalid. + */ + if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { + spin_lock_irq(&callback_lock); + cpumask_and(cs->effective_xcpus, + cs->cpus_allowed, parent->effective_xcpus); + spin_unlock_irq(&callback_lock); + } + err = update_partition_exclusive(cs, new_prs); if (err) goto out; if (!old_prs) { + enum partition_cmd cmd = (new_prs == PRS_ROOT) + ? partcmd_enable : partcmd_enablei; + /* * cpus_allowed cannot be empty. */ @@ -2344,31 +3085,33 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; } - err = update_parent_subparts_cpumask(cs, partcmd_enable, - NULL, &tmpmask); + err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); + /* + * If an attempt to become local partition root fails, + * try to become a remote partition root instead. + */ + if (err && remote_partition_enable(cs, new_prs, &tmpmask)) + err = 0; } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. */ - ; + new_xcpus_state = true; } else { /* * Switching back to member is always allowed even if it * disables child partitions. */ - update_parent_subparts_cpumask(cs, partcmd_disable, NULL, - &tmpmask); + if (is_remote_partition(cs)) + remote_partition_disable(cs, &tmpmask); + else + update_parent_effective_cpumask(cs, partcmd_disable, + NULL, &tmpmask); /* - * If there are child partitions, they will all become invalid. + * Invalidation of child partitions will be done in + * update_cpumasks_hier(). */ - if (unlikely(cs->nr_subparts_cpus)) { - spin_lock_irq(&callback_lock); - cs->nr_subparts_cpus = 0; - cpumask_clear(cs->subparts_cpus); - compute_effective_cpumask(cs->effective_cpus, cs, parent); - spin_unlock_irq(&callback_lock); - } } out: /* @@ -2383,14 +3126,15 @@ out: spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; WRITE_ONCE(cs->prs_err, err); + if (!is_partition_valid(cs)) + reset_partition_data(cs); + else if (new_xcpus_state) + partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); + update_unbound_workqueue_cpumask(new_xcpus_state); - /* - * Update child cpusets, if present. - * Force update if switching back to member. - */ - if (!list_empty(&cs->css.children)) - update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); + /* Force update if switching back to member */ + update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); @@ -2639,7 +3383,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) guarantee_online_cpus(task, cpus_attach); else cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), - cs->subparts_cpus); + subpartitions_cpus); /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -2742,6 +3486,9 @@ typedef enum { FILE_EFFECTIVE_CPULIST, FILE_EFFECTIVE_MEMLIST, FILE_SUBPARTS_CPULIST, + FILE_EXCLUSIVE_CPULIST, + FILE_EFFECTIVE_XCPULIST, + FILE_ISOLATED_CPULIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, @@ -2879,6 +3626,9 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; + case FILE_EXCLUSIVE_CPULIST: + retval = update_exclusive_cpumask(cs, trialcs, buf); + break; case FILE_MEMLIST: retval = update_nodemask(cs, trialcs, buf); break; @@ -2926,8 +3676,17 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) case FILE_EFFECTIVE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems)); break; + case FILE_EXCLUSIVE_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus)); + break; + case FILE_EFFECTIVE_XCPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus)); + break; case FILE_SUBPARTS_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus)); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus)); + break; + case FILE_ISOLATED_CPULIST: + seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus)); break; default: ret = -EINVAL; @@ -3200,10 +3959,33 @@ static struct cftype dfl_files[] = { }, { + .name = "cpus.exclusive", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_EXCLUSIVE_CPULIST, + .flags = CFTYPE_NOT_ON_ROOT, + }, + + { + .name = "cpus.exclusive.effective", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_XCPULIST, + .flags = CFTYPE_NOT_ON_ROOT, + }, + + { .name = "cpus.subpartitions", .seq_show = cpuset_common_seq_show, .private = FILE_SUBPARTS_CPULIST, - .flags = CFTYPE_DEBUG, + .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, + }, + + { + .name = "cpus.isolated", + .seq_show = cpuset_common_seq_show, + .private = FILE_ISOLATED_CPULIST, + .flags = CFTYPE_ONLY_ON_ROOT, }, { } /* terminate */ @@ -3241,6 +4023,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; + INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) @@ -3276,6 +4059,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; cs->use_parent_ecpus = true; parent->child_ecpus_count++; + /* + * Clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (!is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } /* @@ -3377,6 +4165,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); + cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { cpumask_copy(top_cpuset.cpus_allowed, @@ -3515,16 +4304,22 @@ int __init cpuset_init(void) { BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); + cpumask_setall(top_cpuset.effective_xcpus); + cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; + INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -3625,6 +4420,30 @@ void cpuset_force_rebuild(void) force_rebuild = true; } +/* + * Attempt to acquire a cpus_read_lock while a hotplug operation may be in + * progress. + * Return: true if successful, false otherwise + * + * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, + * cpus_read_trylock() is used here to acquire the lock. + */ +static bool cpuset_hotplug_cpus_read_trylock(void) +{ + int retries = 0; + + while (!cpus_read_trylock()) { + /* + * CPU hotplug still in progress. Retry 5 times + * with a 10ms wait before bailing out. + */ + if (++retries > 5) + return false; + msleep(10); + } + return true; +} + /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -3640,6 +4459,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) static nodemask_t new_mems; bool cpus_updated; bool mems_updated; + bool remote; + int partcmd = -1; struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -3659,29 +4480,25 @@ retry: compute_effective_cpumask(&new_cpus, cs, parent); nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); - if (cs->nr_subparts_cpus) - /* - * Make sure that CPUs allocated to child partitions - * do not show up in effective_cpus. - */ - cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus); - if (!tmp || !cs->partition_root_state) goto update_tasks; /* - * In the unlikely event that a partition root has empty - * effective_cpus with tasks, we will have to invalidate child - * partitions, if present, by setting nr_subparts_cpus to 0 to - * reclaim their cpus. + * Compute effective_cpus for valid partition root, may invalidate + * child partition roots if necessary. */ - if (cs->nr_subparts_cpus && is_partition_valid(cs) && - cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) { - spin_lock_irq(&callback_lock); - cs->nr_subparts_cpus = 0; - cpumask_clear(cs->subparts_cpus); - spin_unlock_irq(&callback_lock); + remote = is_remote_partition(cs); + if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) + compute_partition_effective_cpumask(cs, &new_cpus); + + if (remote && cpumask_empty(&new_cpus) && + partition_is_populated(cs, NULL) && + cpuset_hotplug_cpus_read_trylock()) { + remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); + remote = false; + cpuset_force_rebuild(); + cpus_read_unlock(); } /* @@ -3691,44 +4508,32 @@ retry: * 2) parent is invalid or doesn't grant any cpus to child * partitions. */ - if (is_partition_valid(cs) && (!parent->nr_subparts_cpus || - (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) { - int old_prs, parent_prs; - - update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp); - if (cs->nr_subparts_cpus) { - spin_lock_irq(&callback_lock); - cs->nr_subparts_cpus = 0; - cpumask_clear(cs->subparts_cpus); - spin_unlock_irq(&callback_lock); - compute_effective_cpumask(&new_cpus, cs, parent); - } - - old_prs = cs->partition_root_state; - parent_prs = parent->partition_root_state; - if (is_partition_valid(cs)) { - spin_lock_irq(&callback_lock); - make_partition_invalid(cs); - spin_unlock_irq(&callback_lock); - if (is_prs_invalid(parent_prs)) - WRITE_ONCE(cs->prs_err, PERR_INVPARENT); - else if (!parent_prs) - WRITE_ONCE(cs->prs_err, PERR_NOTPART); - else - WRITE_ONCE(cs->prs_err, PERR_HOTPLUG); - notify_partition_change(cs, old_prs); - } - cpuset_force_rebuild(); - } - + if (is_local_partition(cs) && (!is_partition_valid(parent) || + tasks_nocpu_error(parent, cs, &new_cpus))) + partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned * back to a regular one. */ - else if (is_partition_valid(parent) && is_partition_invalid(cs)) { - update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp); - if (is_partition_valid(cs)) + else if (is_partition_valid(parent) && is_partition_invalid(cs)) + partcmd = partcmd_update; + + /* + * cpus_read_lock needs to be held before calling + * update_parent_effective_cpumask(). To avoid circular lock + * dependency between cpuset_mutex and cpus_read_lock, + * cpus_read_trylock() is used here to acquire the lock. + */ + if (partcmd >= 0) { + if (!cpuset_hotplug_cpus_read_trylock()) + goto update_tasks; + + update_parent_effective_cpumask(cs, partcmd, NULL, tmp); + cpus_read_unlock(); + if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { + compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); + } } update_tasks: @@ -3786,21 +4591,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work) new_mems = node_states[N_MEMORY]; /* - * If subparts_cpus is populated, it is likely that the check below - * will produce a false positive on cpus_updated when the cpu list - * isn't changed. It is extra work, but it is better to be safe. + * If subpartitions_cpus is populated, it is likely that the check + * below will produce a false positive on cpus_updated when the cpu + * list isn't changed. It is extra work, but it is better to be safe. */ - cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); + cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) || + !cpumask_empty(subpartitions_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); /* - * In the rare case that hotplug removes all the cpus in subparts_cpus, - * we assumed that cpus are updated. + * In the rare case that hotplug removes all the cpus in + * subpartitions_cpus, we assumed that cpus are updated. */ - if (!cpus_updated && top_cpuset.nr_subparts_cpus) + if (!cpus_updated && top_cpuset.nr_subparts) cpus_updated = true; - /* synchronize cpus_allowed to cpu_active_mask */ + /* For v1, synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { spin_lock_irq(&callback_lock); if (!on_dfl) @@ -3808,17 +4614,16 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* * Make sure that CPUs allocated to child partitions * do not show up in effective_cpus. If no CPU is left, - * we clear the subparts_cpus & let the child partitions + * we clear the subpartitions_cpus & let the child partitions * fight for the CPUs again. */ - if (top_cpuset.nr_subparts_cpus) { - if (cpumask_subset(&new_cpus, - top_cpuset.subparts_cpus)) { - top_cpuset.nr_subparts_cpus = 0; - cpumask_clear(top_cpuset.subparts_cpus); + if (!cpumask_empty(subpartitions_cpus)) { + if (cpumask_subset(&new_cpus, subpartitions_cpus)) { + top_cpuset.nr_subparts = 0; + cpumask_clear(subpartitions_cpus); } else { cpumask_andnot(&new_cpus, &new_cpus, - top_cpuset.subparts_cpus); + subpartitions_cpus); } } cpumask_copy(top_cpuset.effective_cpus, &new_cpus); @@ -3950,7 +4755,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) * We first exclude cpus allocated to partitions. If there is no * allowable online cpu left, we fall back to all possible cpus. */ - cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus); + cpumask_andnot(pmask, possible_mask, subpartitions_cpus); if (!cpumask_intersects(pmask, cpu_online_mask)) cpumask_copy(pmask, possible_mask); } @@ -4287,7 +5092,7 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, current->nsproxy->cgroup_ns); css_put(css); - if (retval >= PATH_MAX) + if (retval == -E2BIG) retval = -ENAMETOOLONG; if (retval < 0) goto out_free; diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 122dacb3a443..66d1708042a7 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -66,9 +66,15 @@ static struct freezer *parent_freezer(struct freezer *freezer) bool cgroup_freezing(struct task_struct *task) { bool ret; + unsigned int state; rcu_read_lock(); - ret = task_freezer(task)->state & CGROUP_FREEZING; + /* Check if the cgroup is still FREEZING, but not FROZEN. The extra + * !FROZEN check is required, because the FREEZING bit is not cleared + * when the state FROZEN is reached. + */ + state = task_freezer(task)->state; + ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN); rcu_read_unlock(); return ret; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d80d7a608141..a8350d2d63e6 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -74,64 +74,109 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) } /** - * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree - * @pos: current position - * @root: root of the tree to traversal + * cgroup_rstat_push_children - push children cgroups into the given list + * @head: current head of the list (= subtree root) + * @child: first child of the root * @cpu: target cpu + * Return: A new singly linked list of cgroups to be flush * - * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts - * the traversal and %NULL return indicates the end. During traversal, - * each returned cgroup is unlinked from the tree. Must be called with the - * matching cgroup_rstat_cpu_lock held. + * Iteratively traverse down the cgroup_rstat_cpu updated tree level by + * level and push all the parents first before their next level children + * into a singly linked list built from the tail backward like "pushing" + * cgroups into a stack. The root is pushed by the caller. + */ +static struct cgroup *cgroup_rstat_push_children(struct cgroup *head, + struct cgroup *child, int cpu) +{ + struct cgroup *chead = child; /* Head of child cgroup level */ + struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */ + struct cgroup *parent, *grandchild; + struct cgroup_rstat_cpu *crstatc; + + child->rstat_flush_next = NULL; + +next_level: + while (chead) { + child = chead; + chead = child->rstat_flush_next; + parent = cgroup_parent(child); + + /* updated_next is parent cgroup terminated */ + while (child != parent) { + child->rstat_flush_next = head; + head = child; + crstatc = cgroup_rstat_cpu(child, cpu); + grandchild = crstatc->updated_children; + if (grandchild != child) { + /* Push the grand child to the next level */ + crstatc->updated_children = child; + grandchild->rstat_flush_next = ghead; + ghead = grandchild; + } + child = crstatc->updated_next; + crstatc->updated_next = NULL; + } + } + + if (ghead) { + chead = ghead; + ghead = NULL; + goto next_level; + } + return head; +} + +/** + * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed + * @root: root of the cgroup subtree to traverse + * @cpu: target cpu + * Return: A singly linked list of cgroups to be flushed + * + * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, + * each returned cgroup is unlinked from the updated tree. * * The only ordering guarantee is that, for a parent and a child pair - * covered by a given traversal, if a child is visited, its parent is - * guaranteed to be visited afterwards. + * covered by a given traversal, the child is before its parent in + * the list. + * + * Note that updated_children is self terminated and points to a list of + * child cgroups if not empty. Whereas updated_next is like a sibling link + * within the children list and terminated by the parent cgroup. An exception + * here is the cgroup root whose updated_next can be self terminated. */ -static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, - struct cgroup *root, int cpu) +static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) { - struct cgroup_rstat_cpu *rstatc; - struct cgroup *parent; - - if (pos == root) - return NULL; + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu); + struct cgroup *head = NULL, *parent, *child; + unsigned long flags; /* - * We're gonna walk down to the first leaf and visit/remove it. We - * can pick whatever unvisited node as the starting point. + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. */ - if (!pos) { - pos = root; - /* return NULL if this subtree is not on-list */ - if (!cgroup_rstat_cpu(pos, cpu)->updated_next) - return NULL; - } else { - pos = cgroup_parent(pos); - } + raw_spin_lock_irqsave(cpu_lock, flags); - /* walk down to the first leaf */ - while (true) { - rstatc = cgroup_rstat_cpu(pos, cpu); - if (rstatc->updated_children == pos) - break; - pos = rstatc->updated_children; - } + /* Return NULL if this subtree is not on-list */ + if (!rstatc->updated_next) + goto unlock_ret; /* - * Unlink @pos from the tree. As the updated_children list is + * Unlink @root from its parent. As the updated_children list is * singly linked, we have to walk it to find the removal point. - * However, due to the way we traverse, @pos will be the first - * child in most cases. The only exception is @root. */ - parent = cgroup_parent(pos); + parent = cgroup_parent(root); if (parent) { struct cgroup_rstat_cpu *prstatc; struct cgroup **nextp; prstatc = cgroup_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; - while (*nextp != pos) { + while (*nextp != root) { struct cgroup_rstat_cpu *nrstatc; nrstatc = cgroup_rstat_cpu(*nextp, cpu); @@ -142,7 +187,17 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, } rstatc->updated_next = NULL; - return pos; + + /* Push @root to the list first before pushing the children */ + head = root; + root->rstat_flush_next = NULL; + child = rstatc->updated_children; + rstatc->updated_children = root; + if (child != root) + head = cgroup_rstat_push_children(head, child, cpu); +unlock_ret: + raw_spin_unlock_irqrestore(cpu_lock, flags); + return head; } /* @@ -156,19 +211,16 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * optimize away the callsite. Therefore, __weak is needed to ensure that the * call is still emitted, by telling the compiler that we don't know what the * function might eventually be. - * - * __diag_* below are needed to dismiss the missing prototype warning. */ -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "kfuncs which will be used in BPF programs"); + +__bpf_hook_start(); __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, struct cgroup *parent, int cpu) { } -__diag_pop(); +__bpf_hook_end(); /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) @@ -179,21 +231,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) lockdep_assert_held(&cgroup_rstat_lock); for_each_possible_cpu(cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, - cpu); - struct cgroup *pos = NULL; - unsigned long flags; + struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); - /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored. - */ - raw_spin_lock_irqsave(cpu_lock, flags); - while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { + for (; pos; pos = pos->rstat_flush_next) { struct cgroup_subsys_state *css; cgroup_base_stat_flush(pos, cpu); @@ -205,7 +245,6 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - raw_spin_unlock_irqrestore(cpu_lock, flags); /* play nice and yield if necessary */ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index e8db8d938661..4722b998a324 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -1,3 +1,5 @@ +# Help: Debugging for CI systems and finding regressions +# # The config is based on running daily CI for enterprise Linux distros to # seek regressions on linux-next builds on different bare-metal and virtual # platforms. It can be used for example, diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config new file mode 100644 index 000000000000..95a400f042b1 --- /dev/null +++ b/kernel/configs/hardening.config @@ -0,0 +1,98 @@ +# Help: Basic kernel hardening options +# +# These are considered the basic kernel hardening, self-protection, and +# attack surface reduction options. They are expected to have low (or +# no) performance impact on most workloads, and have a reasonable level +# of legacy API removals. + +# Make sure reporting of various hardening actions is possible. +CONFIG_BUG=y + +# Basic kernel memory permission enforcement. +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_VMAP_STACK=y + +# Kernel image and memory ASLR. +CONFIG_RANDOMIZE_BASE=y +CONFIG_RANDOMIZE_MEMORY=y + +# Randomize allocator freelists, harden metadata. +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SHUFFLE_PAGE_ALLOCATOR=y +CONFIG_RANDOM_KMALLOC_CACHES=y + +# Randomize kernel stack offset on syscall entry. +CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y + +# Basic stack frame overflow protection. +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y + +# Basic buffer length bounds checking. +CONFIG_HARDENED_USERCOPY=y +CONFIG_FORTIFY_SOURCE=y + +# Basic array index bounds checking. +CONFIG_UBSAN=y +CONFIG_UBSAN_TRAP=y +CONFIG_UBSAN_BOUNDS=y +# CONFIG_UBSAN_SHIFT is not set +# CONFIG_UBSAN_DIV_ZERO +# CONFIG_UBSAN_UNREACHABLE +# CONFIG_UBSAN_BOOL +# CONFIG_UBSAN_ENUM +# CONFIG_UBSAN_ALIGNMENT +CONFIG_UBSAN_SANITIZE_ALL=y + +# Linked list integrity checking. +CONFIG_LIST_HARDENED=y + +# Initialize all heap variables to zero on allocation. +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y + +# Initialize all stack variables to zero on function entry. +CONFIG_INIT_STACK_ALL_ZERO=y + +# Wipe RAM at reboot via EFI. For more details, see: +# https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/ +# https://bugzilla.redhat.com/show_bug.cgi?id=1532058 +CONFIG_RESET_ATTACK_MITIGATION=y + +# Disable DMA between EFI hand-off and the kernel's IOMMU setup. +CONFIG_EFI_DISABLE_PCI_DMA=y + +# Force IOMMU TLB invalidation so devices will never be able to access stale +# data content. +CONFIG_IOMMU_SUPPORT=y +CONFIG_IOMMU_DEFAULT_DMA_STRICT=y + +# Do not allow direct physical memory access to non-device memory. +CONFIG_STRICT_DEVMEM=y +CONFIG_IO_STRICT_DEVMEM=y + +# Provide userspace with seccomp BPF API for syscall attack surface reduction. +CONFIG_SECCOMP=y +CONFIG_SECCOMP_FILTER=y + +# Provides some protections against SYN flooding. +CONFIG_SYN_COOKIES=y + +# Attack surface reduction: do not autoload TTY line disciplines. +# CONFIG_LDISC_AUTOLOAD is not set + +# Dangerous; enabling this disables userspace brk ASLR. +# CONFIG_COMPAT_BRK is not set + +# Dangerous; exposes kernel text image layout. +# CONFIG_PROC_KCORE is not set + +# Dangerous; enabling this disables userspace VDSO ASLR. +# CONFIG_COMPAT_VDSO is not set + +# Attack surface reduction: Use the modern PTY interface (devpts) only. +# CONFIG_LEGACY_PTYS is not set + +# Attack surface reduction: Use only modesetting video drivers. +# CONFIG_DRM_LEGACY is not set diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config index 208481d91090..d0877063d925 100644 --- a/kernel/configs/kvm_guest.config +++ b/kernel/configs/kvm_guest.config @@ -1,3 +1,4 @@ +# Help: Bootable as a KVM guest CONFIG_NET=y CONFIG_NET_CORE=y CONFIG_NETDEVICES=y diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config index 81ff07863576..ebfdc3d8aa9a 100644 --- a/kernel/configs/nopm.config +++ b/kernel/configs/nopm.config @@ -1,3 +1,5 @@ +# Help: Disable Power Management + CONFIG_PM=n CONFIG_SUSPEND=n CONFIG_HIBERNATION=n diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config index 38a7c5362c9c..2c6e001a7284 100644 --- a/kernel/configs/rust.config +++ b/kernel/configs/rust.config @@ -1 +1,2 @@ +# Help: Enable Rust CONFIG_RUST=y diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config index 6fac5b405334..35f48671b8d5 100644 --- a/kernel/configs/x86_debug.config +++ b/kernel/configs/x86_debug.config @@ -1,3 +1,4 @@ +# Help: Debugging options for tip tree testing CONFIG_X86_DEBUG_FPU=y CONFIG_LOCK_STAT=y CONFIG_DEBUG_VM=y diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config index 436f806aa1ed..6878b9a49be8 100644 --- a/kernel/configs/xen.config +++ b/kernel/configs/xen.config @@ -1,3 +1,5 @@ +# Help: Bootable as a Xen guest +# # global stuff - these enable us to allow some # of the not so generic stuff below for xen CONFIG_PARAVIRT=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 6de7c6bb74ee..e6ec3ba4950b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -659,11 +659,19 @@ static inline bool cpu_smt_thread_allowed(unsigned int cpu) #endif } -static inline bool cpu_smt_allowed(unsigned int cpu) +static inline bool cpu_bootable(unsigned int cpu) { if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) return true; + /* All CPUs are bootable if controls are not configured */ + if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED) + return true; + + /* All CPUs are bootable if CPU is not SMT capable */ + if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + return true; + if (topology_is_primary_thread(cpu)) return true; @@ -685,7 +693,7 @@ bool cpu_smt_possible(void) EXPORT_SYMBOL_GPL(cpu_smt_possible); #else -static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } +static inline bool cpu_bootable(unsigned int cpu) { return true; } #endif static inline enum cpuhp_state @@ -788,10 +796,10 @@ static int bringup_wait_for_ap_online(unsigned int cpu) * SMT soft disabling on X86 requires to bring the CPU out of the * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The * CPU marked itself as booted_once in notify_cpu_starting() so the - * cpu_smt_allowed() check will now return false if this is not the + * cpu_bootable() check will now return false if this is not the * primary sibling. */ - if (!cpu_smt_allowed(cpu)) + if (!cpu_bootable(cpu)) return -ECANCELED; return 0; } @@ -1372,7 +1380,14 @@ static int takedown_cpu(unsigned int cpu) cpuhp_bp_sync_dead(cpu); tick_cleanup_dead_cpu(cpu); + + /* + * Callbacks must be re-integrated right away to the RCU state machine. + * Otherwise an RCU callback could block a further teardown function + * waiting for its completion. + */ rcutree_migrate_callbacks(cpu); + return 0; } @@ -1388,10 +1403,10 @@ void cpuhp_report_idle_dead(void) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); - rcu_report_dead(smp_processor_id()); + rcutree_report_cpu_dead(); st->state = CPUHP_AP_IDLE_DEAD; /* - * We cannot call complete after rcu_report_dead() so we delegate it + * We cannot call complete after rcutree_report_cpu_dead() so we delegate it * to an online cpu. */ smp_call_function_single(cpumask_first(cpu_online_mask), @@ -1515,11 +1530,14 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) /* * Ensure that the control task does not run on the to be offlined * CPU to prevent a deadlock against cfs_b->period_timer. + * Also keep at least one housekeeping cpu onlined to avoid generating + * an empty sched_domain span. */ - cpu = cpumask_any_but(cpu_online_mask, cpu); - if (cpu >= nr_cpu_ids) - return -EBUSY; - return work_on_cpu(cpu, __cpu_down_maps_locked, &work); + for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) { + if (cpu != work.cpu) + return work_on_cpu(cpu, __cpu_down_maps_locked, &work); + } + return -EBUSY; } static int cpu_down(unsigned int cpu, enum cpuhp_state target) @@ -1617,7 +1635,7 @@ void notify_cpu_starting(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ + rcutree_report_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ cpumask_set_cpu(cpu, &cpus_booted_once_mask); /* @@ -1725,9 +1743,6 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target) if (!cpu_possible(cpu)) { pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", cpu); -#if defined(CONFIG_IA64) - pr_err("please check additional_cpus= boot parameter\n"); -#endif return -EINVAL; } @@ -1741,7 +1756,7 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target) err = -EBUSY; goto out; } - if (!cpu_smt_allowed(cpu)) { + if (!cpu_bootable(cpu)) { err = -EPERM; goto out; } @@ -2098,7 +2113,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_HRTIMERS_PREPARE] = { .name = "hrtimers:prepare", .startup.single = hrtimers_prepare_cpu, - .teardown.single = hrtimers_dead_cpu, + .teardown.single = NULL, }, [CPUHP_SMPCFD_PREPARE] = { .name = "smpcfd:prepare", @@ -2110,11 +2125,6 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = relay_prepare_cpu, .teardown.single = NULL, }, - [CPUHP_SLAB_PREPARE] = { - .name = "slab:prepare", - .startup.single = slab_prepare_cpu, - .teardown.single = slab_dead_cpu, - }, [CPUHP_RCUTREE_PREP] = { .name = "RCU/tree:prepare", .startup.single = rcutree_prepare_cpu, @@ -2190,6 +2200,12 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = NULL, .teardown.single = smpcfd_dying_cpu, }, + [CPUHP_AP_HRTIMERS_DYING] = { + .name = "hrtimers:dying", + .startup.single = NULL, + .teardown.single = hrtimers_cpu_dying, + }, + /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 03a7932cde0a..75cd6a736d03 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -5,7 +5,6 @@ */ #include <linux/buildid.h> -#include <linux/crash_core.h> #include <linux/init.h> #include <linux/utsname.h> #include <linux/vmalloc.h> @@ -13,6 +12,8 @@ #include <linux/kexec.h> #include <linux/memory.h> #include <linux/cpuhotplug.h> +#include <linux/memblock.h> +#include <linux/kmemleak.h> #include <asm/page.h> #include <asm/sections.h> @@ -33,6 +34,22 @@ u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; + /* * parsing the "crashkernel" commandline * @@ -181,7 +198,7 @@ static __initdata char *suffix_tbl[] = { * It returns 0 on success and -EINVAL on failure. */ static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, + unsigned long long *crash_size, const char *suffix) { char *cur = cmdline; @@ -248,11 +265,11 @@ static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, - const char *name, const char *suffix) { - char *first_colon, *first_space; - char *ck_cmdline; + char *first_colon, *first_space; + char *ck_cmdline; + char *name = "crashkernel="; BUG_ON(!crash_size || !crash_base); *crash_size = 0; @@ -283,32 +300,53 @@ static int __init __parse_crashkernel(char *cmdline, /* * That function is the entry point for command line parsing and should be * called from the arch-specific code. + * + * If crashkernel=,high|low is supported on architecture, non-NULL values + * should be passed to parameters 'low_size' and 'high'. */ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, - unsigned long long *crash_base) + unsigned long long *crash_base, + unsigned long long *low_size, + bool *high) { - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); -} + int ret; -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); -} + /* crashkernel=X[@offset] */ + ret = __parse_crashkernel(cmdline, system_ram, crash_size, + crash_base, NULL); +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + /* + * If non-NULL 'high' passed in and no normal crashkernel + * setting detected, try parsing crashkernel=,high|low. + */ + if (high && ret == -ENOENT) { + ret = __parse_crashkernel(cmdline, 0, crash_size, + crash_base, suffix_tbl[SUFFIX_HIGH]); + if (ret || !*crash_size) + return -EINVAL; -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); + /* + * crashkernel=Y,low can be specified or not, but invalid value + * is not allowed. + */ + ret = __parse_crashkernel(cmdline, 0, low_size, + crash_base, suffix_tbl[SUFFIX_LOW]); + if (ret == -ENOENT) { + *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + ret = 0; + } else if (ret) { + return ret; + } + + *high = true; + } +#endif + if (!*crash_size) + ret = -EINVAL; + + return ret; } /* @@ -321,6 +359,119 @@ static int __init parse_crashkernel_dummy(char *arg) } early_param("crashkernel", parse_crashkernel_dummy); +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +static int __init reserve_crashkernel_low(unsigned long long low_size) +{ +#ifdef CONFIG_64BIT + unsigned long long low_base; + + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); + if (!low_base) { + pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); + return -ENOMEM; + } + + pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", + low_base, low_base + low_size, low_size >> 20); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; +#endif + return 0; +} + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{ + unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; + bool fixed_base = false; + + /* User specifies base address explicitly. */ + if (crash_base) { + fixed_base = true; + search_base = crash_base; + search_end = crash_base + crash_size; + } else if (high) { + search_base = CRASH_ADDR_LOW_MAX; + search_end = CRASH_ADDR_HIGH_MAX; + } + +retry: + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + search_base, search_end); + if (!crash_base) { + /* + * For crashkernel=size[KMG]@offset[KMG], print out failure + * message if can't reserve the specified region. + */ + if (fixed_base) { + pr_warn("crashkernel reservation failed - memory is in use.\n"); + return; + } + + /* + * For crashkernel=size[KMG], if the first attempt was for + * low memory, fall back to high memory, the minimum required + * low memory will be reserved later. + */ + if (!high && search_end == CRASH_ADDR_LOW_MAX) { + search_end = CRASH_ADDR_HIGH_MAX; + search_base = CRASH_ADDR_LOW_MAX; + crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + goto retry; + } + + /* + * For crashkernel=size[KMG],high, if the first attempt was + * for high memory, fall back to low memory. + */ + if (high && search_end == CRASH_ADDR_HIGH_MAX) { + search_end = CRASH_ADDR_LOW_MAX; + search_base = 0; + goto retry; + } + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; + } + + if ((crash_base >= CRASH_ADDR_LOW_MAX) && + crash_low_size && reserve_crashkernel_low(crash_low_size)) { + memblock_phys_free(crash_base, crash_size); + return; + } + + pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + /* + * The crashkernel memory will be removed from the kernel linear + * map. Inform kmemleak so that it won't try to access it. + */ + kmemleak_ignore_phys(crash_base); + if (crashk_low_res.end) + kmemleak_ignore_phys(crashk_low_res.start); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} + +static __init int insert_crashkernel_resources(void) +{ + if (crashk_res.start < crashk_res.end) + insert_resource(&iomem_resource, &crashk_res); + + if (crashk_low_res.start < crashk_low_res.end) + insert_resource(&iomem_resource, &crashk_low_res); + + return 0; +} +early_initcall(insert_crashkernel_resources); +#endif + int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { @@ -409,9 +560,11 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; phdr->p_align = 0; ehdr->e_phnum++; - pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", - phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, - ehdr->e_phnum, phdr->p_offset); +#ifdef CONFIG_KEXEC_FILE + kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); +#endif phdr++; } @@ -423,9 +576,8 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { - int i, j; + int i; unsigned long long start, end, p_start, p_end; - struct range temp_range = {0, 0}; for (i = 0; i < mem->nr_ranges; i++) { start = mem->ranges[i].start; @@ -433,72 +585,51 @@ int crash_exclude_mem_range(struct crash_mem *mem, p_start = mstart; p_end = mend; - if (mstart > end || mend < start) + if (p_start > end) continue; + /* + * Because the memory ranges in mem->ranges are stored in + * ascending order, when we detect `p_end < start`, we can + * immediately exit the for loop, as the subsequent memory + * ranges will definitely be outside the range we are looking + * for. + */ + if (p_end < start) + break; + /* Truncate any area outside of range */ - if (mstart < start) + if (p_start < start) p_start = start; - if (mend > end) + if (p_end > end) p_end = end; /* Found completely overlapping range */ if (p_start == start && p_end == end) { - mem->ranges[i].start = 0; - mem->ranges[i].end = 0; - if (i < mem->nr_ranges - 1) { - /* Shift rest of the ranges to left */ - for (j = i; j < mem->nr_ranges - 1; j++) { - mem->ranges[j].start = - mem->ranges[j+1].start; - mem->ranges[j].end = - mem->ranges[j+1].end; - } - - /* - * Continue to check if there are another overlapping ranges - * from the current position because of shifting the above - * mem ranges. - */ - i--; - mem->nr_ranges--; - continue; - } + memmove(&mem->ranges[i], &mem->ranges[i + 1], + (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i])); + i--; mem->nr_ranges--; - return 0; - } - - if (p_start > start && p_end < end) { + } else if (p_start > start && p_end < end) { /* Split original range */ + if (mem->nr_ranges >= mem->max_nr_ranges) + return -ENOMEM; + + memmove(&mem->ranges[i + 2], &mem->ranges[i + 1], + (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i])); + mem->ranges[i].end = p_start - 1; - temp_range.start = p_end + 1; - temp_range.end = end; + mem->ranges[i + 1].start = p_end + 1; + mem->ranges[i + 1].end = end; + + i++; + mem->nr_ranges++; } else if (p_start != start) mem->ranges[i].end = p_start - 1; else mem->ranges[i].start = p_end + 1; - break; } - /* If a split happened, add the split to array */ - if (!temp_range.end) - return 0; - - /* Split happened */ - if (i == mem->max_nr_ranges - 1) - return -ENOMEM; - - /* Location where new range should go */ - j = i + 1; - if (j < mem->nr_ranges) { - /* Move over all ranges one slot towards the end */ - for (i = mem->nr_ranges - 1; i >= j; i--) - mem->ranges[i + 1] = mem->ranges[i]; - } - - mem->ranges[j].start = temp_range.start; - mem->ranges[j].end = temp_range.end; - mem->nr_ranges++; return 0; } @@ -660,7 +791,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_OFFSET(vmap_area, va_start); VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1); + VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); @@ -740,6 +871,17 @@ subsys_initcall(crash_notes_memory_init); #define pr_fmt(fmt) "crash hp: " fmt /* + * Different than kexec/kdump loading/unloading/jumping/shrinking which + * usually rarely happen, there will be many crash hotplug events notified + * during one short period, e.g one memory board is hot added and memory + * regions are online. So mutex lock __crash_hotplug_lock is used to + * serialize the crash hotplug handling specifically. + */ +static DEFINE_MUTEX(__crash_hotplug_lock); +#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock) +#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock) + +/* * This routine utilized when the crash_hotplug sysfs node is read. * It reflects the kernel's ability/permission to update the crash * elfcorehdr directly. @@ -748,9 +890,11 @@ int crash_check_update_elfcorehdr(void) { int rc = 0; + crash_hotplug_lock(); /* Obtain lock while reading crash information */ if (!kexec_trylock()) { pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n"); + crash_hotplug_unlock(); return 0; } if (kexec_crash_image) { @@ -761,6 +905,7 @@ int crash_check_update_elfcorehdr(void) } /* Release lock now that update complete */ kexec_unlock(); + crash_hotplug_unlock(); return rc; } @@ -783,9 +928,11 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) { struct kimage *image; + crash_hotplug_lock(); /* Obtain lock while changing crash information */ if (!kexec_trylock()) { pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n"); + crash_hotplug_unlock(); return; } @@ -852,6 +999,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) out: /* Release lock now that update complete */ kexec_unlock(); + crash_hotplug_unlock(); } static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v) diff --git a/kernel/cred.c b/kernel/cred.c index 98cb4eca23fb..c033a201c808 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -36,17 +36,13 @@ do { \ static struct kmem_cache *cred_jar; /* init to 2 - one for init_task, one to ensure it is never freed */ -static struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; +static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; /* * The initial credentials for the initial task */ struct cred init_cred = { .usage = ATOMIC_INIT(4), -#ifdef CONFIG_DEBUG_CREDENTIALS - .subscribers = ATOMIC_INIT(2), - .magic = CRED_MAGIC, -#endif .uid = GLOBAL_ROOT_UID, .gid = GLOBAL_ROOT_GID, .suid = GLOBAL_ROOT_UID, @@ -66,31 +62,6 @@ struct cred init_cred = { .ucounts = &init_ucounts, }; -static inline void set_cred_subscribers(struct cred *cred, int n) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - atomic_set(&cred->subscribers, n); -#endif -} - -static inline int read_cred_subscribers(const struct cred *cred) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - return atomic_read(&cred->subscribers); -#else - return 0; -#endif -} - -static inline void alter_cred_subscribers(const struct cred *_cred, int n) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - struct cred *cred = (struct cred *) _cred; - - atomic_add(n, &cred->subscribers); -#endif -} - /* * The RCU callback to actually dispose of a set of credentials */ @@ -100,20 +71,9 @@ static void put_cred_rcu(struct rcu_head *rcu) kdebug("put_cred_rcu(%p)", cred); -#ifdef CONFIG_DEBUG_CREDENTIALS - if (cred->magic != CRED_MAGIC_DEAD || - atomic_read(&cred->usage) != 0 || - read_cred_subscribers(cred) != 0) - panic("CRED: put_cred_rcu() sees %p with" - " mag %x, put %p, usage %d, subscr %d\n", - cred, cred->magic, cred->put_addr, - atomic_read(&cred->usage), - read_cred_subscribers(cred)); -#else - if (atomic_read(&cred->usage) != 0) - panic("CRED: put_cred_rcu() sees %p with usage %d\n", - cred, atomic_read(&cred->usage)); -#endif + if (atomic_long_read(&cred->usage) != 0) + panic("CRED: put_cred_rcu() sees %p with usage %ld\n", + cred, atomic_long_read(&cred->usage)); security_cred_free(cred); key_put(cred->session_keyring); @@ -137,16 +97,10 @@ static void put_cred_rcu(struct rcu_head *rcu) */ void __put_cred(struct cred *cred) { - kdebug("__put_cred(%p{%d,%d})", cred, - atomic_read(&cred->usage), - read_cred_subscribers(cred)); - - BUG_ON(atomic_read(&cred->usage) != 0); -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(cred) != 0); - cred->magic = CRED_MAGIC_DEAD; - cred->put_addr = __builtin_return_address(0); -#endif + kdebug("__put_cred(%p{%ld})", cred, + atomic_long_read(&cred->usage)); + + BUG_ON(atomic_long_read(&cred->usage) != 0); BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); @@ -162,23 +116,23 @@ EXPORT_SYMBOL(__put_cred); */ void exit_creds(struct task_struct *tsk) { - struct cred *cred; + struct cred *real_cred, *cred; - kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred, - atomic_read(&tsk->cred->usage), - read_cred_subscribers(tsk->cred)); + kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred, + atomic_long_read(&tsk->cred->usage)); - cred = (struct cred *) tsk->real_cred; + real_cred = (struct cred *) tsk->real_cred; tsk->real_cred = NULL; - validate_creds(cred); - alter_cred_subscribers(cred, -1); - put_cred(cred); cred = (struct cred *) tsk->cred; tsk->cred = NULL; - validate_creds(cred); - alter_cred_subscribers(cred, -1); - put_cred(cred); + + if (real_cred == cred) { + put_cred_many(cred, 2); + } else { + put_cred(real_cred); + put_cred(cred); + } #ifdef CONFIG_KEYS_REQUEST_CACHE key_put(tsk->cached_requested_key); @@ -224,10 +178,7 @@ struct cred *cred_alloc_blank(void) if (!new) return NULL; - atomic_set(&new->usage, 1); -#ifdef CONFIG_DEBUG_CREDENTIALS - new->magic = CRED_MAGIC; -#endif + atomic_long_set(&new->usage, 1); if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; @@ -258,8 +209,6 @@ struct cred *prepare_creds(void) const struct cred *old; struct cred *new; - validate_process_creds(); - new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; @@ -270,8 +219,7 @@ struct cred *prepare_creds(void) memcpy(new, old, sizeof(struct cred)); new->non_rcu = 0; - atomic_set(&new->usage, 1); - set_cred_subscribers(new, 0); + atomic_long_set(&new->usage, 1); get_group_info(new->group_info); get_uid(new->user); get_user_ns(new->user_ns); @@ -294,7 +242,6 @@ struct cred *prepare_creds(void) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; - validate_creds(new); return new; error: @@ -355,12 +302,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) #endif clone_flags & CLONE_THREAD ) { - p->real_cred = get_cred(p->cred); - get_cred(p->cred); - alter_cred_subscribers(p->cred, 2); - kdebug("share_creds(%p{%d,%d})", - p->cred, atomic_read(&p->cred->usage), - read_cred_subscribers(p->cred)); + p->real_cred = get_cred_many(p->cred, 2); + kdebug("share_creds(%p{%ld})", + p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; } @@ -399,8 +343,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); - alter_cred_subscribers(new, 2); - validate_creds(new); return 0; error_put: @@ -452,17 +394,11 @@ int commit_creds(struct cred *new) struct task_struct *task = current; const struct cred *old = task->real_cred; - kdebug("commit_creds(%p{%d,%d})", new, - atomic_read(&new->usage), - read_cred_subscribers(new)); + kdebug("commit_creds(%p{%ld})", new, + atomic_long_read(&new->usage)); BUG_ON(task->cred != old); -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(old) < 2); - validate_creds(old); - validate_creds(new); -#endif - BUG_ON(atomic_read(&new->usage) < 1); + BUG_ON(atomic_long_read(&new->usage) < 1); get_cred(new); /* we will require a ref for the subj creds too */ @@ -497,14 +433,12 @@ int commit_creds(struct cred *new) * RLIMIT_NPROC limits on user->processes have already been checked * in set_user(). */ - alter_cred_subscribers(new, 2); if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); - alter_cred_subscribers(old, -2); /* send notifications */ if (!uid_eq(new->uid, old->uid) || @@ -520,8 +454,7 @@ int commit_creds(struct cred *new) proc_id_connector(task, PROC_EVENT_GID); /* release the old obj and subj refs both */ - put_cred(old); - put_cred(old); + put_cred_many(old, 2); return 0; } EXPORT_SYMBOL(commit_creds); @@ -535,14 +468,10 @@ EXPORT_SYMBOL(commit_creds); */ void abort_creds(struct cred *new) { - kdebug("abort_creds(%p{%d,%d})", new, - atomic_read(&new->usage), - read_cred_subscribers(new)); + kdebug("abort_creds(%p{%ld})", new, + atomic_long_read(&new->usage)); -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(new) != 0); -#endif - BUG_ON(atomic_read(&new->usage) < 1); + BUG_ON(atomic_long_read(&new->usage) < 1); put_cred(new); } EXPORT_SYMBOL(abort_creds); @@ -558,12 +487,8 @@ const struct cred *override_creds(const struct cred *new) { const struct cred *old = current->cred; - kdebug("override_creds(%p{%d,%d})", new, - atomic_read(&new->usage), - read_cred_subscribers(new)); - - validate_creds(old); - validate_creds(new); + kdebug("override_creds(%p{%ld})", new, + atomic_long_read(&new->usage)); /* * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. @@ -572,18 +497,12 @@ const struct cred *override_creds(const struct cred *new) * we are only installing the cred into the thread-synchronous * '->cred' pointer, not the '->real_cred' pointer that is * visible to other threads under RCU. - * - * Also note that we did validate_creds() manually, not depending - * on the validation in 'get_cred()'. */ get_new_cred((struct cred *)new); - alter_cred_subscribers(new, 1); rcu_assign_pointer(current->cred, new); - alter_cred_subscribers(old, -1); - kdebug("override_creds() = %p{%d,%d}", old, - atomic_read(&old->usage), - read_cred_subscribers(old)); + kdebug("override_creds() = %p{%ld}", old, + atomic_long_read(&old->usage)); return old; } EXPORT_SYMBOL(override_creds); @@ -599,15 +518,10 @@ void revert_creds(const struct cred *old) { const struct cred *override = current->cred; - kdebug("revert_creds(%p{%d,%d})", old, - atomic_read(&old->usage), - read_cred_subscribers(old)); + kdebug("revert_creds(%p{%ld})", old, + atomic_long_read(&old->usage)); - validate_creds(old); - validate_creds(override); - alter_cred_subscribers(old, 1); rcu_assign_pointer(current->cred, old); - alter_cred_subscribers(override, -1); put_cred(override); } EXPORT_SYMBOL(revert_creds); @@ -727,12 +641,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) kdebug("prepare_kernel_cred() alloc %p", new); old = get_task_cred(daemon); - validate_creds(old); *new = *old; new->non_rcu = 0; - atomic_set(&new->usage, 1); - set_cred_subscribers(new, 0); + atomic_long_set(&new->usage, 1); get_uid(new->user); get_user_ns(new->user_ns); get_group_info(new->group_info); @@ -756,7 +668,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) goto error; put_cred(old); - validate_creds(new); return new; error: @@ -821,109 +732,3 @@ int set_create_files_as(struct cred *new, struct inode *inode) return security_kernel_create_files_as(new, inode); } EXPORT_SYMBOL(set_create_files_as); - -#ifdef CONFIG_DEBUG_CREDENTIALS - -bool creds_are_invalid(const struct cred *cred) -{ - if (cred->magic != CRED_MAGIC) - return true; - return false; -} -EXPORT_SYMBOL(creds_are_invalid); - -/* - * dump invalid credentials - */ -static void dump_invalid_creds(const struct cred *cred, const char *label, - const struct task_struct *tsk) -{ - pr_err("%s credentials: %p %s%s%s\n", - label, cred, - cred == &init_cred ? "[init]" : "", - cred == tsk->real_cred ? "[real]" : "", - cred == tsk->cred ? "[eff]" : ""); - pr_err("->magic=%x, put_addr=%p\n", - cred->magic, cred->put_addr); - pr_err("->usage=%d, subscr=%d\n", - atomic_read(&cred->usage), - read_cred_subscribers(cred)); - pr_err("->*uid = { %d,%d,%d,%d }\n", - from_kuid_munged(&init_user_ns, cred->uid), - from_kuid_munged(&init_user_ns, cred->euid), - from_kuid_munged(&init_user_ns, cred->suid), - from_kuid_munged(&init_user_ns, cred->fsuid)); - pr_err("->*gid = { %d,%d,%d,%d }\n", - from_kgid_munged(&init_user_ns, cred->gid), - from_kgid_munged(&init_user_ns, cred->egid), - from_kgid_munged(&init_user_ns, cred->sgid), - from_kgid_munged(&init_user_ns, cred->fsgid)); -#ifdef CONFIG_SECURITY - pr_err("->security is %p\n", cred->security); - if ((unsigned long) cred->security >= PAGE_SIZE && - (((unsigned long) cred->security & 0xffffff00) != - (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))) - pr_err("->security {%x, %x}\n", - ((u32*)cred->security)[0], - ((u32*)cred->security)[1]); -#endif -} - -/* - * report use of invalid credentials - */ -void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line) -{ - pr_err("Invalid credentials\n"); - pr_err("At %s:%u\n", file, line); - dump_invalid_creds(cred, "Specified", current); - BUG(); -} -EXPORT_SYMBOL(__invalid_creds); - -/* - * check the credentials on a process - */ -void __validate_process_creds(struct task_struct *tsk, - const char *file, unsigned line) -{ - if (tsk->cred == tsk->real_cred) { - if (unlikely(read_cred_subscribers(tsk->cred) < 2 || - creds_are_invalid(tsk->cred))) - goto invalid_creds; - } else { - if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 || - read_cred_subscribers(tsk->cred) < 1 || - creds_are_invalid(tsk->real_cred) || - creds_are_invalid(tsk->cred))) - goto invalid_creds; - } - return; - -invalid_creds: - pr_err("Invalid process credentials\n"); - pr_err("At %s:%u\n", file, line); - - dump_invalid_creds(tsk->real_cred, "Real", tsk); - if (tsk->cred != tsk->real_cred) - dump_invalid_creds(tsk->cred, "Effective", tsk); - else - pr_err("Effective creds == Real creds\n"); - BUG(); -} -EXPORT_SYMBOL(__validate_process_creds); - -/* - * check creds for do_exit() - */ -void validate_creds_for_do_exit(struct task_struct *tsk) -{ - kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})", - tsk->real_cred, tsk->cred, - atomic_read(&tsk->cred->usage), - read_cred_subscribers(tsk->cred)); - - __validate_process_creds(tsk, __FILE__, __LINE__); -} - -#endif /* CONFIG_DEBUG_CREDENTIALS */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 621037a0aa87..ce1bb2301c06 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1006,6 +1006,9 @@ void kgdb_panic(const char *msg) if (panic_timeout) return; + debug_locks_off(); + console_flush_on_panic(CONSOLE_FLUSH_PENDING); + if (dbg_kdb_mode) kdb_printf("PANIC: %s\n", msg); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 813cb6cf72d6..9443bc63c5a2 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -590,6 +590,8 @@ static void kdb_msg_write(const char *msg, int msg_len) continue; if (c == dbg_io_ops->cons) continue; + if (!c->write) + continue; /* * Set oops_in_progress to encourage the console drivers to * disregard their internal spin locks: in the current calling diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 438b868cbfa9..d05066cb40b2 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -272,11 +272,10 @@ char *kdbgetenv(const char *match) * kdballocenv - This function is used to allocate bytes for * environment entries. * Parameters: - * match A character string representing a numeric value - * Outputs: - * *value the unsigned long representation of the env variable 'match' + * bytes The number of bytes to allocate in the static buffer. * Returns: - * Zero on success, a kdb diagnostic on failure. + * A pointer to the allocated space in the buffer on success. + * NULL if bytes > size available in the envbuffer. * Remarks: * We use a static environment buffer (envbuffer) to hold the values * of dynamically generated environment variables (see kdb_set). Buffer @@ -1349,8 +1348,6 @@ do_full_getstr: /* PROMPT can only be set if we have MEM_READ permission. */ snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), raw_smp_processor_id()); - if (defcmd_in_progress) - strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN); /* * Fetch command from keyboard diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 4c1e9a3c0ab6..d62f5957f36b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -135,6 +135,8 @@ config DMA_COHERENT_POOL config DMA_GLOBAL_POOL select DMA_DECLARE_COHERENT + depends on !ARCH_HAS_DMA_SET_UNCACHED + depends on !DMA_DIRECT_REMAP bool config DMA_DIRECT_REMAP @@ -142,6 +144,15 @@ config DMA_DIRECT_REMAP select DMA_COHERENT_POOL select DMA_NONCOHERENT_MMAP +# +# Fallback to arch code for DMA allocations. This should eventually go away. +# +config ARCH_HAS_DMA_ALLOC + depends on !ARCH_HAS_DMA_SET_UNCACHED + depends on !DMA_DIRECT_REMAP + depends on !DMA_GLOBAL_POOL + bool + config DMA_CMA bool "DMA Contiguous Memory Allocator" depends on HAVE_DMA_CONTIGUOUS && CMA @@ -160,7 +171,7 @@ if DMA_CMA config DMA_NUMA_CMA bool "Enable separate DMA Contiguous Memory Area for NUMA Node" - default NUMA + depends on NUMA help Enable this option to get numa CMA areas so that NUMA devices can get local memory by DMA coherent APIs. diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index c21abc77c53e..ff5683a57f77 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -132,8 +132,10 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, void dma_release_coherent_memory(struct device *dev) { - if (dev) + if (dev) { _dma_release_coherent_memory(dev->dma_mem); + dev->dma_mem = NULL; + } } static void *__dma_alloc_from_coherent(struct device *dev, diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 88c595e49e34..f005c66f378c 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -473,11 +473,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) return -EBUSY; } - if (memblock_is_region_reserved(rmem->base, rmem->size)) { - pr_info("Reserved memory: overlap with other memblock reserved region\n"); - return -EBUSY; - } - if (!of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index f190651bcadd..a6e3792b15f8 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -62,7 +62,8 @@ enum map_err_types { * @pfn: page frame of the start address * @offset: offset of mapping relative to pfn * @map_err_type: track whether dma_mapping_error() was checked - * @stacktrace: support backtraces when a violation is detected + * @stack_len: number of backtrace entries in @stack_entries + * @stack_entries: stack of backtrace history */ struct dma_debug_entry { struct list_head list; @@ -139,7 +140,7 @@ static const char *const maperr2str[] = { static const char *type2name[] = { [dma_debug_single] = "single", - [dma_debug_sg] = "scather-gather", + [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", [dma_debug_resource] = "resource", }; @@ -637,15 +638,19 @@ static struct dma_debug_entry *__dma_entry_alloc(void) return entry; } -static void __dma_entry_alloc_check_leak(void) +/* + * This should be called outside of free_entries_lock scope to avoid potential + * deadlocks with serial consoles that use DMA. + */ +static void __dma_entry_alloc_check_leak(u32 nr_entries) { - u32 tmp = nr_total_entries % nr_prealloc_entries; + u32 tmp = nr_entries % nr_prealloc_entries; /* Shout each time we tick over some multiple of the initial pool */ if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) { pr_info("dma_debug_entry pool grown to %u (%u00%%)\n", - nr_total_entries, - (nr_total_entries / nr_prealloc_entries)); + nr_entries, + (nr_entries / nr_prealloc_entries)); } } @@ -656,8 +661,10 @@ static void __dma_entry_alloc_check_leak(void) */ static struct dma_debug_entry *dma_entry_alloc(void) { + bool alloc_check_leak = false; struct dma_debug_entry *entry; unsigned long flags; + u32 nr_entries; spin_lock_irqsave(&free_entries_lock, flags); if (num_free_entries == 0) { @@ -667,13 +674,17 @@ static struct dma_debug_entry *dma_entry_alloc(void) pr_err("debugging out of memory - disabling\n"); return NULL; } - __dma_entry_alloc_check_leak(); + alloc_check_leak = true; + nr_entries = nr_total_entries; } entry = __dma_entry_alloc(); spin_unlock_irqrestore(&free_entries_lock, flags); + if (alloc_check_leak) + __dma_entry_alloc_check_leak(nr_entries); + #ifdef CONFIG_STACKTRACE entry->stack_len = stack_trace_save(entry->stack_entries, ARRAY_SIZE(entry->stack_entries), @@ -866,7 +877,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti return 0; } -void dma_debug_add_bus(struct bus_type *bus) +void dma_debug_add_bus(const struct bus_type *bus) { struct notifier_block *nb; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 9596ae1aa0da..98b2e192fd69 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -220,13 +220,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp); if (!dev_is_dma_coherent(dev)) { - /* - * Fallback to the arch handler if it exists. This should - * eventually go away. - */ - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) && !is_swiotlb_for_alloc(dev)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); @@ -240,27 +234,24 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_handle); /* - * Otherwise remap if the architecture is asking for it. But - * given that remapping memory is a blocking operation we'll - * instead have to dip into the atomic pools. + * Otherwise we require the architecture to either be able to + * mark arbitrary parts of the kernel direct mapping uncached, + * or remapped it uncached. */ + set_uncached = IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED); remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP); - if (remap) { - if (dma_direct_use_pool(dev, gfp)) - return dma_direct_alloc_from_pool(dev, size, - dma_handle, gfp); - } else { - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED)) - return NULL; - set_uncached = true; + if (!set_uncached && !remap) { + pr_warn_once("coherent DMA allocations not supported on this platform.\n"); + return NULL; } } /* - * Decrypting memory may block, so allocate the memory from the atomic - * pools if we can't block. + * Remapping or decrypting memory may block, allocate the memory from + * the atomic pools instead if we aren't allowed block. */ - if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) + if ((remap || force_dma_unencrypted(dev)) && + dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ @@ -330,9 +321,7 @@ void dma_direct_free(struct device *dev, size_t size, return; } - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) && !dev_is_dma_coherent(dev) && !is_swiotlb_for_alloc(dev)) { arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); @@ -598,6 +587,46 @@ int dma_direct_supported(struct device *dev, u64 mask) return mask >= phys_to_dma_unencrypted(dev, min_mask); } +/* + * To check whether all ram resource ranges are covered by dma range map + * Returns 0 when further check is needed + * Returns 1 if there is some RAM range can't be covered by dma_range_map + */ +static int check_ram_in_range_map(unsigned long start_pfn, + unsigned long nr_pages, void *data) +{ + unsigned long end_pfn = start_pfn + nr_pages; + const struct bus_dma_region *bdr = NULL; + const struct bus_dma_region *m; + struct device *dev = data; + + while (start_pfn < end_pfn) { + for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) { + unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start); + + if (start_pfn >= cpu_start_pfn && + start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) { + bdr = m; + break; + } + } + if (!bdr) + return 1; + + start_pfn = PFN_DOWN(bdr->cpu_start) + PFN_DOWN(bdr->size); + } + + return 0; +} + +bool dma_direct_all_ram_mapped(struct device *dev) +{ + if (!dev->dma_range_map) + return true; + return !walk_system_ram_range(0, PFN_DOWN(ULONG_MAX) + 1, dev, + check_ram_in_range_map); +} + size_t dma_direct_max_mapping_size(struct device *dev) { /* If SWIOTLB is active, use its maximum mapping size */ @@ -648,7 +677,6 @@ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, return -ENOMEM; map[0].cpu_start = cpu_start; map[0].dma_start = dma_start; - map[0].offset = offset; map[0].size = size; dev->dma_range_map = map; return 0; diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 97ec892ea0b5..18d346118fe8 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -20,6 +20,7 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs); +bool dma_direct_all_ram_mapped(struct device *dev); size_t dma_direct_max_mapping_size(struct device *dev); #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index e323ca48f7f2..58db8fd70471 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -793,6 +793,28 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_coherent_mask); +/** + * dma_addressing_limited - return if the device is addressing limited + * @dev: device to check + * + * Return %true if the devices DMA mask is too small to address all memory in + * the system, else %false. Lack of addressing bits is the prime reason for + * bounce buffering, but might not be the only one. + */ +bool dma_addressing_limited(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < + dma_get_required_mask(dev)) + return true; + + if (unlikely(ops)) + return false; + return !dma_direct_all_ram_mapped(dev); +} +EXPORT_SYMBOL_GPL(dma_addressing_limited); + size_t dma_max_mapping_size(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 1acec2e22827..d10613eb0f63 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, void *addr; int ret = -ENOMEM; - /* Cannot allocate larger than MAX_ORDER */ - order = min(get_order(pool_size), MAX_ORDER); + /* Cannot allocate larger than MAX_PAGE_ORDER */ + order = min(get_order(pool_size), MAX_PAGE_ORDER); do { pool_size = 1 << (PAGE_SHIFT + order); @@ -135,9 +135,9 @@ encrypt_mapping: remove_mapping: #ifdef CONFIG_DMA_DIRECT_REMAP dma_common_free_remap(addr, pool_size); -#endif -free_page: __maybe_unused +free_page: __free_pages(page, order); +#endif out: return ret; } @@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void) /* * If coherent_pool was not used on the command line, default the pool - * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER. + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_PAGE_ORDER. */ if (!atomic_pool_size) { unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 394494a6b1f3..b079a9a8e087 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -283,7 +283,8 @@ static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start, } for (i = 0; i < mem->nslabs; i++) { - mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->slots[i].list = min(IO_TLB_SEGSIZE - io_tlb_offset(i), + mem->nslabs - i); mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; } @@ -399,14 +400,13 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, } mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area), - default_nareas), SMP_CACHE_BYTES); + nareas), SMP_CACHE_BYTES); if (!mem->areas) { pr_warn("%s: Failed to allocate mem->areas.\n", __func__); return; } - swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, - default_nareas); + swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas); add_mem_pool(&io_tlb_default_mem, mem); if (flags & SWIOTLB_VERBOSE) @@ -559,29 +559,40 @@ void __init swiotlb_exit(void) * alloc_dma_pages() - allocate pages to be used for DMA * @gfp: GFP flags for the allocation. * @bytes: Size of the buffer. + * @phys_limit: Maximum allowed physical address of the buffer. * * Allocate pages from the buddy allocator. If successful, make the allocated * pages decrypted that they can be used for DMA. * - * Return: Decrypted pages, or %NULL on failure. + * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN) + * if the allocated physical address was above @phys_limit. */ -static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes) +static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit) { unsigned int order = get_order(bytes); struct page *page; + phys_addr_t paddr; void *vaddr; page = alloc_pages(gfp, order); if (!page) return NULL; - vaddr = page_address(page); + paddr = page_to_phys(page); + if (paddr + bytes - 1 > phys_limit) { + __free_pages(page, order); + return ERR_PTR(-EAGAIN); + } + + vaddr = phys_to_virt(paddr); if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes))) goto error; return page; error: - __free_pages(page, order); + /* Intentional leak if pages cannot be encrypted again. */ + if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes))) + __free_pages(page, order); return NULL; } @@ -619,11 +630,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, else if (phys_limit <= DMA_BIT_MASK(32)) gfp |= __GFP_DMA32; - while ((page = alloc_dma_pages(gfp, bytes)) && - page_to_phys(page) + bytes - 1 > phys_limit) { - /* allocated, but too high */ - __free_pages(page, get_order(bytes)); - + while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && !(gfp & (__GFP_DMA32 | __GFP_DMA))) @@ -679,6 +686,11 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev, size_t pool_size; size_t tlb_size; + if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) { + nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER; + nareas = limit_nareas(nareas, nslabs); + } + pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas); pool = kzalloc(pool_size, gfp); if (!pool) @@ -729,9 +741,6 @@ static void swiotlb_dyn_alloc(struct work_struct *work) } add_mem_pool(mem, pool); - - /* Pairs with smp_rmb() in is_swiotlb_buffer(). */ - smp_wmb(); } /** @@ -948,7 +957,7 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) #endif /* CONFIG_DEBUG_FS */ /** - * swiotlb_area_find_slots() - search for slots in one IO TLB memory area + * swiotlb_search_pool_area() - search one memory area in one pool * @dev: Device which maps the buffer. * @pool: Memory pool to be searched. * @area_index: Index of the IO TLB memory area to be searched. @@ -963,7 +972,7 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) * * Return: Index of the first allocated slot, or -1 on error. */ -static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool, +static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool, int area_index, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask) { @@ -1057,41 +1066,50 @@ found: return slot_index; } +#ifdef CONFIG_SWIOTLB_DYNAMIC + /** - * swiotlb_pool_find_slots() - search for slots in one memory pool + * swiotlb_search_area() - search one memory area in all pools * @dev: Device which maps the buffer. - * @pool: Memory pool to be searched. + * @start_cpu: Start CPU number. + * @cpu_offset: Offset from @start_cpu. * @orig_addr: Original (non-bounced) IO buffer address. * @alloc_size: Total requested size of the bounce buffer, * including initial alignment padding. * @alloc_align_mask: Required alignment of the allocated buffer. + * @retpool: Used memory pool, updated on return. * - * Search through one memory pool to find a sequence of slots that match the + * Search one memory area in all pools for a sequence of slots that match the * allocation constraints. * * Return: Index of the first allocated slot, or -1 on error. */ -static int swiotlb_pool_find_slots(struct device *dev, struct io_tlb_pool *pool, - phys_addr_t orig_addr, size_t alloc_size, - unsigned int alloc_align_mask) +static int swiotlb_search_area(struct device *dev, int start_cpu, + int cpu_offset, phys_addr_t orig_addr, size_t alloc_size, + unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { - int start = raw_smp_processor_id() & (pool->nareas - 1); - int i = start, index; - - do { - index = swiotlb_area_find_slots(dev, pool, i, orig_addr, - alloc_size, alloc_align_mask); - if (index >= 0) - return index; - if (++i >= pool->nareas) - i = 0; - } while (i != start); + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + struct io_tlb_pool *pool; + int area_index; + int index = -1; - return -1; + rcu_read_lock(); + list_for_each_entry_rcu(pool, &mem->pools, node) { + if (cpu_offset >= pool->nareas) + continue; + area_index = (start_cpu + cpu_offset) & (pool->nareas - 1); + index = swiotlb_search_pool_area(dev, pool, area_index, + orig_addr, alloc_size, + alloc_align_mask); + if (index >= 0) { + *retpool = pool; + break; + } + } + rcu_read_unlock(); + return index; } -#ifdef CONFIG_SWIOTLB_DYNAMIC - /** * swiotlb_find_slots() - search for slots in the whole swiotlb * @dev: Device which maps the buffer. @@ -1115,18 +1133,20 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, unsigned long nslabs; unsigned long flags; u64 phys_limit; + int cpu, i; int index; - rcu_read_lock(); - list_for_each_entry_rcu(pool, &mem->pools, node) { - index = swiotlb_pool_find_slots(dev, pool, orig_addr, - alloc_size, alloc_align_mask); - if (index >= 0) { - rcu_read_unlock(); + if (alloc_size > IO_TLB_SEGSIZE * IO_TLB_SIZE) + return -1; + + cpu = raw_smp_processor_id(); + for (i = 0; i < default_nareas; ++i) { + index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size, + alloc_align_mask, &pool); + if (index >= 0) goto found; - } } - rcu_read_unlock(); + if (!mem->can_grow) return -1; @@ -1139,8 +1159,8 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, if (!pool) return -1; - index = swiotlb_pool_find_slots(dev, pool, orig_addr, - alloc_size, alloc_align_mask); + index = swiotlb_search_pool_area(dev, pool, 0, orig_addr, + alloc_size, alloc_align_mask); if (index < 0) { swiotlb_dyn_free(&pool->rcu); return -1; @@ -1152,9 +1172,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); found: - dev->dma_uses_io_tlb = true; - /* Pairs with smp_rmb() in is_swiotlb_buffer() */ - smp_wmb(); + WRITE_ONCE(dev->dma_uses_io_tlb, true); + + /* + * The general barrier orders reads and writes against a presumed store + * of the SWIOTLB buffer address by a device driver (to a driver private + * data structure). It serves two purposes. + * + * First, the store to dev->dma_uses_io_tlb must be ordered before the + * presumed store. This guarantees that the returned buffer address + * cannot be passed to another CPU before updating dev->dma_uses_io_tlb. + * + * Second, the load from mem->pools must be ordered before the same + * presumed store. This guarantees that the returned buffer address + * cannot be observed by another CPU before an update of the RCU list + * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy + * atomicity). + * + * See also the comment in is_swiotlb_buffer(). + */ + smp_mb(); *retpool = pool; return index; @@ -1166,9 +1203,21 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask, struct io_tlb_pool **retpool) { - *retpool = &dev->dma_io_tlb_mem->defpool; - return swiotlb_pool_find_slots(dev, *retpool, - orig_addr, alloc_size, alloc_align_mask); + struct io_tlb_pool *pool; + int start, i; + int index; + + *retpool = pool = &dev->dma_io_tlb_mem->defpool; + i = start = raw_smp_processor_id() & (pool->nareas - 1); + do { + index = swiotlb_search_pool_area(dev, pool, i, orig_addr, + alloc_size, alloc_align_mask); + if (index >= 0) + return index; + if (++i >= pool->nareas) + i = 0; + } while (i != start); + return -1; } #endif /* CONFIG_SWIOTLB_DYNAMIC */ @@ -1283,11 +1332,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, pool->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(pool->start, index) + offset; /* - * When dir == DMA_FROM_DEVICE we could omit the copy from the orig - * to the tlb buffer, if we knew for sure the device will - * overwrite the entire current content. But we don't. Thus - * unconditional bounce may prevent leaking swiotlb content (i.e. - * kernel memory) to user-space. + * When the device is writing memory, i.e. dir == DMA_FROM_DEVICE, copy + * the original buffer to the TLB buffer before initiating DMA in order + * to preserve the original's data if the device does a partial write, + * i.e. if the device doesn't overwrite the entire buffer. Preserving + * the original data, even if it's garbage, is necessary to match + * hardware behavior. Use of swiotlb is supposed to be transparent, + * i.e. swiotlb must not corrupt memory by clobbering unwritten bytes. */ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; diff --git a/kernel/entry/common.c b/kernel/entry/common.c index d7ee4bc3f2ba..88cb3c88aaa5 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -15,26 +15,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> -/* See comment for enter_from_user_mode() in entry-common.h */ -static __always_inline void __enter_from_user_mode(struct pt_regs *regs) -{ - arch_enter_from_user_mode(regs); - lockdep_hardirqs_off(CALLER_ADDR0); - - CT_WARN_ON(__ct_state() != CONTEXT_USER); - user_exit_irqoff(); - - instrumentation_begin(); - kmsan_unpoison_entry_regs(regs); - trace_hardirqs_off_finish(); - instrumentation_end(); -} - -void noinstr enter_from_user_mode(struct pt_regs *regs) -{ - __enter_from_user_mode(regs); -} - static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) { if (unlikely(audit_context())) { @@ -45,7 +25,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } } -static long syscall_trace_enter(struct pt_regs *regs, long syscall, +long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work) { long ret = 0; @@ -85,67 +65,24 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret ? : syscall; } -static __always_inline long -__syscall_enter_from_user_work(struct pt_regs *regs, long syscall) -{ - unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - - if (work & SYSCALL_WORK_ENTER) - syscall = syscall_trace_enter(regs, syscall, work); - - return syscall; -} - -long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) -{ - return __syscall_enter_from_user_work(regs, syscall); -} - -noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) -{ - long ret; - - __enter_from_user_mode(regs); - - instrumentation_begin(); - local_irq_enable(); - ret = __syscall_enter_from_user_work(regs, syscall); - instrumentation_end(); - - return ret; -} - noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) { - __enter_from_user_mode(regs); + enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); instrumentation_end(); } -/* See comment for exit_to_user_mode() in entry-common.h */ -static __always_inline void __exit_to_user_mode(void) -{ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - instrumentation_end(); - - user_enter_irqoff(); - arch_exit_to_user_mode(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -void noinstr exit_to_user_mode(void) -{ - __exit_to_user_mode(); -} - /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } -static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work) +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + * @regs: Pointer to pt_regs on entry stack + * @ti_work: TIF work flags as read by the caller + */ +__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { /* * Before returning to user space ensure that all pending work @@ -190,27 +127,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -static void exit_to_user_mode_prepare(struct pt_regs *regs) -{ - unsigned long ti_work; - - lockdep_assert_irqs_disabled(); - - /* Flush pending rcuog wakeup before the last need_resched() check */ - tick_nohz_user_enter_prepare(); - - ti_work = read_thread_flags(); - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) - ti_work = exit_to_user_mode_loop(regs, ti_work); - - arch_exit_to_user_mode_prepare(regs, ti_work); - - /* Ensure that kernel state is sane for a return to userspace */ - kmap_assert_nomap(); - lockdep_assert_irqs_disabled(); - lockdep_sys_exit(); -} - /* * If SYSCALL_EMU is set, then the only reason to report is when * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall @@ -295,12 +211,12 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) instrumentation_begin(); __syscall_exit_to_user_mode_work(regs); instrumentation_end(); - __exit_to_user_mode(); + exit_to_user_mode(); } noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { - __enter_from_user_mode(regs); + enter_from_user_mode(regs); } noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) @@ -308,7 +224,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) instrumentation_begin(); exit_to_user_mode_prepare(regs); instrumentation_end(); - __exit_to_user_mode(); + exit_to_user_mode(); } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4c72a41f11af..f0f0f71213a1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -375,6 +375,7 @@ enum event_type_t { EVENT_TIME = 0x4, /* see ctx_resched() for details */ EVENT_CPU = 0x8, + EVENT_CGROUP = 0x10, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -449,8 +450,8 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); -int perf_proc_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +int perf_event_max_sample_rate_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -684,20 +685,26 @@ do { \ ___p; \ }) -static void perf_ctx_disable(struct perf_event_context *ctx) +static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; perf_pmu_disable(pmu_ctx->pmu); + } } -static void perf_ctx_enable(struct perf_event_context *ctx) +static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; perf_pmu_enable(pmu_ctx->pmu); + } } static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); @@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_ctx_disable(&cpuctx->ctx); + perf_ctx_disable(&cpuctx->ctx, true); - ctx_sched_out(&cpuctx->ctx, EVENT_ALL); + ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in @@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task) * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ - ctx_sched_in(&cpuctx->ctx, EVENT_ALL); + ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); - perf_ctx_enable(&cpuctx->ctx); + perf_ctx_enable(&cpuctx->ctx, true); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } @@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct if (!is_cgroup_event(event)) return; + event->pmu_ctx->nr_cgroups++; + /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. @@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c if (!is_cgroup_event(event)) return; + event->pmu_ctx->nr_cgroups--; + /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. @@ -1803,31 +1814,34 @@ static inline void perf_event__state_init(struct perf_event *event) PERF_EVENT_STATE_INACTIVE; } -static void __perf_event_read_size(struct perf_event *event, int nr_siblings) +static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_ID) + if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_LOST) + if (read_format & PERF_FORMAT_LOST) entry += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_GROUP) { + if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } - size += entry * nr; - event->read_size = size; + /* + * Since perf_event_validate_size() limits this to 16k and inhibits + * adding more siblings, this will never overflow. + */ + return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) @@ -1877,8 +1891,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) */ static void perf_event__header_size(struct perf_event *event) { - __perf_event_read_size(event, - event->group_leader->nr_siblings); + event->read_size = + __perf_event_read_size(event->attr.read_format, + event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } @@ -1909,23 +1924,44 @@ static void perf_event__id_header_size(struct perf_event *event) event->id_header_size = size; } +/* + * Check that adding an event to the group does not result in anybody + * overflowing the 64k event limit imposed by the output buffer. + * + * Specifically, check that the read_size for the event does not exceed 16k, + * read_size being the one term that grows with groups size. Since read_size + * depends on per-event read_format, also (re)check the existing events. + * + * This leaves 48k for the constant size fields and things like callchains, + * branch stacks and register sets. + */ static bool perf_event_validate_size(struct perf_event *event) { - /* - * The values computed here will be over-written when we actually - * attach the event. - */ - __perf_event_read_size(event, event->group_leader->nr_siblings + 1); - __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); - perf_event__id_header_size(event); + struct perf_event *sibling, *group_leader = event->group_leader; + + if (__perf_event_read_size(event->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + + if (__perf_event_read_size(group_leader->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; /* - * Sum the lot; should not exceed the 64k limit we have on records. - * Conservative limit to allow for callchains and other variable fields. + * When creating a new group leader, group_leader->ctx is initialized + * after the size has been validated, but we cannot safely use + * for_each_sibling_event() until group_leader->ctx is set. A new group + * leader cannot have any siblings yet, so we can safely skip checking + * the non-existent siblings. */ - if (event->read_size + event->header_size + - event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) - return false; + if (event == group_leader) + return true; + + for_each_sibling_event(sibling, group_leader) { + if (__perf_event_read_size(sibling->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + } return true; } @@ -1954,6 +1990,7 @@ static void perf_group_attach(struct perf_event *event) list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; + group_leader->group_generation++; perf_event__header_size(group_leader); @@ -2144,6 +2181,7 @@ static void perf_group_detach(struct perf_event *event) if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; + event->group_leader->group_generation++; goto out; } @@ -2677,9 +2715,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - perf_ctx_disable(&cpuctx->ctx); + perf_ctx_disable(&cpuctx->ctx, false); if (task_ctx) { - perf_ctx_disable(task_ctx); + perf_ctx_disable(task_ctx, false); task_ctx_sched_out(task_ctx, event_type); } @@ -2697,9 +2735,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, perf_event_sched_in(cpuctx, task_ctx); - perf_ctx_enable(&cpuctx->ctx); + perf_ctx_enable(&cpuctx->ctx, false); if (task_ctx) - perf_ctx_enable(task_ctx); + perf_ctx_enable(task_ctx, false); } void perf_pmu_resched(struct pmu *pmu) @@ -3244,6 +3282,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3290,8 +3331,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) is_active ^= ctx->is_active; /* changed bits */ - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; __pmu_ctx_sched_out(pmu_ctx, is_active); + } } /* @@ -3482,7 +3526,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); /* PMIs are disabled; ctx->nr_pending is stable. */ if (local_read(&ctx->nr_pending) || @@ -3502,7 +3546,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_sched_task_cb(ctx, false); perf_event_swap_task_ctx_data(ctx, next_ctx); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not @@ -3526,13 +3570,13 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); inside_switch: perf_ctx_sched_task_cb(ctx, false); task_ctx_sched_out(ctx, EVENT_ALL); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); } } @@ -3818,47 +3862,32 @@ static int merge_sched_in(struct perf_event *event, void *data) return 0; } -static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +static void pmu_groups_sched_in(struct perf_event_context *ctx, + struct perf_event_groups *groups, + struct pmu *pmu) { - struct perf_event_pmu_context *pmu_ctx; int can_add_hw = 1; - - if (pmu) { - visit_groups_merge(ctx, &ctx->pinned_groups, - smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); - } else { - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - can_add_hw = 1; - visit_groups_merge(ctx, &ctx->pinned_groups, - smp_processor_id(), pmu_ctx->pmu, - merge_sched_in, &can_add_hw); - } - } + visit_groups_merge(ctx, groups, smp_processor_id(), pmu, + merge_sched_in, &can_add_hw); } -static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +static void ctx_groups_sched_in(struct perf_event_context *ctx, + struct perf_event_groups *groups, + bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - int can_add_hw = 1; - if (pmu) { - visit_groups_merge(ctx, &ctx->flexible_groups, - smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); - } else { - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - can_add_hw = 1; - visit_groups_merge(ctx, &ctx->flexible_groups, - smp_processor_id(), pmu_ctx->pmu, - merge_sched_in, &can_add_hw); - } + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; + pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); } } -static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +static void __pmu_ctx_sched_in(struct perf_event_context *ctx, + struct pmu *pmu) { - ctx_flexible_sched_in(ctx, pmu); + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); } static void @@ -3866,6 +3895,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); int is_active = ctx->is_active; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3898,11 +3930,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) - ctx_pinned_sched_in(ctx, NULL); + ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup); /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, NULL); + ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup); } static void perf_event_context_sched_in(struct task_struct *task) @@ -3917,11 +3949,11 @@ static void perf_event_context_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); perf_ctx_sched_task_cb(ctx, true); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } @@ -3934,7 +3966,7 @@ static void perf_event_context_sched_in(struct task_struct *task) if (!ctx->nr_events) goto unlock; - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3944,7 +3976,7 @@ static void perf_event_context_sched_in(struct task_struct *task) * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { - perf_ctx_disable(&cpuctx->ctx); + perf_ctx_disable(&cpuctx->ctx, false); ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); } @@ -3953,9 +3985,9 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_ctx_sched_task_cb(cpuctx->task_ctx, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - perf_ctx_enable(&cpuctx->ctx); + perf_ctx_enable(&cpuctx->ctx, false); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -4425,6 +4457,9 @@ static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { u16 local_pkg, event_pkg; + if ((unsigned)event_cpu >= nr_cpu_ids) + return event_cpu; + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { int local_cpu = smp_processor_id(); @@ -4527,6 +4562,8 @@ int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { unsigned long flags; + int event_oncpu; + int event_cpu; int ret = 0; /* @@ -4551,15 +4588,22 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } + /* + * Get the event CPU numbers, and adjust them to local if the event is + * a per-package event that can be read locally + */ + event_oncpu = __perf_event_read_cpu(event, event->oncpu); + event_cpu = __perf_event_read_cpu(event, event->cpu); + /* If this is a per-CPU event, it must be for this CPU */ if (!(event->attach_state & PERF_ATTACH_TASK) && - event->cpu != smp_processor_id()) { + event_cpu != smp_processor_id()) { ret = -EINVAL; goto out; } /* If this is a pinned event it must be running on this CPU */ - if (event->attr.pinned && event->oncpu != smp_processor_id()) { + if (event->attr.pinned && event_oncpu != smp_processor_id()) { ret = -EBUSY; goto out; } @@ -4569,7 +4613,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) + if (event_oncpu == smp_processor_id()) event->pmu->read(event); *value = local64_read(&event->count); @@ -4809,6 +4853,11 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, void *task_ctx_data = NULL; if (!ctx->task) { + /* + * perf_pmu_migrate_context() / __perf_pmu_install_event() + * relies on the fact that find_get_pmu_context() cannot fail + * for CPU contexts. + */ struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); @@ -5440,7 +5489,7 @@ static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event_context *ctx = leader->ctx; - struct perf_event *sub; + struct perf_event *sub, *parent; unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -5450,6 +5499,33 @@ static int __perf_read_group_add(struct perf_event *leader, return ret; raw_spin_lock_irqsave(&ctx->lock, flags); + /* + * Verify the grouping between the parent and child (inherited) + * events is still in tact. + * + * Specifically: + * - leader->ctx->lock pins leader->sibling_list + * - parent->child_mutex pins parent->child_list + * - parent->ctx->mutex pins parent->sibling_list + * + * Because parent->ctx != leader->ctx (and child_list nests inside + * ctx->mutex), group destruction is not atomic between children, also + * see perf_event_release_kernel(). Additionally, parent can grow the + * group. + * + * Therefore it is possible to have parent and child groups in a + * different configuration and summing over such a beast makes no sense + * what so ever. + * + * Reject this. + */ + parent = leader->parent; + if (parent && + (parent->group_generation != leader->group_generation || + parent->nr_siblings != leader->nr_siblings)) { + ret = -ECHILD; + goto unlock; + } /* * Since we co-schedule groups, {enabled,running} times of siblings @@ -5483,8 +5559,9 @@ static int __perf_read_group_add(struct perf_event *leader, values[n++] = atomic64_read(&sub->lost_samples); } +unlock: raw_spin_unlock_irqrestore(&ctx->lock, flags); - return 0; + return ret; } static int perf_read_group(struct perf_event *event, @@ -5503,10 +5580,6 @@ static int perf_read_group(struct perf_event *event, values[0] = 1 + leader->nr_siblings; - /* - * By locking the child_mutex of the leader we effectively - * lock the child list of all siblings.. XXX explain how. - */ mutex_lock(&leader->child_mutex); ret = __perf_read_group_add(leader, read_format, values); @@ -7324,6 +7397,14 @@ void perf_output_sample(struct perf_output_handle *handle, if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); + /* + * Add the extension space which is appended + * right after the struct perf_branch_stack. + */ + if (data->br_stack_cntr) { + size = data->br_stack->nr * sizeof(u64); + perf_output_copy(handle, data->br_stack_cntr, size); + } } else { /* * we always store at least the value of nr @@ -11352,9 +11433,30 @@ static DEVICE_ATTR_RW(perf_event_mux_interval_ms); static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, + &dev_attr_nr_addr_filters.attr, + NULL, +}; + +static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pmu *pmu = dev_get_drvdata(dev); + + if (n == 2 && !pmu->nr_addr_filters) + return 0; + + return a->mode; +} + +static struct attribute_group pmu_dev_attr_group = { + .is_visible = pmu_dev_is_visible, + .attrs = pmu_dev_attrs, +}; + +static const struct attribute_group *pmu_dev_groups[] = { + &pmu_dev_attr_group, NULL, }; -ATTRIBUTE_GROUPS(pmu_dev); static int pmu_bus_running; static struct bus_type pmu_bus = { @@ -11391,18 +11493,11 @@ static int pmu_dev_alloc(struct pmu *pmu) if (ret) goto free_dev; - /* For PMUs with address filters, throw in an extra attribute: */ - if (pmu->nr_addr_filters) - ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); - - if (ret) - goto del_dev; - - if (pmu->attr_update) + if (pmu->attr_update) { ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); - - if (ret) - goto del_dev; + if (ret) + goto del_dev; + } out: return ret; @@ -12846,6 +12941,9 @@ static void __perf_pmu_install_event(struct pmu *pmu, int cpu, struct perf_event *event) { struct perf_event_pmu_context *epc; + struct perf_event_context *old_ctx = event->ctx; + + get_ctx(ctx); /* normally find_get_context() */ event->cpu = cpu; epc = find_get_pmu_context(pmu, ctx, event); @@ -12854,6 +12952,11 @@ static void __perf_pmu_install_event(struct pmu *pmu, if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; perf_install_in_context(ctx, event, cpu); + + /* + * Now that event->ctx is updated and visible, put the old ctx. + */ + put_ctx(old_ctx); } static void __perf_pmu_install(struct perf_event_context *ctx, @@ -12892,6 +12995,10 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) struct perf_event_context *src_ctx, *dst_ctx; LIST_HEAD(events); + /* + * Since per-cpu context is persistent, no need to grab an extra + * reference. + */ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; @@ -13346,6 +13453,8 @@ static int inherit_group(struct perf_event *parent_event, !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } + if (leader) + leader->group_generation = parent_event->group_generation; return 0; } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index fb1e180b5f0a..60ed43d1c29e 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -610,8 +610,8 @@ static struct page *rb_alloc_aux_page(int node, int order) { struct page *page; - if (order > MAX_ORDER) - order = MAX_ORDER; + if (order > MAX_PAGE_ORDER) + order = MAX_PAGE_ORDER; do { page = alloc_pages_node(node, PERF_AUX_GFP, order); @@ -700,6 +700,12 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, watermark = 0; } + /* + * kcalloc_node() is unable to allocate buffer if the size is larger + * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case. + */ + if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER) + return -ENOMEM; rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, node); if (!rb->aux_pages) @@ -815,7 +821,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER) + if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER) goto fail; node = (cpu == -1) ? cpu : cpu_to_node(cpu); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3048589e2e85..929e98c62965 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, if (new_page) { folio_get(new_folio); - page_add_new_anon_rmap(new_page, vma, addr); + folio_add_new_anon_rmap(new_folio, vma, addr); folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ @@ -198,7 +198,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, vma, false); + folio_remove_rmap_pte(old_folio, old_page, vma); if (!folio_mapped(old_folio)) folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); @@ -474,8 +474,8 @@ retry: gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); - if (IS_ERR_OR_NULL(old_page)) - return old_page ? PTR_ERR(old_page) : 0; + if (IS_ERR(old_page)) + return PTR_ERR(old_page); ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) @@ -537,7 +537,7 @@ retry: } } - ret = __replace_page(vma, vaddr, old_page, new_page); + ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); if (new_page) put_page(new_page); put_old: diff --git a/kernel/exit.c b/kernel/exit.c index edb50b4c9972..dfb963d2f862 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -69,11 +69,15 @@ #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> - #include <linux/uaccess.h> + +#include <uapi/linux/wait.h> + #include <asm/unistd.h> #include <asm/mmu_context.h> +#include "exit.h" + /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit @@ -133,7 +137,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_init(&p->sibling); __this_cpu_dec(process_counts); } - list_del_rcu(&p->thread_group); list_del_rcu(&p->thread_node); } @@ -539,7 +542,6 @@ static void exit_mm(void) exit_mm_release(current, mm); if (!mm) return; - sync_mm_rss(mm); mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); @@ -824,14 +826,9 @@ void __noreturn do_exit(long code) ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); - validate_creds_for_do_exit(tsk); - io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ - /* sync mm's RSS info before statistics gathering */ - if (tsk->mm) - sync_mm_rss(tsk->mm); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { @@ -912,7 +909,6 @@ void __noreturn do_exit(long code) if (tsk->task_frag.page) put_page(tsk->task_frag.page); - validate_creds_for_do_exit(tsk); exit_task_stack_account(tsk); check_stack_usage(); @@ -1037,26 +1033,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code) return 0; } -struct waitid_info { - pid_t pid; - uid_t uid; - int status; - int cause; -}; - -struct wait_opts { - enum pid_type wo_type; - int wo_flags; - struct pid *wo_pid; - - struct waitid_info *wo_info; - int wo_stat; - struct rusage *wo_rusage; - - wait_queue_entry_t child_wait; - int notask_error; -}; - static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || @@ -1151,17 +1127,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads - * which can reap other children at the same time. Until - * we change k_getrusage()-like users to rely on this lock - * we have to take ->siglock as well. + * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(¤t->sighand->siglock); - write_seqlock(&psig->stats_lock); + write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1184,8 +1157,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); - write_sequnlock(&psig->stats_lock); - spin_unlock_irq(¤t->sighand->siglock); + write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) @@ -1520,6 +1492,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return false; + + if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) + return false; + + return true; +} + static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { @@ -1527,13 +1510,10 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, child_wait); struct task_struct *p = key; - if (!eligible_pid(wo, p)) - return 0; + if (pid_child_should_wake(wo, p)) + return default_wake_function(wait, mode, sync, key); - if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) - return 0; - - return default_wake_function(wait, mode, sync, key); + return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) @@ -1582,16 +1562,10 @@ static int do_wait_pid(struct wait_opts *wo) return 0; } -static long do_wait(struct wait_opts *wo) +long __do_wait(struct wait_opts *wo) { - int retval; - - trace_sched_process_wait(wo->wo_pid); + long retval; - init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); - wo->child_wait.private = current; - add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); -repeat: /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that @@ -1603,24 +1577,23 @@ repeat: (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; - set_current_state(TASK_INTERRUPTIBLE); read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) - goto end; + return retval; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) - goto end; + return retval; retval = ptrace_do_wait(wo, tsk); if (retval) - goto end; + return retval; if (wo->wo_flags & __WNOTHREAD) break; @@ -1630,27 +1603,44 @@ repeat: notask: retval = wo->notask_error; - if (!retval && !(wo->wo_flags & WNOHANG)) { - retval = -ERESTARTSYS; - if (!signal_pending(current)) { - schedule(); - goto repeat; - } - } -end: + if (!retval && !(wo->wo_flags & WNOHANG)) + return -ERESTARTSYS; + + return retval; +} + +static long do_wait(struct wait_opts *wo) +{ + int retval; + + trace_sched_process_wait(wo->wo_pid); + + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); + + do { + set_current_state(TASK_INTERRUPTIBLE); + retval = __do_wait(wo); + if (retval != -ERESTARTSYS) + break; + if (signal_pending(current)) + break; + schedule(); + } while (1); + __set_current_state(TASK_RUNNING); remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } -static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, - int options, struct rusage *ru) +int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, + struct waitid_info *infop, int options, + struct rusage *ru) { - struct wait_opts wo; + unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; - long ret; - unsigned int f_flags = 0; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) @@ -1693,19 +1683,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, return -EINVAL; } - wo.wo_type = type; - wo.wo_pid = pid; - wo.wo_flags = options; - wo.wo_info = infop; - wo.wo_rusage = ru; + wo->wo_type = type; + wo->wo_pid = pid; + wo->wo_flags = options; + wo->wo_info = infop; + wo->wo_rusage = ru; if (f_flags & O_NONBLOCK) - wo.wo_flags |= WNOHANG; + wo->wo_flags |= WNOHANG; + + return 0; +} + +static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, + int options, struct rusage *ru) +{ + struct wait_opts wo; + long ret; + + ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); + if (ret) + return ret; ret = do_wait(&wo); - if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK)) + if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) ret = -EAGAIN; - put_pid(pid); + put_pid(wo.wo_pid); return ret; } diff --git a/kernel/exit.h b/kernel/exit.h new file mode 100644 index 000000000000..278faa26a653 --- /dev/null +++ b/kernel/exit.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef LINUX_WAITID_H +#define LINUX_WAITID_H + +struct waitid_info { + pid_t pid; + uid_t uid; + int status; + int cause; +}; + +struct wait_opts { + enum pid_type wo_type; + int wo_flags; + struct pid *wo_pid; + + struct waitid_info *wo_info; + int wo_stat; + struct rusage *wo_rusage; + + wait_queue_entry_t child_wait; + int notask_error; +}; + +bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p); +long __do_wait(struct wait_opts *wo); +int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, + struct waitid_info *infop, int options, + struct rusage *ru); +#endif diff --git a/kernel/fork.c b/kernel/fork.c index 3b6d20dfb9a8..0d944e92a43f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -53,6 +53,7 @@ #include <linux/seccomp.h> #include <linux/swap.h> #include <linux/syscalls.h> +#include <linux/syscall_user_dispatch.h> #include <linux/jiffies.h> #include <linux/futex.h> #include <linux/compat.h> @@ -99,6 +100,7 @@ #include <linux/stackprotector.h> #include <linux/user_events.h> #include <linux/iommu.h> +#include <linux/rseq.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -165,7 +167,6 @@ void __weak arch_release_task_struct(struct task_struct *tsk) { } -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) @@ -177,9 +178,6 @@ static inline void free_task_struct(struct task_struct *tsk) { kmem_cache_free(task_struct_cachep, tsk); } -#endif - -#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a @@ -412,24 +410,6 @@ void thread_stack_cache_init(void) } # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ -#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ - -static int alloc_thread_stack_node(struct task_struct *tsk, int node) -{ - unsigned long *stack; - - stack = arch_alloc_thread_stack_node(tsk, node); - tsk->stack = stack; - return stack ? 0 : -ENOMEM; -} - -static void free_thread_stack(struct task_struct *tsk) -{ - arch_free_thread_stack(tsk); - tsk->stack = NULL; -} - -#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -650,7 +630,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - VMA_ITERATOR(old_vmi, oldmm, 0); VMA_ITERATOR(vmi, mm, 0); uprobe_start_dup_mmap(); @@ -678,16 +657,22 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count); - if (retval) + /* Use __mt_dup() to efficiently build an identical maple tree. */ + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); + if (unlikely(retval)) goto out; mt_clear_in_rcu(vmi.mas.tree); - for_each_vma(old_vmi, mpnt) { + for_each_vma(vmi, mpnt) { struct file *file; vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { + retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, + mpnt->vm_end, GFP_KERNEL); + if (retval) + goto loop_out; + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } @@ -733,7 +718,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); i_mmap_lock_write(mapping); - if (tmp->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(tmp)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ @@ -749,9 +734,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); - /* Link the vma into the MT */ - if (vma_iter_bulk_store(&vmi, tmp)) - goto fail_nomem_vmi_store; + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ + vma_iter_bulk_store(&vmi, tmp); mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -760,15 +747,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); - if (retval) + if (retval) { + mpnt = vma_next(&vmi); goto loop_out; + } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); - if (!retval) + if (!retval) { mt_set_in_rcu(vmi.mas.tree); + } else if (mpnt) { + /* + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. + */ + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&vmi.mas, XA_ZERO_ENTRY); + } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -778,8 +778,6 @@ fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -fail_nomem_vmi_store: - unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: @@ -1021,7 +1019,6 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static void task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. */ @@ -1036,12 +1033,10 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size) else *offset += offsetof(struct task_struct, thread); } -#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ void __init fork_init(void) { int i; -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN 0 #endif @@ -1054,7 +1049,6 @@ void __init fork_init(void) arch_task_struct_size, align, SLAB_PANIC|SLAB_ACCOUNT, useroffset, usersize, NULL); -#endif /* do the arch specific task caches init */ arch_task_cache_init(); @@ -1179,7 +1173,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID tsk->pasid_activated = 0; #endif @@ -1288,7 +1282,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, hugetlb_count_init(mm); if (current->mm) { - mm->flags = current->mm->flags & MMF_INIT_MASK; + mm->flags = mmf_init_flags(current->mm->flags); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { mm->flags = default_dump_filter; @@ -1393,6 +1387,8 @@ EXPORT_SYMBOL_GPL(mmput_async); /** * set_mm_exe_file - change a reference to the mm's executable file + * @mm: The mm to change. + * @new_exe_file: The new file to use. * * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * @@ -1432,6 +1428,8 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) /** * replace_mm_exe_file - replace a reference to the mm's executable file + * @mm: The mm to change. + * @new_exe_file: The new file to use. * * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * @@ -1483,6 +1481,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) /** * get_mm_exe_file - acquire a reference to the mm's executable file + * @mm: The mm of interest. * * Returns %NULL if mm has no associated executable file. * User must release file via fput(). @@ -1492,15 +1491,14 @@ struct file *get_mm_exe_file(struct mm_struct *mm) struct file *exe_file; rcu_read_lock(); - exe_file = rcu_dereference(mm->exe_file); - if (exe_file && !get_file_rcu(exe_file)) - exe_file = NULL; + exe_file = get_file_rcu(&mm->exe_file); rcu_read_unlock(); return exe_file; } /** * get_task_exe_file - acquire a reference to the task's executable file + * @task: The task. * * Returns %NULL if task's mm (if any) has no associated executable file or * this is a kernel thread with borrowed mm (see the comment above get_task_mm). @@ -1523,6 +1521,7 @@ struct file *get_task_exe_file(struct task_struct *task) /** * get_task_mm - acquire a reference to the task's mm + * @task: The task. * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning * this kernel workthread has transiently adopted a user mm with use_mm, @@ -1583,7 +1582,7 @@ static void complete_vfork_done(struct task_struct *tsk) static int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { - unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE; + unsigned int state = TASK_KILLABLE|TASK_FREEZABLE; int killed; cgroup_enter_frozen(); @@ -1749,6 +1748,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ spin_lock(&fs->lock); + /* "users" and "in_exec" locked for check_unsafe_exec() */ if (fs->in_exec) { spin_unlock(&fs->lock); return -EAGAIN; @@ -2102,11 +2102,11 @@ const struct file_operations pidfd_fops = { * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd - * @pidfd: the pidfd to return + * @ret: Where to return the file for the pidfd. * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. - + * * The helper doesn't perform checks on @pid which makes it useful for pidfds * created via CLONE_PIDFD where @pid has no task attached when the pidfd and * pidfd file are prepared. @@ -2153,7 +2153,7 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd - * @pidfd: the pidfd to return + * @ret: Where to return the pidfd. * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. @@ -2406,10 +2406,6 @@ __latent_entropy struct task_struct *copy_process( p->io_uring = NULL; #endif -#if defined(SPLIT_RSS_COUNTING) - memset(&p->rss_stat, 0, sizeof(p->rss_stat)); -#endif - p->default_timer_slack_ns = current->timer_slack_ns; #ifdef CONFIG_PSI @@ -2576,7 +2572,6 @@ __latent_entropy struct task_struct *copy_process( p->dirty_paused_when = 0; p->pdeath_signal = 0; - INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; clear_posix_cputimers_work(p); @@ -2704,8 +2699,6 @@ __latent_entropy struct task_struct *copy_process( atomic_inc(¤t->signal->live); refcount_inc(¤t->signal->sigcnt); task_join_group_stop(p); - list_add_tail_rcu(&p->thread_group, - &p->group_leader->thread_group); list_add_tail_rcu(&p->thread_node, &p->signal->thread_head); } @@ -2930,7 +2923,7 @@ pid_t kernel_clone(struct kernel_clone_args *args) get_task_struct(p); } - if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { + if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) { /* lock the task to synchronize with memcg migration */ task_lock(p); lru_gen_add_mm(p->mm); @@ -3144,7 +3137,7 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) return false; -#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64) +#if !defined(CONFIG_STACK_GROWSUP) kargs->stack += kargs->stack_size; #endif } @@ -3181,7 +3174,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) } /** - * clone3 - create a new process with specific properties + * sys_clone3 - create a new process with specific properties * @uargs: argument structure * @size: size of @uargs * diff --git a/kernel/freezer.c b/kernel/freezer.c index 4fad0e6fca64..f57aaf96b829 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -71,7 +71,11 @@ bool __refrigerator(bool check_kthr_stop) for (;;) { bool freeze; + raw_spin_lock_irq(¤t->pi_lock); set_current_state(TASK_FROZEN); + /* unstale saved_state so that __thaw_task() will wake us up */ + current->saved_state = TASK_RUNNING; + raw_spin_unlock_irq(¤t->pi_lock); spin_lock_irq(&freezer_lock); freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop()); @@ -129,6 +133,7 @@ static int __set_task_frozen(struct task_struct *p, void *arg) WARN_ON_ONCE(debug_locks && p->lockdep_depth); #endif + p->saved_state = p->__state; WRITE_ONCE(p->__state, TASK_FROZEN); return TASK_FROZEN; } @@ -170,42 +175,35 @@ bool freeze_task(struct task_struct *p) } /* - * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical - * state in p->jobctl. If either of them got a wakeup that was missed because - * TASK_FROZEN, then their canonical state reflects that and the below will - * refuse to restore the special state and instead issue the wakeup. + * Restore the saved_state before the task entered freezer. For typical task + * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens + * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state + * is restored unless they got an expected wakeup (see ttwu_state_match()). + * Returns 1 if the task state was restored. */ -static int __set_task_special(struct task_struct *p, void *arg) +static int __restore_freezer_state(struct task_struct *p, void *arg) { - unsigned int state = 0; + unsigned int state = p->saved_state; - if (p->jobctl & JOBCTL_TRACED) - state = TASK_TRACED; - - else if (p->jobctl & JOBCTL_STOPPED) - state = TASK_STOPPED; - - if (state) + if (state != TASK_RUNNING) { WRITE_ONCE(p->__state, state); + p->saved_state = TASK_RUNNING; + return 1; + } - return state; + return 0; } void __thaw_task(struct task_struct *p) { - unsigned long flags, flags2; + unsigned long flags; spin_lock_irqsave(&freezer_lock, flags); if (WARN_ON_ONCE(freezing(p))) goto unlock; - if (lock_task_sighand(p, &flags2)) { - /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */ - bool ret = task_call_func(p, __set_task_special, NULL); - unlock_task_sighand(p, &flags2); - if (ret) - goto unlock; - } + if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL)) + goto unlock; wake_up_state(p, TASK_FROZEN); unlock: diff --git a/kernel/futex/core.c b/kernel/futex/core.c index f10587d1d481..1e78ef24321e 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -34,6 +34,7 @@ #include <linux/compat.h> #include <linux/jhash.h> #include <linux/pagemap.h> +#include <linux/plist.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> @@ -193,7 +194,7 @@ static u64 get_inode_sequence_number(struct inode *inode) /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex - * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED + * @flags: FLAGS_* * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) @@ -217,14 +218,18 @@ static u64 get_inode_sequence_number(struct inode *inode) * * lock_page() might sleep, the caller should not hold a spinlock. */ -int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, +int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page, *tail; + struct page *page; + struct folio *folio; struct address_space *mapping; int err, ro = 0; + bool fshared; + + fshared = flags & FLAGS_SHARED; /* * The futex address must be "naturally" aligned. @@ -248,7 +253,17 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, * but access_ok() should be faster than find_vma() */ if (!fshared) { - key->private.mm = mm; + /* + * On no-MMU, shared futexes are treated as private, therefore + * we must not include the current process in the key. Since + * there is only one address space, the address is a unique key + * on its own. + */ + if (IS_ENABLED(CONFIG_MMU)) + key->private.mm = mm; + else + key->private.mm = NULL; + key->private.address = address; return 0; } @@ -273,54 +288,52 @@ again: err = 0; /* - * The treatment of mapping from this point on is critical. The page - * lock protects many things but in this context the page lock + * The treatment of mapping from this point on is critical. The folio + * lock protects many things but in this context the folio lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * - * Strictly speaking the page lock is not needed in all cases being - * considered here and page lock forces unnecessarily serialization + * Strictly speaking the folio lock is not needed in all cases being + * considered here and folio lock forces unnecessarily serialization. * From this point on, mapping will be re-verified if necessary and - * page lock will be acquired only if it is unavoidable + * folio lock will be acquired only if it is unavoidable * - * Mapping checks require the head page for any compound page so the - * head page and mapping is looked up now. For anonymous pages, it - * does not matter if the page splits in the future as the key is - * based on the address. For filesystem-backed pages, the tail is - * required as the index of the page determines the key. For - * base pages, there is no tail page and tail == page. + * Mapping checks require the folio so it is looked up now. For + * anonymous pages, it does not matter if the folio is split + * in the future as the key is based on the address. For + * filesystem-backed pages, the precise page is required as the + * index of the page determines the key. */ - tail = page; - page = compound_head(page); - mapping = READ_ONCE(page->mapping); + folio = page_folio(page); + mapping = READ_ONCE(folio->mapping); /* - * If page->mapping is NULL, then it cannot be a PageAnon + * If folio->mapping is NULL, then it cannot be an anonymous * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to - * invalidate_complete_page2 before we got the page lock (also + * invalidate_complete_page2 before we got the folio lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page->mapping. + * an unlikely race, but we do need to retry for folio->mapping. */ if (unlikely(!mapping)) { int shmem_swizzled; /* - * Page lock is required to identify which special case above - * applies. If this is really a shmem page then the page lock + * Folio lock is required to identify which special case above + * applies. If this is really a shmem page then the folio lock * will prevent unexpected transitions. */ - lock_page(page); - shmem_swizzled = PageSwapCache(page) || page->mapping; - unlock_page(page); - put_page(page); + folio_lock(folio); + shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; + folio_unlock(folio); + folio_put(folio); if (shmem_swizzled) goto again; @@ -331,14 +344,14 @@ again: /* * Private mappings are handled in a simple way. * - * If the futex key is stored on an anonymous page, then the associated + * If the futex key is stored in anonymous memory, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page)) { + if (folio_test_anon(folio)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. @@ -357,10 +370,10 @@ again: /* * The associated futex object in this case is the inode and - * the page->mapping must be traversed. Ordinarily this should - * be stabilised under page lock but it's not strictly + * the folio->mapping must be traversed. Ordinarily this should + * be stabilised under folio lock but it's not strictly * necessary in this case as we just want to pin the inode, not - * update the radix tree or anything like that. + * update i_pages or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. If the mapping still matches expectations then the @@ -368,9 +381,9 @@ again: */ rcu_read_lock(); - if (READ_ONCE(page->mapping) != mapping) { + if (READ_ONCE(folio->mapping) != mapping) { rcu_read_unlock(); - put_page(page); + folio_put(folio); goto again; } @@ -378,19 +391,19 @@ again: inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); - put_page(page); + folio_put(folio); goto again; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); - key->shared.pgoff = page_to_pgoff(tail); + key->shared.pgoff = folio->index + folio_page_idx(folio, page); rcu_read_unlock(); } out: - put_page(page); + folio_put(folio); return err; } @@ -614,12 +627,21 @@ retry: } /* - * PI futexes can not be requeued and must remove themselves from the - * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. + * PI futexes can not be requeued and must remove themselves from the hash + * bucket. The hash bucket lock (i.e. lock_ptr) is held. */ void futex_unqueue_pi(struct futex_q *q) { - __futex_unqueue(q); + /* + * If the lock was not acquired (due to timeout or signal) then the + * rt_waiter is removed before futex_q is. If this is observed by + * an unlocker after dropping the rtmutex wait lock and before + * acquiring the hash bucket lock, then the unlocker dequeues the + * futex_q from the hash bucket list to guarantee consistent state + * vs. userspace. Therefore the dequeue here must be conditional. + */ + if (!plist_node_empty(&q->list)) + __futex_unqueue(q); BUG_ON(!q->pi_state); put_pi_state(q->pi_state); @@ -688,7 +710,8 @@ retry: owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, + FUTEX_BITSET_MATCH_ANY); return 0; } @@ -740,8 +763,10 @@ retry: * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + if (!pi && (uval & FUTEX_WAITERS)) { + futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, + FUTEX_BITSET_MATCH_ANY); + } return 0; } diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index b5379c0e6d6d..8b195d06f4e8 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -5,6 +5,7 @@ #include <linux/futex.h> #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> +#include <linux/compat.h> #ifdef CONFIG_PREEMPT_RT #include <linux/rcuwait.h> @@ -16,17 +17,86 @@ * Futex flags used to encode options to functions and preserve them across * restarts. */ +#define FLAGS_SIZE_8 0x0000 +#define FLAGS_SIZE_16 0x0001 +#define FLAGS_SIZE_32 0x0002 +#define FLAGS_SIZE_64 0x0003 + +#define FLAGS_SIZE_MASK 0x0003 + #ifdef CONFIG_MMU -# define FLAGS_SHARED 0x01 +# define FLAGS_SHARED 0x0010 #else /* * NOMMU does not have per process address space. Let the compiler optimize * code away. */ -# define FLAGS_SHARED 0x00 +# define FLAGS_SHARED 0x0000 #endif -#define FLAGS_CLOCKRT 0x02 -#define FLAGS_HAS_TIMEOUT 0x04 +#define FLAGS_CLOCKRT 0x0020 +#define FLAGS_HAS_TIMEOUT 0x0040 +#define FLAGS_NUMA 0x0080 +#define FLAGS_STRICT 0x0100 + +/* FUTEX_ to FLAGS_ */ +static inline unsigned int futex_to_flags(unsigned int op) +{ + unsigned int flags = FLAGS_SIZE_32; + + if (!(op & FUTEX_PRIVATE_FLAG)) + flags |= FLAGS_SHARED; + + if (op & FUTEX_CLOCK_REALTIME) + flags |= FLAGS_CLOCKRT; + + return flags; +} + +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE) + +/* FUTEX2_ to FLAGS_ */ +static inline unsigned int futex2_to_flags(unsigned int flags2) +{ + unsigned int flags = flags2 & FUTEX2_SIZE_MASK; + + if (!(flags2 & FUTEX2_PRIVATE)) + flags |= FLAGS_SHARED; + + if (flags2 & FUTEX2_NUMA) + flags |= FLAGS_NUMA; + + return flags; +} + +static inline unsigned int futex_size(unsigned int flags) +{ + return 1 << (flags & FLAGS_SIZE_MASK); +} + +static inline bool futex_flags_valid(unsigned int flags) +{ + /* Only 64bit futexes for 64bit code */ + if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) { + if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64) + return false; + } + + /* Only 32bit futexes are implemented -- for now */ + if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) + return false; + + return true; +} + +static inline bool futex_validate_input(unsigned int flags, u64 val) +{ + int bits = 8 * futex_size(flags); + + if (bits < 64 && (val >> bits)) + return false; + + return true; +} #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); @@ -69,11 +139,16 @@ struct futex_pi_state { union futex_key key; } __randomize_layout; +struct futex_q; +typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q); + /** * struct futex_q - The hashed futex queue entry, one per waiting task * @list: priority-sorted list of tasks waiting on this futex * @task: the task waiting on the futex * @lock_ptr: the hash bucket lock + * @wake: the wake handler for this queue + * @wake_data: data associated with the wake handler * @key: the key the futex is hashed on * @pi_state: optional priority inheritance state * @rt_waiter: rt_waiter storage for use with requeue_pi @@ -98,6 +173,8 @@ struct futex_q { struct task_struct *task; spinlock_t *lock_ptr; + futex_wake_fn *wake; + void *wake_data; union futex_key key; struct futex_pi_state *pi_state; struct rt_mutex_waiter *rt_waiter; @@ -116,7 +193,7 @@ enum futex_access { FUTEX_WRITE }; -extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, +extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); extern struct hrtimer_sleeper * @@ -144,6 +221,7 @@ extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb); extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout); +extern bool __futex_wake_mark(struct futex_q *q); extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); extern int fault_in_user_writeable(u32 __user *uaddr); @@ -260,10 +338,14 @@ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2); -extern int futex_requeue(u32 __user *uaddr1, unsigned int flags, - u32 __user *uaddr2, int nr_wake, int nr_requeue, +extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1, + u32 __user *uaddr2, unsigned int flags2, + int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi); +extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + struct hrtimer_sleeper *to, u32 bitset); + extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset); @@ -279,6 +361,16 @@ struct futex_vector { struct futex_q q; }; +extern int futex_parse_waitv(struct futex_vector *futexv, + struct futex_waitv __user *uwaitv, + unsigned int nr_futexes, futex_wake_fn *wake, + void *wake_data); + +extern int futex_wait_multiple_setup(struct futex_vector *vs, int count, + int *woken); + +extern int futex_unqueue_multiple(struct futex_vector *v, int count); + extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to); diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index ce2889f12375..5722467f2737 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/slab.h> +#include <linux/sched/rt.h> #include <linux/sched/task.h> #include "futex.h" @@ -610,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, /* * Caller must hold a reference on @pi_state. */ -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) +static int wake_futex_pi(u32 __user *uaddr, u32 uval, + struct futex_pi_state *pi_state, + struct rt_mutex_waiter *top_waiter) { - struct rt_mutex_waiter *top_waiter; struct task_struct *new_owner; bool postunlock = false; DEFINE_RT_WAKE_Q(wqh); u32 curval, newval; int ret = 0; - top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); - if (WARN_ON_ONCE(!top_waiter)) { - /* - * As per the comment in futex_unlock_pi() this should not happen. - * - * When this happens, give up our locks and try again, giving - * the futex_lock_pi() instance time to complete, either by - * waiting on the rtmutex or removing itself from the futex - * queue. - */ - ret = -EAGAIN; - goto out_unlock; - } - new_owner = top_waiter->task; /* @@ -945,7 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl to = futex_setup_timer(time, &timeout, flags, 0); retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); + ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; @@ -1002,6 +990,12 @@ retry_private: goto no_block; } + /* + * Must be done before we enqueue the waiter, here is unfortunately + * under the hb lock, but that *should* work because it does nothing. + */ + rt_mutex_pre_schedule(); + rt_mutex_init_waiter(&rt_waiter); /* @@ -1039,19 +1033,37 @@ retry_private: ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); cleanup: - spin_lock(q.lock_ptr); /* * If we failed to acquire the lock (deadlock/signal/timeout), we must - * first acquire the hb->lock before removing the lock from the - * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait - * lists consistent. + * must unwind the above, however we canont lock hb->lock because + * rt_mutex already has a waiter enqueued and hb->lock can itself try + * and enqueue an rt_waiter through rtlock. + * + * Doing the cleanup without holding hb->lock can cause inconsistent + * state between hb and pi_state, but only in the direction of not + * seeing a waiter that is leaving. + * + * See futex_unlock_pi(), it deals with this inconsistency. + * + * There be dragons here, since we must deal with the inconsistency on + * the way out (here), it is impossible to detect/warn about the race + * the other way around (missing an incoming waiter). * - * In particular; it is important that futex_unlock_pi() can not - * observe this inconsistency. + * What could possibly go wrong... */ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) ret = 0; + /* + * Now that the rt_waiter has been dequeued, it is safe to use + * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up + * the + */ + spin_lock(q.lock_ptr); + /* + * Waiter is unqueued. + */ + rt_mutex_post_schedule(); no_block: /* * Fixup the pi_state owner and possibly acquire the lock if we @@ -1117,12 +1129,13 @@ retry: if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); + ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE); if (ret) return ret; hb = futex_hash(&key); spin_lock(&hb->lock); +retry_hb: /* * Check waiters first. We do not trust user space values at @@ -1132,6 +1145,7 @@ retry: top_waiter = futex_top_waiter(hb, &key); if (top_waiter) { struct futex_pi_state *pi_state = top_waiter->pi_state; + struct rt_mutex_waiter *rt_waiter; ret = -EINVAL; if (!pi_state) @@ -1144,22 +1158,44 @@ retry: if (pi_state->owner != current) goto out_unlock; - get_pi_state(pi_state); /* * By taking wait_lock while still holding hb->lock, we ensure - * there is no point where we hold neither; and therefore - * wake_futex_p() must observe a state consistent with what we - * observed. + * there is no point where we hold neither; and thereby + * wake_futex_pi() must observe any new waiters. + * + * Since the cleanup: case in futex_lock_pi() removes the + * rt_waiter without holding hb->lock, it is possible for + * wake_futex_pi() to not find a waiter while the above does, + * in this case the waiter is on the way out and it can be + * ignored. * * In particular; this forces __rt_mutex_start_proxy() to * complete such that we're guaranteed to observe the - * rt_waiter. Also see the WARN in wake_futex_pi(). + * rt_waiter. */ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Futex vs rt_mutex waiter state -- if there are no rt_mutex + * waiters even though futex thinks there are, then the waiter + * is leaving. The entry needs to be removed from the list so a + * new futex_lock_pi() is not using this stale PI-state while + * the futex is available in user space again. + * There can be more than one task on its way out so it needs + * to retry. + */ + rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); + if (!rt_waiter) { + __futex_unqueue(top_waiter); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + goto retry_hb; + } + + get_pi_state(pi_state); spin_unlock(&hb->lock); /* drops pi_state->pi_mutex.wait_lock */ - ret = wake_futex_pi(uaddr, uval, pi_state); + ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter); put_pi_state(pi_state); diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index cba8b1a6a4cc..b47bb764b352 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/plist.h> #include <linux/sched/signal.h> #include "futex.h" @@ -58,6 +59,7 @@ enum { const struct futex_q futex_q_init = { /* list gets initialized in futex_queue()*/ + .wake = futex_wake_mark, .key = FUTEX_KEY_INIT, .bitset = FUTEX_BITSET_MATCH_ANY, .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), @@ -269,7 +271,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, union futex_key *key2, struct futex_pi_state **ps, struct task_struct **exiting, int set_waiters) { - struct futex_q *top_waiter = NULL; + struct futex_q *top_waiter; u32 curval; int ret; @@ -346,8 +348,9 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 * @uaddr1: source futex user address - * @flags: futex flags (FLAGS_SHARED, etc.) + * @flags1: futex flags (FLAGS_SHARED, etc.) * @uaddr2: target futex user address + * @flags2: futex flags (FLAGS_SHARED, etc.) * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) * @nr_requeue: number of waiters to requeue (0-INT_MAX) * @cmpval: @uaddr1 expected value (or %NULL) @@ -361,7 +364,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * - >=0 - on success, the number of tasks requeued or woken; * - <0 - on error */ -int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, +int futex_requeue(u32 __user *uaddr1, unsigned int flags1, + u32 __user *uaddr2, unsigned int flags2, int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -424,10 +428,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, } retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, + ret = get_futex_key(uaddr2, flags2, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) return ret; @@ -459,7 +463,7 @@ retry_private: if (ret) return ret; - if (!(flags & FLAGS_SHARED)) + if (!(flags1 & FLAGS_SHARED)) goto retry_private; goto retry; @@ -591,7 +595,7 @@ retry_private: /* Plain futexes just wake or requeue and are done */ if (!requeue_pi) { if (++task_count <= nr_wake) - futex_wake_mark(&wake_q, this); + this->wake(&wake_q, this); else requeue_futex(this, hb1, hb2, &key2); continue; @@ -789,7 +793,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ rt_mutex_init_waiter(&rt_waiter); - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; @@ -850,11 +854,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); - /* Current is not longer pi_blocked_on */ - spin_lock(q.lock_ptr); + /* + * See futex_unlock_pi()'s cleanup: comment. + */ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ret = 0; + spin_lock(q.lock_ptr); debug_rt_mutex_free_waiter(&rt_waiter); /* * Fixup the pi_state owner and possibly acquire the lock if we diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index a8074079b09e..4b6da9116aa6 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <linux/compat.h> #include <linux/syscalls.h> #include <linux/time_namespace.h> @@ -85,15 +84,12 @@ err_unlock: long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { + unsigned int flags = futex_to_flags(op); int cmd = op & FUTEX_CMD_MASK; - unsigned int flags = 0; - if (!(op & FUTEX_PRIVATE_FLAG)) - flags |= FLAGS_SHARED; - - if (op & FUTEX_CLOCK_REALTIME) { - flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && + if (flags & FLAGS_CLOCKRT) { + if (cmd != FUTEX_WAIT_BITSET && + cmd != FUTEX_WAIT_REQUEUE_PI && cmd != FUTEX_LOCK_PI2) return -ENOSYS; } @@ -110,9 +106,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: - return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); + return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0); case FUTEX_CMP_REQUEUE: - return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); + return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0); case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: @@ -129,7 +125,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, uaddr2); case FUTEX_CMP_REQUEUE_PI: - return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); + return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1); } return -ENOSYS; } @@ -183,43 +179,91 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } -/* Mask of available flags for each futex in futex_waitv list */ -#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) - /** * futex_parse_waitv - Parse a waitv array from userspace * @futexv: Kernel side list of waiters to be filled * @uwaitv: Userspace list to be parsed * @nr_futexes: Length of futexv + * @wake: Wake to call when futex is woken + * @wake_data: Data for the wake handler * * Return: Error code on failure, 0 on success */ -static int futex_parse_waitv(struct futex_vector *futexv, - struct futex_waitv __user *uwaitv, - unsigned int nr_futexes) +int futex_parse_waitv(struct futex_vector *futexv, + struct futex_waitv __user *uwaitv, + unsigned int nr_futexes, futex_wake_fn *wake, + void *wake_data) { struct futex_waitv aux; unsigned int i; for (i = 0; i < nr_futexes; i++) { + unsigned int flags; + if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) return -EFAULT; - if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved) + if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved) + return -EINVAL; + + flags = futex2_to_flags(aux.flags); + if (!futex_flags_valid(flags)) return -EINVAL; - if (!(aux.flags & FUTEX_32)) + if (!futex_validate_input(flags, aux.val)) return -EINVAL; - futexv[i].w.flags = aux.flags; + futexv[i].w.flags = flags; futexv[i].w.val = aux.val; futexv[i].w.uaddr = aux.uaddr; futexv[i].q = futex_q_init; + futexv[i].q.wake = wake; + futexv[i].q.wake_data = wake_data; } return 0; } +static int futex2_setup_timeout(struct __kernel_timespec __user *timeout, + clockid_t clockid, struct hrtimer_sleeper *to) +{ + int flag_clkid = 0, flag_init = 0; + struct timespec64 ts; + ktime_t time; + int ret; + + if (!timeout) + return 0; + + if (clockid == CLOCK_REALTIME) { + flag_clkid = FLAGS_CLOCKRT; + flag_init = FUTEX_CLOCK_REALTIME; + } + + if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) + return -EINVAL; + + if (get_timespec64(&ts, timeout)) + return -EFAULT; + + /* + * Since there's no opcode for futex_waitv, use + * FUTEX_WAIT_BITSET that uses absolute timeout as well + */ + ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); + if (ret) + return ret; + + futex_setup_timer(&time, to, flag_clkid, 0); + return 0; +} + +static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to) +{ + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); +} + /** * sys_futex_waitv - Wait on a list of futexes * @waiters: List of futexes to wait on @@ -249,8 +293,6 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, { struct hrtimer_sleeper to; struct futex_vector *futexv; - struct timespec64 ts; - ktime_t time; int ret; /* This syscall supports no flags for now */ @@ -260,30 +302,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) return -EINVAL; - if (timeout) { - int flag_clkid = 0, flag_init = 0; - - if (clockid == CLOCK_REALTIME) { - flag_clkid = FLAGS_CLOCKRT; - flag_init = FUTEX_CLOCK_REALTIME; - } - - if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) - return -EINVAL; - - if (get_timespec64(&ts, timeout)) - return -EFAULT; - - /* - * Since there's no opcode for futex_waitv, use - * FUTEX_WAIT_BITSET that uses absolute timeout as well - */ - ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); - if (ret) - return ret; - - futex_setup_timer(&time, &to, flag_clkid, 0); - } + if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) + return ret; futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); if (!futexv) { @@ -291,20 +311,133 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, goto destroy_timer; } - ret = futex_parse_waitv(futexv, waiters, nr_futexes); + ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark, + NULL); if (!ret) ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); kfree(futexv); destroy_timer: - if (timeout) { - hrtimer_cancel(&to.timer); - destroy_hrtimer_on_stack(&to.timer); - } + if (timeout) + futex2_destroy_timeout(&to); return ret; } +/* + * sys_futex_wake - Wake a number of futexes + * @uaddr: Address of the futex(es) to wake + * @mask: bitmask + * @nr: Number of the futexes to wake + * @flags: FUTEX2 flags + * + * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the + * futex2 family of calls. + */ + +SYSCALL_DEFINE4(futex_wake, + void __user *, uaddr, + unsigned long, mask, + int, nr, + unsigned int, flags) +{ + if (flags & ~FUTEX2_VALID_MASK) + return -EINVAL; + + flags = futex2_to_flags(flags); + if (!futex_flags_valid(flags)) + return -EINVAL; + + if (!futex_validate_input(flags, mask)) + return -EINVAL; + + return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask); +} + +/* + * sys_futex_wait - Wait on a futex + * @uaddr: Address of the futex to wait on + * @val: Value of @uaddr + * @mask: bitmask + * @flags: FUTEX2 flags + * @timeout: Optional absolute timeout + * @clockid: Clock to be used for the timeout, realtime or monotonic + * + * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the + * futex2 familiy of calls. + */ + +SYSCALL_DEFINE6(futex_wait, + void __user *, uaddr, + unsigned long, val, + unsigned long, mask, + unsigned int, flags, + struct __kernel_timespec __user *, timeout, + clockid_t, clockid) +{ + struct hrtimer_sleeper to; + int ret; + + if (flags & ~FUTEX2_VALID_MASK) + return -EINVAL; + + flags = futex2_to_flags(flags); + if (!futex_flags_valid(flags)) + return -EINVAL; + + if (!futex_validate_input(flags, val) || + !futex_validate_input(flags, mask)) + return -EINVAL; + + if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) + return ret; + + ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask); + + if (timeout) + futex2_destroy_timeout(&to); + + return ret; +} + +/* + * sys_futex_requeue - Requeue a waiter from one futex to another + * @waiters: array describing the source and destination futex + * @flags: unused + * @nr_wake: number of futexes to wake + * @nr_requeue: number of futexes to requeue + * + * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the + * futex2 family of calls. + */ + +SYSCALL_DEFINE4(futex_requeue, + struct futex_waitv __user *, waiters, + unsigned int, flags, + int, nr_wake, + int, nr_requeue) +{ + struct futex_vector futexes[2]; + u32 cmpval; + int ret; + + if (flags) + return -EINVAL; + + if (!waiters) + return -EINVAL; + + ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL); + if (ret) + return ret; + + cmpval = futexes[0].w.val; + + return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags, + u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags, + nr_wake, nr_requeue, &cmpval, 0); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index ba01b9408203..3a10375d9521 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/plist.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/freezer.h> @@ -106,20 +107,11 @@ * double_lock_hb() and double_unlock_hb(), respectively. */ -/* - * The hash bucket lock must be held when this is called. - * Afterwards, the futex_q must not be accessed. Callers - * must ensure to later call wake_up_q() for the actual - * wakeups to occur. - */ -void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) +bool __futex_wake_mark(struct futex_q *q) { - struct task_struct *p = q->task; - if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) - return; + return false; - get_task_struct(p); __futex_unqueue(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL @@ -130,6 +122,26 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) */ smp_store_release(&q->lock_ptr, NULL); + return true; +} + +/* + * The hash bucket lock must be held when this is called. + * Afterwards, the futex_q must not be accessed. Callers + * must ensure to later call wake_up_q() for the actual + * wakeups to occur. + */ +void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) +{ + struct task_struct *p = q->task; + + get_task_struct(p); + + if (!__futex_wake_mark(q)) { + put_task_struct(p); + return; + } + /* * Queue the task for later wakeup for after we've released * the hb->lock. @@ -145,16 +157,19 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; - int ret; DEFINE_WAKE_Q(wake_q); + int ret; if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); + ret = get_futex_key(uaddr, flags, &key, FUTEX_READ); if (unlikely(ret != 0)) return ret; + if ((flags & FLAGS_STRICT) && !nr_wake) + return 0; + hb = futex_hash(&key); /* Make sure we really have tasks to wakeup */ @@ -174,7 +189,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (!(this->bitset & bitset)) continue; - futex_wake_mark(&wake_q, this); + this->wake(&wake_q, this); if (++ret >= nr_wake) break; } @@ -245,10 +260,10 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, DEFINE_WAKE_Q(wake_q); retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) return ret; @@ -289,7 +304,7 @@ retry_private: ret = -EINVAL; goto out_unlock; } - futex_wake_mark(&wake_q, this); + this->wake(&wake_q, this); if (++ret >= nr_wake) break; } @@ -303,7 +318,7 @@ retry_private: ret = -EINVAL; goto out_unlock; } - futex_wake_mark(&wake_q, this); + this->wake(&wake_q, this); if (++op_ret >= nr_wake2) break; } @@ -358,7 +373,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, } /** - * unqueue_multiple - Remove various futexes from their hash bucket + * futex_unqueue_multiple - Remove various futexes from their hash bucket * @v: The list of futexes to unqueue * @count: Number of futexes in the list * @@ -368,7 +383,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * - >=0 - Index of the last futex that was awoken; * - -1 - No futex was awoken */ -static int unqueue_multiple(struct futex_vector *v, int count) +int futex_unqueue_multiple(struct futex_vector *v, int count) { int ret = -1, i; @@ -396,7 +411,7 @@ static int unqueue_multiple(struct futex_vector *v, int count) * - 0 - Success * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL */ -static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) +int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) { struct futex_hash_bucket *hb; bool retry = false; @@ -419,11 +434,11 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo */ retry: for (i = 0; i < count; i++) { - if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) + if (!(vs[i].w.flags & FLAGS_SHARED) && retry) continue; ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), - !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), + vs[i].w.flags, &vs[i].q.key, FUTEX_READ); if (unlikely(ret)) @@ -435,7 +450,7 @@ retry: for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; struct futex_q *q = &vs[i].q; - u32 val = (u32)vs[i].w.val; + u32 val = vs[i].w.val; hb = futex_q_lock(q); ret = futex_get_value_locked(&uval, uaddr); @@ -458,7 +473,7 @@ retry: * was woken, we don't return error and return this index to * userspace */ - *woken = unqueue_multiple(vs, i); + *woken = futex_unqueue_multiple(vs, i); if (*woken >= 0) return 1; @@ -543,7 +558,7 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, __set_current_state(TASK_RUNNING); - ret = unqueue_multiple(vs, count); + ret = futex_unqueue_multiple(vs, count); if (ret >= 0) return ret; @@ -599,7 +614,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * while the syscall executes. */ retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); + ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; @@ -629,20 +644,18 @@ retry_private: return ret; } -int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) +int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + struct hrtimer_sleeper *to, u32 bitset) { - struct hrtimer_sleeper timeout, *to; - struct restart_block *restart; - struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; + struct futex_hash_bucket *hb; int ret; if (!bitset) return -EINVAL; + q.bitset = bitset; - to = futex_setup_timer(abs_time, &timeout, flags, - current->timer_slack_ns); retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q @@ -650,18 +663,17 @@ retry: */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out; + return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ futex_wait_queue(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ - ret = 0; if (!futex_unqueue(&q)) - goto out; - ret = -ETIMEDOUT; + return 0; + if (to && !to->task) - goto out; + return -ETIMEDOUT; /* * We expect signal_pending(current), but we might be the @@ -670,24 +682,38 @@ retry: if (!signal_pending(current)) goto retry; - ret = -ERESTARTSYS; - if (!abs_time) - goto out; + return -ERESTARTSYS; +} - restart = ¤t->restart_block; - restart->futex.uaddr = uaddr; - restart->futex.val = val; - restart->futex.time = *abs_time; - restart->futex.bitset = bitset; - restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; +int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) +{ + struct hrtimer_sleeper timeout, *to; + struct restart_block *restart; + int ret; - ret = set_restart_fn(restart, futex_wait_restart); + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); -out: - if (to) { - hrtimer_cancel(&to->timer); - destroy_hrtimer_on_stack(&to->timer); + ret = __futex_wait(uaddr, flags, val, to, bitset); + + /* No timeout, nothing to clean up. */ + if (!to) + return ret; + + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + + if (ret == -ERESTARTSYS) { + restart = ¤t->restart_block; + restart->futex.uaddr = uaddr; + restart->futex.val = val; + restart->futex.time = *abs_time; + restart->futex.bitset = bitset; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; + + return set_restart_fn(restart, futex_wait_restart); } + return ret; } diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 5c3086cad8f9..01520689b57c 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -99,7 +99,7 @@ struct gcov_iterator { struct gcov_info *info; size_t size; loff_t pos; - char buffer[]; + char buffer[] __counted_by(size); }; /** diff --git a/kernel/groups.c b/kernel/groups.c index 9aaed2a31073..9b43da22647d 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -19,7 +19,7 @@ struct group_info *groups_alloc(int gidsetsize) if (!gi) return NULL; - atomic_set(&gi->usage, 1); + refcount_set(&gi->usage, 1); gi->ngroups = gidsetsize; return gi; } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 5971a66be034..aae0402507ed 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -121,7 +121,6 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), BIT_MASK_DESCR(IRQD_CAN_RESERVE), - BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c653cd31548d..d39a40bc542b 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -219,11 +219,15 @@ void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler) { + struct irq_chip_type *ct = gc->chip_types; + int i; + raw_spin_lock_init(&gc->lock); gc->num_ct = num_ct; gc->irq_base = irq_base; gc->reg_base = reg_base; - gc->chip_types->chip.name = name; + for (i = 0; i < num_ct; i++) + ct[i].chip.name = name; gc->chip_types->handler = handler; } @@ -544,21 +548,34 @@ EXPORT_SYMBOL_GPL(irq_setup_alt_chip); void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set) { - unsigned int i = gc->irq_base; + unsigned int i, virq; raw_spin_lock(&gc_lock); list_del(&gc->list); raw_spin_unlock(&gc_lock); - for (; msk; msk >>= 1, i++) { + for (i = 0; msk; msk >>= 1, i++) { if (!(msk & 0x01)) continue; + /* + * Interrupt domain based chips store the base hardware + * interrupt number in gc::irq_base. Otherwise gc::irq_base + * contains the base Linux interrupt number. + */ + if (gc->domain) { + virq = irq_find_mapping(gc->domain, gc->irq_base + i); + if (!virq) + continue; + } else { + virq = gc->irq_base + i; + } + /* Remove handler first. That will mask the irq line */ - irq_set_handler(i, NULL); - irq_set_chip(i, &no_irq_chip); - irq_set_chip_data(i, NULL); - irq_modify_status(i, clr, set); + irq_set_handler(virq, NULL); + irq_set_chip(virq, &no_irq_chip); + irq_set_chip_data(virq, NULL); + irq_modify_status(virq, clr, set); } } EXPORT_SYMBOL_GPL(irq_remove_generic_chip); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 27ca1c866f29..371eb1711d34 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -600,7 +600,7 @@ int __init early_irq_init(void) mutex_init(&desc[i].request_mutex); init_waitqueue_head(&desc[i].wait_for_threads); desc_set_defaults(i, &desc[i], node, NULL, NULL); - irq_resend_init(desc); + irq_resend_init(&desc[i]); } return arch_early_irq_init(); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d309ba84e08a..1782f90cd8c6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1852,15 +1852,13 @@ out_thread: struct task_struct *t = new->thread; new->thread = NULL; - kthread_stop(t); - put_task_struct(t); + kthread_stop_put(t); } if (new->secondary && new->secondary->thread) { struct task_struct *t = new->secondary->thread; new->secondary->thread = NULL; - kthread_stop(t); - put_task_struct(t); + kthread_stop_put(t); } out_mput: module_put(desc->owner); @@ -1971,12 +1969,9 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * the same bit to a newly requested action. */ if (action->thread) { - kthread_stop(action->thread); - put_task_struct(action->thread); - if (action->secondary && action->secondary->thread) { - kthread_stop(action->secondary->thread); - put_task_struct(action->secondary->thread); - } + kthread_stop_put(action->thread); + if (action->secondary && action->secondary->thread) + kthread_stop_put(action->secondary->thread); } /* Last action releases resources */ diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 1698e77645ac..75d0ae490e29 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -466,16 +466,16 @@ unsigned int irq_matrix_reserved(struct irq_matrix *m) } /** - * irq_matrix_allocated - Get the number of allocated irqs on the local cpu + * irq_matrix_allocated - Get the number of allocated non-managed irqs on the local CPU * @m: Pointer to the matrix to search * - * This returns number of allocated irqs + * This returns number of allocated non-managed interrupts. */ unsigned int irq_matrix_allocated(struct irq_matrix *m) { struct cpumap *cm = this_cpu_ptr(m->maps); - return cm->allocated; + return cm->allocated - cm->managed_allocated; } #ifdef CONFIG_GENERIC_IRQ_DEBUGFS diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index b4c31a5c1147..79b4a58ba9c3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1204,7 +1204,6 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc, #define VIRQ_CAN_RESERVE 0x01 #define VIRQ_ACTIVATE 0x02 -#define VIRQ_NOMASK_QUIRK 0x04 static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags) { @@ -1213,8 +1212,6 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag if (!(vflags & VIRQ_CAN_RESERVE)) { irqd_clr_can_reserve(irqd); - if (vflags & VIRQ_NOMASK_QUIRK) - irqd_set_msi_nomask_quirk(irqd); /* * If the interrupt is managed but no CPU is available to @@ -1275,15 +1272,8 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain * Interrupt can use a reserved vector and will not occupy * a real device vector until the interrupt is requested. */ - if (msi_check_reservation_mode(domain, info, dev)) { + if (msi_check_reservation_mode(domain, info, dev)) vflags |= VIRQ_CAN_RESERVE; - /* - * MSI affinity setting requires a special quirk (X86) when - * reservation mode is active. - */ - if (info->flags & MSI_FLAG_NOMASK_QUIRK) - vflags |= VIRQ_NOMASK_QUIRK; - } xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) { if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED)) diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 5353edfad8e1..b0639f21041f 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -64,8 +64,10 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) struct file *file; rcu_read_lock(); - file = task_lookup_fd_rcu(task, idx); + file = task_lookup_fdget_rcu(task, idx); rcu_read_unlock(); + if (file) + fput(file); return file; } diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 0ddbdab5903d..015586217875 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -699,12 +699,9 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true); KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false); KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true); - -#ifdef clear_bit_unlock_is_negative_byte - KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); - KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); - KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); -#endif + KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); kcsan_nestable_atomic_end(); } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 8679322450f2..84a1200271af 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -227,12 +227,9 @@ static bool __init test_barrier(void) KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock)); spin_lock(&test_spinlock); KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock)); - -#ifdef clear_bit_unlock_is_negative_byte - KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); - KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); - KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); -#endif + KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); + KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); kcsan_nestable_atomic_end(); return ret; diff --git a/kernel/kexec.c b/kernel/kexec.c index 107f355eac10..8f35a5a42af8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -247,7 +247,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) return -EINVAL; - ksegments = memdup_user(segments, nr_segments * sizeof(ksegments[0])); + ksegments = memdup_array_user(segments, nr_segments, sizeof(ksegments[0])); if (IS_ERR(ksegments)) return PTR_ERR(ksegments); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 9dc728982d79..d08fc7b5db97 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -52,22 +52,7 @@ atomic_t __kexec_lock = ATOMIC_INIT(0); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; - -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; +bool kexec_file_dbg_print; int kexec_should_crash(struct task_struct *p) { @@ -293,8 +278,8 @@ int kimage_is_destination_range(struct kimage *image, unsigned long mstart, mend; mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((end > mstart) && (start < mend)) + mend = mstart + image->segment[i].memsz - 1; + if ((end >= mstart) && (start <= mend)) return 1; } @@ -387,7 +372,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; - eaddr = epfn << PAGE_SHIFT; + eaddr = (epfn << PAGE_SHIFT) - 1; if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || kimage_is_destination_range(image, addr, eaddr)) { list_add(&pages->lru, &extra_pages); @@ -447,7 +432,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, pages = NULL; size = (1 << order) << PAGE_SHIFT; - hole_start = (image->control_page + (size - 1)) & ~(size - 1); + hole_start = ALIGN(image->control_page, size); hole_end = hole_start + size - 1; while (hole_end <= crashk_res.end) { unsigned long i; @@ -464,7 +449,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, mend = mstart + image->segment[i].memsz - 1; if ((hole_end >= mstart) && (hole_start <= mend)) { /* Advance the hole to the end of the segment */ - hole_start = (mend + (size - 1)) & ~(size - 1); + hole_start = ALIGN(mend, size); hole_end = hole_start + size - 1; break; } @@ -472,7 +457,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { pages = pfn_to_page(hole_start >> PAGE_SHIFT); - image->control_page = hole_end; + image->control_page = hole_end + 1; break; } } @@ -733,7 +718,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page is not a destination page use it */ if (!kimage_is_destination_range(image, addr, - addr + PAGE_SIZE)) + addr + PAGE_SIZE - 1)) break; /* @@ -1080,9 +1065,10 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) * panic(). Otherwise parallel calls of panic() and crash_kexec() * may stop each other. To exclude them, we use panic_cpu here too. */ + old_cpu = PANIC_CPU_INVALID; this_cpu = raw_smp_processor_id(); - old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); - if (old_cpu == PANIC_CPU_INVALID) { + + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { /* This is the 1st CPU which comes here, so go ahead. */ __crash_kexec(regs); @@ -1271,6 +1257,7 @@ int kernel_kexec(void) kexec_in_progress = true; kernel_restart_prepare("kexec reboot"); migrate_to_reboot_cpu(); + syscore_shutdown(); /* * migrate_to_reboot_cpu() disables CPU hotplug assuming that diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f9a419cd22d4..bef2f6f2571b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -123,6 +123,8 @@ void kimage_file_post_load_cleanup(struct kimage *image) */ kfree(image->image_loader_data); image->image_loader_data = NULL; + + kexec_file_dbg_print = false; } #ifdef CONFIG_KEXEC_SIG @@ -202,6 +204,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, if (ret < 0) return ret; image->kernel_buf_len = ret; + kexec_dprintk("kernel: %p kernel_size: %#lx\n", + image->kernel_buf, image->kernel_buf_len); /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, @@ -278,6 +282,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, if (!image) return -ENOMEM; + kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG); image->file_mode = 1; if (kexec_on_panic) { @@ -384,13 +389,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + kexec_dprintk("nr_segments = %lu\n", image->nr_segments); for (i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; ksegment = &image->segment[i]; - pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", - i, ksegment->buf, ksegment->bufsz, ksegment->mem, - ksegment->memsz); + kexec_dprintk("segment[%d]: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); ret = kimage_load_segment(image, &image->segment[i]); if (ret) @@ -403,6 +409,8 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + kexec_dprintk("kexec_file_load: type:%u, start:0x%lx head:0x%lx flags:0x%lx\n", + image->type, image->start, image->head, flags); /* * Free up any temporary buffers allocated which are not needed * after image has been loaded @@ -426,11 +434,11 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, unsigned long temp_start, temp_end; temp_end = min(end, kbuf->buf_max); - temp_start = temp_end - kbuf->memsz; + temp_start = temp_end - kbuf->memsz + 1; do { /* align down start */ - temp_start = temp_start & (~(kbuf->buf_align - 1)); + temp_start = ALIGN_DOWN(temp_start, kbuf->buf_align); if (temp_start < start || temp_start < kbuf->buf_min) return 0; @@ -592,6 +600,8 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, func); + else if (kbuf->top_down) + return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func); else return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0c6185aefaef..9d9095e81792 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1877,13 +1877,27 @@ static struct notifier_block kprobe_exceptions_nb = { #ifdef CONFIG_KRETPROBES #if !defined(CONFIG_KRETPROBE_ON_RETHOOK) + +/* callbacks for objpool of kretprobe instances */ +static int kretprobe_init_inst(void *nod, void *context) +{ + struct kretprobe_instance *ri = nod; + + ri->rph = context; + return 0; +} +static int kretprobe_fini_pool(struct objpool_head *head, void *context) +{ + kfree(context); + return 0; +} + static void free_rp_inst_rcu(struct rcu_head *head) { struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); + struct kretprobe_holder *rph = ri->rph; - if (refcount_dec_and_test(&ri->rph->ref)) - kfree(ri->rph); - kfree(ri); + objpool_drop(ri, &rph->pool); } NOKPROBE_SYMBOL(free_rp_inst_rcu); @@ -1892,7 +1906,7 @@ static void recycle_rp_inst(struct kretprobe_instance *ri) struct kretprobe *rp = get_kretprobe(ri); if (likely(rp)) - freelist_add(&ri->freelist, &rp->freelist); + objpool_push(ri, &rp->rph->pool); else call_rcu(&ri->rcu, free_rp_inst_rcu); } @@ -1929,23 +1943,12 @@ NOKPROBE_SYMBOL(kprobe_flush_task); static inline void free_rp_inst(struct kretprobe *rp) { - struct kretprobe_instance *ri; - struct freelist_node *node; - int count = 0; - - node = rp->freelist.head; - while (node) { - ri = container_of(node, struct kretprobe_instance, freelist); - node = node->next; - - kfree(ri); - count++; - } + struct kretprobe_holder *rph = rp->rph; - if (refcount_sub_and_test(count, &rp->rph->ref)) { - kfree(rp->rph); - rp->rph = NULL; - } + if (!rph) + return; + rp->rph = NULL; + objpool_fini(&rph->pool); } /* This assumes the 'tsk' is the current task or the is not running. */ @@ -1990,7 +1993,7 @@ NOKPROBE_SYMBOL(__kretprobe_find_ret_addr); unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, struct llist_node **cur) { - struct kretprobe_instance *ri = NULL; + struct kretprobe_instance *ri; kprobe_opcode_t *ret; if (WARN_ON_ONCE(!cur)) @@ -2087,19 +2090,17 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); + struct kretprobe_holder *rph = rp->rph; struct kretprobe_instance *ri; - struct freelist_node *fn; - fn = freelist_try_get(&rp->freelist); - if (!fn) { + ri = objpool_pop(&rph->pool); + if (!ri) { rp->nmissed++; return 0; } - ri = container_of(fn, struct kretprobe_instance, freelist); - if (rp->entry_handler && rp->entry_handler(ri, regs)) { - freelist_add(&ri->freelist, &rp->freelist); + objpool_push(ri, &rph->pool); return 0; } @@ -2193,7 +2194,6 @@ int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long o int register_kretprobe(struct kretprobe *rp) { int ret; - struct kretprobe_instance *inst; int i; void *addr; @@ -2227,19 +2227,12 @@ int register_kretprobe(struct kretprobe *rp) rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); #ifdef CONFIG_KRETPROBE_ON_RETHOOK - rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler); - if (!rp->rh) - return -ENOMEM; + rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler, + sizeof(struct kretprobe_instance) + + rp->data_size, rp->maxactive); + if (IS_ERR(rp->rh)) + return PTR_ERR(rp->rh); - for (i = 0; i < rp->maxactive; i++) { - inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL); - if (inst == NULL) { - rethook_free(rp->rh); - rp->rh = NULL; - return -ENOMEM; - } - rethook_add_node(rp->rh, &inst->node); - } rp->nmissed = 0; /* Establish function entry probe point */ ret = register_kprobe(&rp->kp); @@ -2248,24 +2241,18 @@ int register_kretprobe(struct kretprobe *rp) rp->rh = NULL; } #else /* !CONFIG_KRETPROBE_ON_RETHOOK */ - rp->freelist.head = NULL; rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL); if (!rp->rph) return -ENOMEM; - rp->rph->rp = rp; - for (i = 0; i < rp->maxactive; i++) { - inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL); - if (inst == NULL) { - refcount_set(&rp->rph->ref, i); - free_rp_inst(rp); - return -ENOMEM; - } - inst->rph = rp->rph; - freelist_add(&inst->freelist, &rp->freelist); + if (objpool_init(&rp->rph->pool, rp->maxactive, rp->data_size + + sizeof(struct kretprobe_instance), GFP_KERNEL, + rp->rph, kretprobe_init_inst, kretprobe_fini_pool)) { + kfree(rp->rph); + rp->rph = NULL; + return -ENOMEM; } - refcount_set(&rp->rph->ref, i); - + rcu_assign_pointer(rp->rph->rp, rp); rp->nmissed = 0; /* Establish function entry probe point */ ret = register_kprobe(&rp->kp); @@ -2313,7 +2300,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num) #ifdef CONFIG_KRETPROBE_ON_RETHOOK rethook_free(rps[i]->rh); #else - rps[i]->rph->rp = NULL; + rcu_assign_pointer(rps[i]->rph->rp, NULL); #endif } mutex_unlock(&kprobe_mutex); @@ -2815,7 +2802,7 @@ static int show_kprobe_addr(struct seq_file *pi, void *v) { struct hlist_head *head; struct kprobe *p, *kp; - const char *sym = NULL; + const char *sym; unsigned int i = *(loff_t *) v; unsigned long offset = 0; char *modname, namebuf[KSYM_NAME_LEN]; diff --git a/kernel/kthread.c b/kernel/kthread.c index 1eea53050bab..c5e40830c1f2 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -715,6 +715,24 @@ int kthread_stop(struct task_struct *k) } EXPORT_SYMBOL(kthread_stop); +/** + * kthread_stop_put - stop a thread and put its task struct + * @k: thread created by kthread_create(). + * + * Stops a thread created by kthread_create() and put its task_struct. + * Only use when holding an extra task struct reference obtained by + * calling get_task_struct(). + */ +int kthread_stop_put(struct task_struct *k) +{ + int ret; + + ret = kthread_stop(k); + put_task_struct(k); + return ret; +} +EXPORT_SYMBOL(kthread_stop_put); + int kthreadd(void *unused) { struct task_struct *tsk = current; @@ -1469,7 +1487,6 @@ void kthread_unuse_mm(struct mm_struct *mm) * clearing tsk->mm. */ smp_mb__after_spinlock(); - sync_mm_rss(mm); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 61328328c474..ecbc9b6aba3a 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -243,7 +243,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, * symbols are exported and normal relas can be used instead. */ if (!sec_vmlinux && sym_vmlinux) { - pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section", + pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section\n", sym_name); return -EINVAL; } diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c index fa2c2f951c6b..e68d82099558 100644 --- a/kernel/locking/lock_events.c +++ b/kernel/locking/lock_events.c @@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void) struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); int i; - if (!d_counts) + if (IS_ERR(d_counts)) goto out; /* @@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void) for (i = 0; i < lockevent_num; i++) { if (skip_lockevent(lockevent_names[i])) continue; - if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, - (void *)(long)i, &fops_lockevent)) + if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent))) goto fail_undo; } - if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, d_counts, (void *)(long)LOCKEVENT_reset_cnts, - &fops_lockevent)) + &fops_lockevent))) goto fail_undo; return 0; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e85b5ad3e206..151bd3de5936 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3497,7 +3497,8 @@ static int alloc_chain_hlocks(int req) size = chain_block_size(curr); if (likely(size >= req)) { del_chain_block(0, size, chain_block_next(curr)); - add_chain_block(curr + req, size - req); + if (size > req) + add_chain_block(curr + req, size - req); return curr; } } diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 15fdc7fa5c68..e2bfb1db589d 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -440,7 +440,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr) static void seq_time(struct seq_file *m, s64 time) { - char num[15]; + char num[22]; snprint_time(num, sizeof(num), time); seq_printf(m, " %14s", num); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 270c7f80ce84..415d81e6ce70 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -33,21 +33,23 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); -torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); -torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies)."); +torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable)."); torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); +torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); +torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, rt_boost, 2, + "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); +torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(int, rt_boost, 2, - "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); -torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); -torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority"); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); -torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); +torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority"); /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ #define MAX_NESTED_LOCKS 8 @@ -56,6 +58,55 @@ module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); +static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs. +static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs. + +// Parse a cpumask kernel parameter. If there are more users later on, +// this might need to got to a more central location. +static int param_set_cpumask(const char *val, const struct kernel_param *kp) +{ + cpumask_var_t *cm_bind = kp->arg; + int ret; + char *s; + + if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) { + s = "Out of memory"; + ret = -ENOMEM; + goto out_err; + } + ret = cpulist_parse(val, *cm_bind); + if (!ret) + return ret; + s = "Bad CPU range"; +out_err: + pr_warn("%s: %s, all CPUs set\n", kp->name, s); + cpumask_setall(*cm_bind); + return ret; +} + +// Output a cpumask kernel parameter. +static int param_get_cpumask(char *buffer, const struct kernel_param *kp) +{ + cpumask_var_t *cm_bind = kp->arg; + + return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind)); +} + +static bool cpumask_nonempty(cpumask_var_t mask) +{ + return cpumask_available(mask) && !cpumask_empty(mask); +} + +static const struct kernel_param_ops lt_bind_ops = { + .set = param_set_cpumask, + .get = param_get_cpumask, +}; + +module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0644); +module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0644); + +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); + static struct task_struct *stats_task; static struct task_struct **writer_tasks; static struct task_struct **reader_tasks; @@ -69,6 +120,12 @@ struct lock_stress_stats { long n_lock_acquired; }; +struct call_rcu_chain { + struct rcu_head crc_rh; + bool crc_stop; +}; +struct call_rcu_chain *call_rcu_chain_list; + /* Forward reference. */ static void lock_torture_cleanup(void); @@ -116,12 +173,9 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused) static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -194,15 +248,14 @@ __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; unsigned long j; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) { + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) { j = jiffies; - mdelay(longdelay_ms); + mdelay(long_hold); pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j); } if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) @@ -320,14 +373,12 @@ __acquires(torture_rwlock) static void torture_rwlock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); else udelay(shortdelay_us); } @@ -348,14 +399,12 @@ __acquires(torture_rwlock) static void torture_rwlock_read_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 10; - const unsigned long longdelay_ms = 100; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealreaders_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold))) + mdelay(long_hold); else udelay(shortdelay_us); } @@ -453,12 +502,9 @@ __acquires(torture_mutex) static void torture_mutex_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 5); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold * 5); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -626,15 +672,13 @@ __acquires(torture_rtmutex) static void torture_rtmutex_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* * We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); @@ -691,12 +735,9 @@ __acquires(torture_rwsem) static void torture_rwsem_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 10); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold * 10); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -716,14 +757,11 @@ __acquires(torture_rwsem) static void torture_rwsem_read_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealreaders_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 2); + if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold))) + mdelay(long_hold * 2); else - mdelay(longdelay_ms / 2); + mdelay(long_hold / 2); if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -803,11 +841,13 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = { */ static int lock_torture_writer(void *arg) { + unsigned long j; + unsigned long j1; + u32 lockset_mask; struct lock_stress_stats *lwsp = arg; - int tid = lwsp - cxt.lwsa; DEFINE_TORTURE_RANDOM(rand); - u32 lockset_mask; bool skip_main_lock; + int tid = lwsp - cxt.lwsa; VERBOSE_TOROUT_STRING("lock_torture_writer task started"); if (!rt_task(current)) @@ -834,17 +874,24 @@ static int lock_torture_writer(void *arg) cxt.cur_ops->nested_lock(tid, lockset_mask); if (!skip_main_lock) { + if (acq_writer_lim > 0) + j = jiffies; cxt.cur_ops->writelock(tid); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; lock_is_write_held = true; if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) lwsp->n_lock_fail++; /* rare, but... */ - + if (acq_writer_lim > 0) { + j1 = jiffies; + WARN_ONCE(time_after(j1, j + acq_writer_lim), + "%s: Lock acquisition took %lu jiffies.\n", + __func__, j1 - j); + } lwsp->n_lock_acquired++; - } - if (!skip_main_lock) { + cxt.cur_ops->write_delay(&rand); + lock_is_write_held = false; WRITE_ONCE(last_lock_release, jiffies); cxt.cur_ops->writeunlock(tid); @@ -986,16 +1033,69 @@ static int lock_torture_stats(void *arg) return 0; } + static inline void lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { + static cpumask_t cpumask_all; + cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all; + cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all; + + cpumask_setall(&cpumask_all); pr_alert("%s" TORTURE_FLAG - "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n", torture_type, tag, cxt.debug_lock ? " [debug]": "", - cxt.nrealwriters_stress, cxt.nrealreaders_stress, - nested_locks, stat_interval, verbose, shuffle_interval, - stutter, shutdown_secs, onoff_interval, onoff_holdoff); + acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp), + call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress, + cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost, + rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter, + verbose, writer_fifo); +} + +// If requested, maintain call_rcu() chains to keep a grace period always +// in flight. These increase the probability of getting an RCU CPU stall +// warning and associated diagnostics when a locking primitive stalls. + +static void call_rcu_chain_cb(struct rcu_head *rhp) +{ + struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh); + + if (!smp_load_acquire(&crcp->crc_stop)) { + (void)start_poll_synchronize_rcu(); // Start one grace period... + call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another. + } +} + +// Start the requested number of call_rcu() chains. +static int call_rcu_chain_init(void) +{ + int i; + + if (call_rcu_chains <= 0) + return 0; + call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL); + if (!call_rcu_chain_list) + return -ENOMEM; + for (i = 0; i < call_rcu_chains; i++) { + call_rcu_chain_list[i].crc_stop = false; + call_rcu(&call_rcu_chain_list[i].crc_rh, call_rcu_chain_cb); + } + return 0; +} + +// Stop all of the call_rcu() chains. +static void call_rcu_chain_cleanup(void) +{ + int i; + + if (!call_rcu_chain_list) + return; + for (i = 0; i < call_rcu_chains; i++) + smp_store_release(&call_rcu_chain_list[i].crc_stop, true); + rcu_barrier(); + kfree(call_rcu_chain_list); + call_rcu_chain_list = NULL; } static void lock_torture_cleanup(void) @@ -1048,6 +1148,8 @@ static void lock_torture_cleanup(void) kfree(cxt.lrsa); cxt.lrsa = NULL; + call_rcu_chain_cleanup(); + end: if (cxt.init_called) { if (cxt.cur_ops->exit) @@ -1177,6 +1279,10 @@ static int __init lock_torture_init(void) } } + firsterr = call_rcu_chain_init(); + if (torture_init_error(firsterr)) + goto unwind; + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ @@ -1250,6 +1356,8 @@ static int __init lock_torture_init(void) writer_fifo ? sched_set_fifo : NULL); if (torture_init_error(firsterr)) goto unwind; + if (cpumask_nonempty(bind_writers)) + torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers); create_reader: if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) @@ -1259,6 +1367,8 @@ static int __init lock_torture_init(void) reader_tasks[j]); if (torture_init_error(firsterr)) goto unwind; + if (cpumask_nonempty(bind_readers)) + torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers); } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index d973fe6041bf..cbae8c0b89ab 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -532,6 +532,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne * This function must not be used in interrupt context. Unlocking * of a not locked mutex is not allowed. * + * The caller must ensure that the mutex stays alive until this function has + * returned - mutex_unlock() can NOT directly be used to release an object such + * that another concurrent task can free it. + * Mutexes are different from spinlocks & refcounts in this aspect. + * * This function is similar to (but not equivalent to) up(). */ void __sched mutex_unlock(struct mutex *lock) @@ -1126,6 +1131,9 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible); #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #endif /* !CONFIG_PREEMPT_RT */ +EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin); +EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end); + /** * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 * @cnt: the atomic which we are to dec diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index d5610ad52b92..75a6f6133866 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -11,6 +11,13 @@ * called from interrupt context and we have preemption disabled while * spinning. */ + +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # + 1 value */ +}; + static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); /* @@ -37,32 +44,28 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) /* * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. * Can return NULL in case we were the last queued and we updated @lock instead. + * + * If osq_lock() is being cancelled there must be a previous node + * and 'old_cpu' is its CPU #. + * For osq_unlock() there is never a previous node and old_cpu is + * set to OSQ_UNLOCKED_VAL. */ static inline struct optimistic_spin_node * osq_wait_next(struct optimistic_spin_queue *lock, struct optimistic_spin_node *node, - struct optimistic_spin_node *prev) + int old_cpu) { - struct optimistic_spin_node *next = NULL; int curr = encode_cpu(smp_processor_id()); - int old; - - /* - * If there is a prev node in queue, then the 'old' value will be - * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if - * we're currently last in queue, then the queue will then become empty. - */ - old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; for (;;) { if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) { + atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) { /* * We were the last queued, we moved @lock back. @prev * will now observe @lock and will complete its * unlock()/unqueue(). */ - break; + return NULL; } /* @@ -76,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock, * wait for a new @node->next from its Step-C. */ if (node->next) { + struct optimistic_spin_node *next; + next = xchg(&node->next, NULL); if (next) - break; + return next; } cpu_relax(); } - - return next; } bool osq_lock(struct optimistic_spin_queue *lock) @@ -186,7 +189,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) * back to @prev. */ - next = osq_wait_next(lock, node, prev); + next = osq_wait_next(lock, node, prev->cpu); if (!next) return false; @@ -226,7 +229,7 @@ void osq_unlock(struct optimistic_spin_queue *lock) return; } - next = osq_wait_next(lock, node, NULL); + next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL); if (next) WRITE_ONCE(next->locked, 1); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 21db0df0eb00..4a10e8c16fd2 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -218,6 +218,11 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, return try_cmpxchg_acquire(&lock->owner, &old, new); } +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + return rt_mutex_cmpxchg_acquire(lock, NULL, current); +} + static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) @@ -297,6 +302,20 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, } +static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock); + +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + /* + * With debug enabled rt_mutex_cmpxchg trylock() will always fail. + * + * Avoid unconditionally taking the slow path by using + * rt_mutex_slow_trylock() which is covered by the debug code and can + * acquire a non-contended rtmutex. + */ + return rt_mutex_slowtrylock(lock); +} + static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) @@ -1613,7 +1632,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, raw_spin_unlock_irq(&lock->wait_lock); if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) - schedule(); + rt_mutex_schedule(); raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @@ -1642,7 +1661,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, WARN(1, "rtmutex deadlock detected\n"); while (1) { set_current_state(TASK_INTERRUPTIBLE); - schedule(); + rt_mutex_schedule(); } } @@ -1738,6 +1757,15 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, int ret; /* + * Do all pre-schedule work here, before we queue a waiter and invoke + * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would + * otherwise recurse back into task_blocks_on_rt_mutex() through + * rtlock_slowlock() and will then enqueue a second waiter for this + * same task and things get really confusing real fast. + */ + rt_mutex_pre_schedule(); + + /* * Technically we could use raw_spin_[un]lock_irq() here, but this can * be called in early boot if the cmpxchg() fast path is disabled * (debug, no architecture support). In this case we will acquire the @@ -1748,6 +1776,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + rt_mutex_post_schedule(); return ret; } @@ -1755,7 +1784,9 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, unsigned int state) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + lockdep_assert(!current->pi_blocked_on); + + if (likely(rt_mutex_try_acquire(lock))) return 0; return rt_mutex_slowlock(lock, NULL, state); diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 25ec0239477c..34a59569db6b 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -71,6 +71,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, struct rt_mutex_base *rtm = &rwb->rtmutex; int ret; + rwbase_pre_schedule(); raw_spin_lock_irq(&rtm->wait_lock); /* @@ -125,12 +126,15 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, rwbase_rtmutex_unlock(rtm); trace_contention_end(rwb, ret); + rwbase_post_schedule(); return ret; } static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb, unsigned int state) { + lockdep_assert(!current->pi_blocked_on); + if (rwbase_read_trylock(rwb)) return 0; @@ -237,6 +241,8 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, /* Force readers into slow path */ atomic_sub(READER_BIAS, &rwb->readers); + rwbase_pre_schedule(); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); if (__rwbase_write_trylock(rwb)) goto out_unlock; @@ -248,6 +254,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, if (rwbase_signal_pending_state(state, current)) { rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); + rwbase_post_schedule(); trace_contention_end(rwb, -EINTR); return -EINTR; } @@ -266,6 +273,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, out_unlock: raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_post_schedule(); return 0; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 9eabd585ce7a..2340b6d90ec6 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1427,8 +1427,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem) #define rwbase_signal_pending_state(state, current) \ signal_pending_state(state, current) +#define rwbase_pre_schedule() \ + rt_mutex_pre_schedule() + #define rwbase_schedule() \ - schedule() + rt_mutex_schedule() + +#define rwbase_post_schedule() \ + rt_mutex_post_schedule() #include "rwbase_rt.c" diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 14235671a1a7..87b03d2e41db 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -12,6 +12,7 @@ #include <linux/debug_locks.h> #include <linux/delay.h> #include <linux/export.h> +#include <linux/pid.h> void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key, short inner) diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index 48a19ed8486d..38e292454fcc 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -37,6 +37,8 @@ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) { + lockdep_assert(!current->pi_blocked_on); + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) rtlock_slowlock(rtm); } @@ -184,9 +186,13 @@ static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm) #define rwbase_signal_pending_state(state, current) (0) +#define rwbase_pre_schedule() + #define rwbase_schedule() \ schedule_rtlock() +#define rwbase_post_schedule() + #include "rwbase_rt.c" /* * The common functions which get wrapped into the rwlock API. diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 93cca6e69860..78719e1ef1b1 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -9,7 +9,7 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/module.h> -#include <linux/random.h> +#include <linux/prandom.h> #include <linux/slab.h> #include <linux/ww_mutex.h> @@ -386,6 +386,19 @@ struct stress { int nlocks; }; +struct rnd_state rng; +DEFINE_SPINLOCK(rng_lock); + +static inline u32 prandom_u32_below(u32 ceil) +{ + u32 ret; + + spin_lock(&rng_lock); + ret = prandom_u32_state(&rng) % ceil; + spin_unlock(&rng_lock); + return ret; +} + static int *get_random_order(int count) { int *order; @@ -399,7 +412,7 @@ static int *get_random_order(int count) order[n] = n; for (n = count - 1; n > 1; n--) { - r = get_random_u32_below(n + 1); + r = prandom_u32_below(n + 1); if (r != n) { tmp = order[n]; order[n] = order[r]; @@ -452,21 +465,21 @@ retry: ww_mutex_unlock(&locks[order[n]]); if (err == -EDEADLK) { - ww_mutex_lock_slow(&locks[order[contended]], &ctx); - goto retry; + if (!time_after(jiffies, stress->timeout)) { + ww_mutex_lock_slow(&locks[order[contended]], &ctx); + goto retry; + } } + ww_acquire_fini(&ctx); if (err) { pr_err_once("stress (%s) failed with %d\n", __func__, err); break; } - - ww_acquire_fini(&ctx); } while (!time_after(jiffies, stress->timeout)); kfree(order); - kfree(stress); } struct reorder_lock { @@ -531,7 +544,6 @@ out: list_for_each_entry_safe(ll, ln, &locks, link) kfree(ll); kfree(order); - kfree(stress); } static void stress_one_work(struct work_struct *work) @@ -552,8 +564,6 @@ static void stress_one_work(struct work_struct *work) break; } } while (!time_after(jiffies, stress->timeout)); - - kfree(stress); } #define STRESS_INORDER BIT(0) @@ -564,15 +574,24 @@ static void stress_one_work(struct work_struct *work) static int stress(int nlocks, int nthreads, unsigned int flags) { struct ww_mutex *locks; - int n; + struct stress *stress_array; + int n, count; locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL); if (!locks) return -ENOMEM; + stress_array = kmalloc_array(nthreads, sizeof(*stress_array), + GFP_KERNEL); + if (!stress_array) { + kfree(locks); + return -ENOMEM; + } + for (n = 0; n < nlocks; n++) ww_mutex_init(&locks[n], &ww_class); + count = 0; for (n = 0; nthreads; n++) { struct stress *stress; void (*fn)(struct work_struct *work); @@ -596,9 +615,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) if (!fn) continue; - stress = kmalloc(sizeof(*stress), GFP_KERNEL); - if (!stress) - break; + stress = &stress_array[count++]; INIT_WORK(&stress->work, fn); stress->locks = locks; @@ -613,6 +630,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) for (n = 0; n < nlocks; n++) ww_mutex_destroy(&locks[n]); + kfree(stress_array); kfree(locks); return 0; @@ -625,6 +643,8 @@ static int __init test_ww_mutex_init(void) printk(KERN_INFO "Beginning ww mutex selftests\n"); + prandom_seed_state(&rng, get_random_u64()); + wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); if (!wq) return -ENOMEM; diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index d1473c624105..c7196de838ed 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -62,7 +62,7 @@ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx, } mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip); - if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) { + if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) { if (ww_ctx) ww_mutex_set_context_fastpath(lock, ww_ctx); return 0; diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 33a2e991f608..0ea1b2970a23 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -236,14 +236,6 @@ choice possible to load a signed module containing the algorithm to check the signature on that module. -config MODULE_SIG_SHA1 - bool "Sign modules with SHA-1" - select CRYPTO_SHA1 - -config MODULE_SIG_SHA224 - bool "Sign modules with SHA-224" - select CRYPTO_SHA256 - config MODULE_SIG_SHA256 bool "Sign modules with SHA-256" select CRYPTO_SHA256 @@ -256,16 +248,29 @@ config MODULE_SIG_SHA512 bool "Sign modules with SHA-512" select CRYPTO_SHA512 +config MODULE_SIG_SHA3_256 + bool "Sign modules with SHA3-256" + select CRYPTO_SHA3 + +config MODULE_SIG_SHA3_384 + bool "Sign modules with SHA3-384" + select CRYPTO_SHA3 + +config MODULE_SIG_SHA3_512 + bool "Sign modules with SHA3-512" + select CRYPTO_SHA3 + endchoice config MODULE_SIG_HASH string depends on MODULE_SIG || IMA_APPRAISE_MODSIG - default "sha1" if MODULE_SIG_SHA1 - default "sha224" if MODULE_SIG_SHA224 default "sha256" if MODULE_SIG_SHA256 default "sha384" if MODULE_SIG_SHA384 default "sha512" if MODULE_SIG_SHA512 + default "sha3-256" if MODULE_SIG_SHA3_256 + default "sha3-384" if MODULE_SIG_SHA3_384 + default "sha3-512" if MODULE_SIG_SHA3_512 choice prompt "Module compression mode" diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index 87440f714c0c..474e68f0f063 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -100,7 +100,7 @@ static ssize_t module_gzip_decompress(struct load_info *info, s.next_in = buf + gzip_hdr_len; s.avail_in = size - gzip_hdr_len; - s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); + s.workspace = kvmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); if (!s.workspace) return -ENOMEM; @@ -138,7 +138,7 @@ static ssize_t module_gzip_decompress(struct load_info *info, out_inflate_end: zlib_inflateEnd(&s); out: - kfree(s.workspace); + kvfree(s.workspace); return retval; } #elif defined(CONFIG_MODULE_COMPRESS_XZ) @@ -241,7 +241,7 @@ static ssize_t module_zstd_decompress(struct load_info *info, } wksp_size = zstd_dstream_workspace_bound(header.windowSize); - wksp = vmalloc(wksp_size); + wksp = kvmalloc(wksp_size, GFP_KERNEL); if (!wksp) { retval = -ENOMEM; goto out; @@ -284,7 +284,7 @@ static ssize_t module_zstd_decompress(struct load_info *info, retval = new_size; out: - vfree(wksp); + kvfree(wksp); return retval; } #else diff --git a/kernel/module/dups.c b/kernel/module/dups.c index f3d7ea1e96d8..9a92f2f8c9d3 100644 --- a/kernel/module/dups.c +++ b/kernel/module/dups.c @@ -207,7 +207,7 @@ bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret) * optimization enabled ... */ ret = wait_for_completion_state(&kmod_req->first_req_done, - TASK_UNINTERRUPTIBLE | TASK_KILLABLE); + TASK_KILLABLE); if (ret) { *dup_ret = ret; return true; diff --git a/kernel/module/main.c b/kernel/module/main.c index 98fedfdb8db5..36681911c05a 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2199,6 +2199,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->kunit_suites = section_objs(info, ".kunit_test_suites", sizeof(*mod->kunit_suites), &mod->num_kunit_suites); + mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites", + sizeof(*mod->kunit_init_suites), + &mod->num_kunit_init_suites); #endif mod->extable = section_objs(info, "__ex_table", diff --git a/kernel/module/stats.c b/kernel/module/stats.c index 6ab2c94d6bc3..3ba0e98b3c91 100644 --- a/kernel/module/stats.c +++ b/kernel/module/stats.c @@ -126,7 +126,7 @@ static LIST_HEAD(dup_failed_modules); * These typically should not happen unless your system is under memory * pressure. * * invalid_becoming_bytes: total number of bytes allocated and freed used - * used to read the kernel module userspace wants us to read before we + * to read the kernel module userspace wants us to read before we * promote it to be processed to be added to our @modules linked list. These * failures can happen if we had a check in between a successful kernel_read_file_from_fd() * call and right before we allocate the our private memory for the module diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index c921bf044050..d964167c6658 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -143,7 +143,7 @@ static void remove_sect_attrs(struct module *mod) struct module_notes_attrs { struct kobject *dir; unsigned int notes; - struct bin_attribute attrs[]; + struct bin_attribute attrs[] __counted_by(notes); }; static ssize_t module_notes_read(struct file *filp, struct kobject *kobj, diff --git a/kernel/numa.c b/kernel/numa.c new file mode 100644 index 000000000000..67ca6b8585c0 --- /dev/null +++ b/kernel/numa.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/printk.h> +#include <linux/numa.h> + +/* Stub functions: */ + +#ifndef memory_add_physaddr_to_nid +int memory_add_physaddr_to_nid(u64 start) +{ + pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + +#ifndef phys_to_target_node +int phys_to_target_node(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(phys_to_target_node); +#endif diff --git a/kernel/padata.c b/kernel/padata.c index 222d60195de6..179fb1518070 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -202,7 +202,7 @@ int padata_do_parallel(struct padata_shell *ps, *cb_cpu = cpu; } - err = -EBUSY; + err = -EBUSY; if ((pinst->flags & PADATA_RESET)) goto out; @@ -1102,12 +1102,16 @@ EXPORT_SYMBOL(padata_alloc_shell); */ void padata_free_shell(struct padata_shell *ps) { + struct parallel_data *pd; + if (!ps) return; mutex_lock(&ps->pinst->lock); list_del(&ps->list); - padata_free_pd(rcu_dereference_protected(ps->pd, 1)); + pd = rcu_dereference_protected(ps->pd, 1); + if (refcount_dec_and_test(&pd->refcnt)) + padata_free_pd(pd); mutex_unlock(&ps->pinst->lock); kfree(ps); diff --git a/kernel/panic.c b/kernel/panic.c index 07239d4ad81e..2807639aab51 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -192,14 +192,15 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); */ void nmi_panic(struct pt_regs *regs, const char *msg) { - int old_cpu, cpu; + int old_cpu, this_cpu; - cpu = raw_smp_processor_id(); - old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); - if (old_cpu == PANIC_CPU_INVALID) + /* atomic_try_cmpxchg updates old_cpu on failure */ + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) panic("%s", msg); - else if (old_cpu != cpu) + else if (old_cpu != this_cpu) nmi_panic_self_stop(regs); } EXPORT_SYMBOL(nmi_panic); @@ -311,15 +312,18 @@ void panic(const char *fmt, ...) * stop themself or will wait until they are stopped by the 1st CPU * with smp_send_stop(). * - * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which - * comes here, so go ahead. + * cmpxchg success means this is the 1st CPU which comes here, + * so go ahead. * `old_cpu == this_cpu' means we came from nmi_panic() which sets * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ + old_cpu = PANIC_CPU_INVALID; this_cpu = raw_smp_processor_id(); - old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); - if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) + /* atomic_try_cmpxchg updates old_cpu on failure */ + if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + /* go ahead */ + } else if (old_cpu != this_cpu) panic_smp_self_stop(); console_verbose(); @@ -697,6 +701,7 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, if (!fmt) { __warn(file, line, __builtin_return_address(0), taint, NULL, NULL); + warn_rcu_exit(rcu); return; } diff --git a/kernel/params.c b/kernel/params.c index 2d4a0564697e..2e447f8ae183 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -1,19 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* Helpers for initial module or kernel cmdline parsing - Copyright (C) 2001 Rusty Russell. - -*/ +/* + * Helpers for initial module or kernel cmdline parsing + * Copyright (C) 2001 Rusty Russell. + */ +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/errno.h> #include <linux/kernel.h> #include <linux/kstrtox.h> -#include <linux/string.h> -#include <linux/errno.h> #include <linux/module.h> #include <linux/moduleparam.h> -#include <linux/device.h> -#include <linux/err.h> -#include <linux/slab.h> -#include <linux/ctype.h> +#include <linux/overflow.h> #include <linux/security.h> +#include <linux/slab.h> +#include <linux/string.h> #ifdef CONFIG_SYSFS /* Protects all built-in parameters, modules use their own param_lock */ @@ -48,7 +49,7 @@ static void *kmalloc_parameter(unsigned int size) { struct kmalloced_param *p; - p = kmalloc(sizeof(*p) + size, GFP_KERNEL); + p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL); if (!p) return NULL; @@ -120,9 +121,7 @@ static int parse_one(char *param, unsigned num_params, s16 min_level, s16 max_level, - void *arg, - int (*handle_unknown)(char *param, char *val, - const char *doing, void *arg)) + void *arg, parse_unknown_fn handle_unknown) { unsigned int i; int err; @@ -165,9 +164,7 @@ char *parse_args(const char *doing, unsigned num, s16 min_level, s16 max_level, - void *arg, - int (*unknown)(char *param, char *val, - const char *doing, void *arg)) + void *arg, parse_unknown_fn unknown) { char *param, *val, *err = NULL; @@ -264,17 +261,22 @@ EXPORT_SYMBOL_GPL(param_set_uint_minmax); int param_set_charp(const char *val, const struct kernel_param *kp) { - if (strlen(val) > 1024) { + size_t len, maxlen = 1024; + + len = strnlen(val, maxlen + 1); + if (len == maxlen + 1) { pr_err("%s: string parameter too long\n", kp->name); return -ENOSPC; } maybe_kfree_parameter(*(char **)kp->arg); - /* This is a hack. We can't kmalloc in early boot, and we - * don't need to; this mangled commandline is preserved. */ + /* + * This is a hack. We can't kmalloc() in early boot, and we + * don't need to; this mangled commandline is preserved. + */ if (slab_is_available()) { - *(char **)kp->arg = kmalloc_parameter(strlen(val)+1); + *(char **)kp->arg = kmalloc_parameter(len + 1); if (!*(char **)kp->arg) return -ENOMEM; strcpy(*(char **)kp->arg, val); @@ -512,7 +514,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - if (strlen(val)+1 > kps->maxlen) { + if (strnlen(val, kps->maxlen) == kps->maxlen) { pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); return -ENOSPC; @@ -743,8 +745,10 @@ void module_param_sysfs_remove(struct module *mod) { if (mod->mkobj.mp) { sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); - /* We are positive that no one is using any param - * attrs at this point. Deallocate immediately. */ + /* + * We are positive that no one is using any param + * attrs at this point. Deallocate immediately. + */ free_module_param_attrs(&mod->mkobj); } } diff --git a/kernel/pid.c b/kernel/pid.c index fee14a4486a3..b52b10865454 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -609,7 +609,7 @@ int pidfd_create(struct pid *pid, unsigned int flags) } /** - * pidfd_open() - Open new pid file descriptor. + * sys_pidfd_open() - Open new pid file descriptor. * * @pid: pid for which to retrieve a pidfd * @flags: flags to pass @@ -700,7 +700,7 @@ static int pidfd_getfd(struct pid *pid, int fd) if (IS_ERR(file)) return PTR_ERR(file); - ret = receive_fd(file, O_CLOEXEC); + ret = receive_fd(file, NULL, O_CLOEXEC); fput(file); return ret; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 619972c78774..7ade20e95232 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,7 @@ #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/idr.h> +#include <uapi/linux/wait.h> #include "pid_sysctl.h" static DEFINE_MUTEX(pid_caches_mutex); @@ -286,12 +287,6 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) return -EPERM; - /* - * Writing directly to ns' last_pid field is OK, since this field - * is volatile in a living namespace anyway and a code writing to - * it should synchronize its usage with external means. - */ - next = idr_get_cursor(&pid_ns->idr) - 1; tmp.data = &next; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 2b4a946a6ff5..4b0b7cf2e019 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -642,9 +642,9 @@ int hibernation_platform_enter(void) */ static void power_down(void) { -#ifdef CONFIG_SUSPEND int error; +#ifdef CONFIG_SUSPEND if (hibernation_mode == HIBERNATION_SUSPEND) { error = suspend_devices_and_enter(mem_sleep_current); if (error) { @@ -667,7 +667,13 @@ static void power_down(void) kernel_restart(NULL); break; case HIBERNATION_PLATFORM: - hibernation_platform_enter(); + error = hibernation_platform_enter(); + if (error == -EAGAIN || error == -EBUSY) { + swsusp_unmark(); + events_check_enabled = false; + pr_info("Wakeup event detected during hibernation, rolling back.\n"); + return; + } fallthrough; case HIBERNATION_SHUTDOWN: if (kernel_can_power_off()) @@ -684,7 +690,7 @@ static void power_down(void) cpu_relax(); } -static int load_image_and_restore(bool snapshot_test) +static int load_image_and_restore(void) { int error; unsigned int flags; @@ -694,12 +700,12 @@ static int load_image_and_restore(bool snapshot_test) lock_device_hotplug(); error = create_basic_memory_bitmaps(); if (error) { - swsusp_close(snapshot_test); + swsusp_close(); goto Unlock; } error = swsusp_read(&flags); - swsusp_close(snapshot_test); + swsusp_close(); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); @@ -786,9 +792,9 @@ int hibernate(void) unlock_device_hotplug(); if (snapshot_test) { pm_pr_dbg("Checking hibernation image\n"); - error = swsusp_check(snapshot_test); + error = swsusp_check(false); if (!error) - error = load_image_and_restore(snapshot_test); + error = load_image_and_restore(); } thaw_processes(); @@ -945,14 +951,14 @@ static int software_resume(void) pm_pr_dbg("Looking for hibernation image.\n"); mutex_lock(&system_transition_mutex); - error = swsusp_check(false); + error = swsusp_check(true); if (error) goto Unlock; /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(false); + swsusp_close(); goto Unlock; } @@ -973,7 +979,7 @@ static int software_resume(void) goto Close_Finish; } - error = load_image_and_restore(false); + error = load_image_and_restore(); thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); @@ -987,7 +993,7 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(false); + swsusp_close(); goto Finish; } diff --git a/kernel/power/main.c b/kernel/power/main.c index f6425ae3e8b0..b1ae9b677d03 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -60,22 +60,6 @@ EXPORT_SYMBOL_GPL(lock_system_sleep); void unlock_system_sleep(unsigned int flags) { - /* - * Don't use freezer_count() because we don't want the call to - * try_to_freeze() here. - * - * Reason: - * Fundamentally, we just don't need it, because freezing condition - * doesn't come into effect until we release the - * system_transition_mutex lock, since the freezer always works with - * system_transition_mutex held. - * - * More importantly, in the case of hibernation, - * unlock_system_sleep() gets called in snapshot_read() and - * snapshot_write() when the freezing condition is still in effect. - * Which means, if we use try_to_freeze() here, it would make them - * enter the refrigerator, thus causing hibernation to lockup. - */ if (!(flags & PF_NOFREEZE)) current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); diff --git a/kernel/power/power.h b/kernel/power/power.h index 46eb14dc50c3..8499a39c62f4 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -168,13 +168,15 @@ extern int swsusp_swap_in_use(void); #define SF_HW_SIG 8 /* kernel/power/hibernate.c */ -int swsusp_check(bool snapshot_test); +int swsusp_check(bool exclusive); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -void swsusp_close(bool snapshot_test); +void swsusp_close(void); #ifdef CONFIG_SUSPEND extern int swsusp_unmark(void); +#else +static inline int swsusp_unmark(void) { return 0; } #endif struct __kernel_old_timeval; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 87e9f7e2bdc0..5c96ff067c64 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1119,7 +1119,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm) int create_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - int error = 0; + int error; if (forbidden_pages_map && free_pages_map) return 0; @@ -1487,11 +1487,11 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { - src = kmap_atomic(s_page); - dst = kmap_atomic(d_page); + src = kmap_local_page(s_page); + dst = kmap_local_page(d_page); zeros_only = do_copy_page(dst, src); - kunmap_atomic(dst); - kunmap_atomic(src); + kunmap_local(dst); + kunmap_local(src); } else { if (PageHighMem(d_page)) { /* @@ -1499,9 +1499,9 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) * data modified by kmap_atomic() */ zeros_only = safe_copy_page(buffer, s_page); - dst = kmap_atomic(d_page); + dst = kmap_local_page(d_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); } else { zeros_only = safe_copy_page(page_address(d_page), s_page); } @@ -2545,8 +2545,9 @@ static void *get_highmem_page_buffer(struct page *page, pbe->copy_page = tmp; } else { /* Copy of the page will be stored in normal memory */ - kaddr = safe_pages_list; - safe_pages_list = safe_pages_list->next; + kaddr = __get_safe_page(ca->gfp_mask); + if (!kaddr) + return ERR_PTR(-ENOMEM); pbe->copy_page = virt_to_page(kaddr); } pbe->next = highmem_pblist; @@ -2647,7 +2648,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm, memory_bm_free(bm, PG_UNSAFE_KEEP); /* Make a copy of zero_bm so it can be created in safe pages */ - error = memory_bm_create(&tmp, GFP_ATOMIC, PG_ANY); + error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE); if (error) goto Free; @@ -2660,7 +2661,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm, goto Free; duplicate_memory_bitmap(zero_bm, &tmp); - memory_bm_free(&tmp, PG_UNSAFE_KEEP); + memory_bm_free(&tmp, PG_UNSAFE_CLEAR); /* At this point zero_bm is in safe pages and it can be used for restoring. */ if (nr_highmem > 0) { @@ -2750,8 +2751,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) return ERR_PTR(-ENOMEM); } pbe->orig_address = page_address(page); - pbe->address = safe_pages_list; - safe_pages_list = safe_pages_list->next; + pbe->address = __get_safe_page(ca->gfp_mask); + if (!pbe->address) + return ERR_PTR(-ENOMEM); pbe->next = restore_pblist; restore_pblist = pbe; return pbe->address; @@ -2776,15 +2778,13 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; - int error = 0; + int error; next: /* Check if we have already loaded the entire image */ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) return 0; - handle->sync_read = 1; - if (!handle->cur) { if (!buffer) /* This makes the buffer be freed by swsusp_free() */ @@ -2827,7 +2827,6 @@ next: memory_bm_position_reset(&zero_bm); restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); } @@ -2837,9 +2836,8 @@ next: handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); - if (handle->buffer != buffer) - handle->sync_read = 0; } + handle->sync_read = (handle->buffer == buffer); handle->cur++; /* Zero pages were not included in the image, memset it and move on. */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f6ebcd00c410..6053ddddaf65 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -222,7 +222,7 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -static struct block_device *hib_resume_bdev; +static struct bdev_handle *hib_resume_bdev_handle; struct hib_bio_batch { atomic_t count; @@ -276,7 +276,8 @@ static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(hib_resume_bdev, 1, opf, GFP_NOIO | __GFP_HIGH); + bio = bio_alloc(hib_resume_bdev_handle->bdev, 1, opf, + GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { @@ -356,14 +357,14 @@ static int swsusp_swap_check(void) return res; root_swap = res; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device, BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(hib_resume_bdev)) - return PTR_ERR(hib_resume_bdev); + if (IS_ERR(hib_resume_bdev_handle)) + return PTR_ERR(hib_resume_bdev_handle); - res = set_blocksize(hib_resume_bdev, PAGE_SIZE); + res = set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE); if (res < 0) - blkdev_put(hib_resume_bdev, NULL); + bdev_release(hib_resume_bdev_handle); return res; } @@ -443,14 +444,14 @@ static int get_swap_writer(struct swap_map_handle *handle) err_rel: release_swap_writer(handle); err_close: - swsusp_close(false); + swsusp_close(); return ret; } static int swap_write_page(struct swap_map_handle *handle, void *buf, struct hib_bio_batch *hb) { - int error = 0; + int error; sector_t offset; if (!handle->cur) @@ -508,7 +509,7 @@ static int swap_writer_finish(struct swap_map_handle *handle, if (error) free_all_swap_pages(root_swap); release_swap_writer(handle); - swsusp_close(false); + swsusp_close(); return error; } @@ -605,11 +606,11 @@ static int crc32_threadfn(void *data) unsigned i; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -618,7 +619,7 @@ static int crc32_threadfn(void *data) for (i = 0; i < d->run_threads; i++) *d->crc32 = crc32_le(*d->crc32, d->unc[i], *d->unc_len[i]); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -648,12 +649,12 @@ static int lzo_compress_threadfn(void *data) struct cmp_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -662,7 +663,7 @@ static int lzo_compress_threadfn(void *data) d->ret = lzo1x_1_compress(d->unc, d->unc_len, d->cmp + LZO_HEADER, &d->cmp_len, d->wrk); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -797,7 +798,7 @@ static int save_image_lzo(struct swap_map_handle *handle, data[thr].unc_len = off; - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } @@ -805,12 +806,12 @@ static int save_image_lzo(struct swap_map_handle *handle, break; crc->run_threads = thr; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, - atomic_read(&data[thr].stop)); + atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; @@ -849,7 +850,7 @@ static int save_image_lzo(struct swap_map_handle *handle, } } - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } @@ -1131,12 +1132,12 @@ static int lzo_decompress_threadfn(void *data) struct dec_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -1149,7 +1150,7 @@ static int lzo_decompress_threadfn(void *data) flush_icache_range((unsigned long)d->unc, (unsigned long)d->unc + d->unc_len); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; @@ -1334,7 +1335,7 @@ static int load_image_lzo(struct swap_map_handle *handle, } if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); crc->run_threads = 0; } @@ -1370,7 +1371,7 @@ static int load_image_lzo(struct swap_map_handle *handle, pg = 0; } - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } @@ -1389,7 +1390,7 @@ static int load_image_lzo(struct swap_map_handle *handle, for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, - atomic_read(&data[thr].stop)); + atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; @@ -1420,7 +1421,7 @@ static int load_image_lzo(struct swap_map_handle *handle, ret = snapshot_write_next(snapshot); if (ret <= 0) { crc->run_threads = thr + 1; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); goto out_finish; } @@ -1428,13 +1429,13 @@ static int load_image_lzo(struct swap_map_handle *handle, } crc->run_threads = thr; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); } out_finish: if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } stop = ktime_get(); @@ -1513,18 +1514,19 @@ end: static void *swsusp_holder; /** - * swsusp_check - Check for swsusp signature in the resume device + * swsusp_check - Open the resume device and check for the swsusp signature. + * @exclusive: Open the resume device exclusively. */ -int swsusp_check(bool snapshot_test) +int swsusp_check(bool exclusive) { - void *holder = snapshot_test ? &swsusp_holder : NULL; + void *holder = exclusive ? &swsusp_holder : NULL; int error; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ, - holder, NULL); - if (!IS_ERR(hib_resume_bdev)) { - set_blocksize(hib_resume_bdev, PAGE_SIZE); + hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device, + BLK_OPEN_READ, holder, NULL); + if (!IS_ERR(hib_resume_bdev_handle)) { + set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE); clear_page(swsusp_header); error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); @@ -1549,11 +1551,11 @@ int swsusp_check(bool snapshot_test) put: if (error) - blkdev_put(hib_resume_bdev, holder); + bdev_release(hib_resume_bdev_handle); else pr_debug("Image signature found, resuming\n"); } else { - error = PTR_ERR(hib_resume_bdev); + error = PTR_ERR(hib_resume_bdev_handle); } if (error) @@ -1563,17 +1565,17 @@ put: } /** - * swsusp_close - close swap device. + * swsusp_close - close resume device. */ -void swsusp_close(bool snapshot_test) +void swsusp_close(void) { - if (IS_ERR(hib_resume_bdev)) { + if (IS_ERR(hib_resume_bdev_handle)) { pr_debug("Image device not initialised\n"); return; } - blkdev_put(hib_resume_bdev, snapshot_test ? &swsusp_holder : NULL); + bdev_release(hib_resume_bdev_handle); } /** diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index f5b388e810b9..39a2b61c7232 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y = printk.o -obj-$(CONFIG_PRINTK) += printk_safe.o +obj-$(CONFIG_PRINTK) += printk_safe.o nbcon.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o obj-$(CONFIG_PRINTK_INDEX) += index.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 2a17704136f1..6c2afee5ef62 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -3,6 +3,8 @@ * internal.h - printk internal definitions */ #include <linux/percpu.h> +#include <linux/console.h> +#include "printk_ringbuffer.h" #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) void __init printk_sysctl_init(void); @@ -12,6 +14,12 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, #define printk_sysctl_init() do { } while (0) #endif +#define con_printk(lvl, con, fmt, ...) \ + printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt), \ + (con->flags & CON_NBCON) ? "" : "legacy ", \ + (con->flags & CON_BOOT) ? "boot" : "", \ + con->name, con->index, ##__VA_ARGS__) + #ifdef CONFIG_PRINTK #ifdef CONFIG_PRINTK_CALLER @@ -35,6 +43,8 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; +extern struct printk_ringbuffer *prb; + __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, @@ -61,6 +71,13 @@ void defer_console_output(void); u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags); + +u64 nbcon_seq_read(struct console *con); +void nbcon_seq_force(struct console *con, u64 seq); +bool nbcon_alloc(struct console *con); +void nbcon_init(struct console *con); +void nbcon_free(struct console *con); + #else #define PRINTK_PREFIX_MAX 0 @@ -76,8 +93,16 @@ u16 printk_parse_prefix(const char *text, int *level, #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) static inline bool printk_percpu_data_ready(void) { return false; } +static inline u64 nbcon_seq_read(struct console *con) { return 0; } +static inline void nbcon_seq_force(struct console *con, u64 seq) { } +static inline bool nbcon_alloc(struct console *con) { return false; } +static inline void nbcon_init(struct console *con) { } +static inline void nbcon_free(struct console *con) { } + #endif /* CONFIG_PRINTK */ +extern struct printk_buffers printk_shared_pbufs; + /** * struct printk_buffers - Buffers to read/format/output printk messages. * @outbuf: After formatting, contains text to output. @@ -103,3 +128,11 @@ struct printk_message { u64 seq; unsigned long dropped; }; + +bool other_cpu_in_panic(void); +bool printk_get_next_message(struct printk_message *pmsg, u64 seq, + bool is_extended, bool may_supress); + +#ifdef CONFIG_PRINTK +void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped); +#endif diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c new file mode 100644 index 000000000000..b96077152f49 --- /dev/null +++ b/kernel/printk/nbcon.c @@ -0,0 +1,1029 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2022 Linutronix GmbH, John Ogness +// Copyright (C) 2022 Intel, Thomas Gleixner + +#include <linux/kernel.h> +#include <linux/console.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include "internal.h" +/* + * Printk console printing implementation for consoles which does not depend + * on the legacy style console_lock mechanism. + * + * The state of the console is maintained in the "nbcon_state" atomic + * variable. + * + * The console is locked when: + * + * - The 'prio' field contains the priority of the context that owns the + * console. Only higher priority contexts are allowed to take over the + * lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked. + * + * - The 'cpu' field denotes on which CPU the console is locked. It is used + * to prevent busy waiting on the same CPU. Also it informs the lock owner + * that it has lost the lock in a more complex scenario when the lock was + * taken over by a higher priority context, released, and taken on another + * CPU with the same priority as the interrupted owner. + * + * The acquire mechanism uses a few more fields: + * + * - The 'req_prio' field is used by the handover approach to make the + * current owner aware that there is a context with a higher priority + * waiting for the friendly handover. + * + * - The 'unsafe' field allows to take over the console in a safe way in the + * middle of emitting a message. The field is set only when accessing some + * shared resources or when the console device is manipulated. It can be + * cleared, for example, after emitting one character when the console + * device is in a consistent state. + * + * - The 'unsafe_takeover' field is set when a hostile takeover took the + * console in an unsafe state. The console will stay in the unsafe state + * until re-initialized. + * + * The acquire mechanism uses three approaches: + * + * 1) Direct acquire when the console is not owned or is owned by a lower + * priority context and is in a safe state. + * + * 2) Friendly handover mechanism uses a request/grant handshake. It is used + * when the current owner has lower priority and the console is in an + * unsafe state. + * + * The requesting context: + * + * a) Sets its priority into the 'req_prio' field. + * + * b) Waits (with a timeout) for the owning context to unlock the + * console. + * + * c) Takes the lock and clears the 'req_prio' field. + * + * The owning context: + * + * a) Observes the 'req_prio' field set on exit from the unsafe + * console state. + * + * b) Gives up console ownership by clearing the 'prio' field. + * + * 3) Unsafe hostile takeover allows to take over the lock even when the + * console is an unsafe state. It is used only in panic() by the final + * attempt to flush consoles in a try and hope mode. + * + * Note that separate record buffers are used in panic(). As a result, + * the messages can be read and formatted without any risk even after + * using the hostile takeover in unsafe state. + * + * The release function simply clears the 'prio' field. + * + * All operations on @console::nbcon_state are atomic cmpxchg based to + * handle concurrency. + * + * The acquire/release functions implement only minimal policies: + * + * - Preference for higher priority contexts. + * - Protection of the panic CPU. + * + * All other policy decisions must be made at the call sites: + * + * - What is marked as an unsafe section. + * - Whether to spin-wait if there is already an owner and the console is + * in an unsafe state. + * - Whether to attempt an unsafe hostile takeover. + * + * The design allows to implement the well known: + * + * acquire() + * output_one_printk_record() + * release() + * + * The output of one printk record might be interrupted with a higher priority + * context. The new owner is supposed to reprint the entire interrupted record + * from scratch. + */ + +/** + * nbcon_state_set - Helper function to set the console state + * @con: Console to update + * @new: The new state to write + * + * Only to be used when the console is not yet or no longer visible in the + * system. Otherwise use nbcon_state_try_cmpxchg(). + */ +static inline void nbcon_state_set(struct console *con, struct nbcon_state *new) +{ + atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom); +} + +/** + * nbcon_state_read - Helper function to read the console state + * @con: Console to read + * @state: The state to store the result + */ +static inline void nbcon_state_read(struct console *con, struct nbcon_state *state) +{ + state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state)); +} + +/** + * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state + * @con: Console to update + * @cur: Old/expected state + * @new: New state + * + * Return: True on success. False on fail and @cur is updated. + */ +static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur, + struct nbcon_state *new) +{ + return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom); +} + +#ifdef CONFIG_64BIT + +#define __seq_to_nbcon_seq(seq) (seq) +#define __nbcon_seq_to_seq(seq) (seq) + +#else /* CONFIG_64BIT */ + +#define __seq_to_nbcon_seq(seq) ((u32)seq) + +static inline u64 __nbcon_seq_to_seq(u32 nbcon_seq) +{ + u64 seq; + u64 rb_next_seq; + + /* + * The provided sequence is only the lower 32 bits of the ringbuffer + * sequence. It needs to be expanded to 64bit. Get the next sequence + * number from the ringbuffer and fold it. + * + * Having a 32bit representation in the console is sufficient. + * If a console ever gets more than 2^31 records behind + * the ringbuffer then this is the least of the problems. + * + * Also the access to the ring buffer is always safe. + */ + rb_next_seq = prb_next_seq(prb); + seq = rb_next_seq - ((u32)rb_next_seq - nbcon_seq); + + return seq; +} + +#endif /* CONFIG_64BIT */ + +/** + * nbcon_seq_read - Read the current console sequence + * @con: Console to read the sequence of + * + * Return: Sequence number of the next record to print on @con. + */ +u64 nbcon_seq_read(struct console *con) +{ + unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq)); + + return __nbcon_seq_to_seq(nbcon_seq); +} + +/** + * nbcon_seq_force - Force console sequence to a specific value + * @con: Console to work on + * @seq: Sequence number value to set + * + * Only to be used during init (before registration) or in extreme situations + * (such as panic with CONSOLE_REPLAY_ALL). + */ +void nbcon_seq_force(struct console *con, u64 seq) +{ + /* + * If the specified record no longer exists, the oldest available record + * is chosen. This is especially important on 32bit systems because only + * the lower 32 bits of the sequence number are stored. The upper 32 bits + * are derived from the sequence numbers available in the ringbuffer. + */ + u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); + + atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __seq_to_nbcon_seq(valid_seq)); + + /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */ + con->seq = 0; +} + +/** + * nbcon_seq_try_update - Try to update the console sequence number + * @ctxt: Pointer to an acquire context that contains + * all information about the acquire mode + * @new_seq: The new sequence number to set + * + * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to + * the 64bit value). This could be a different value than @new_seq if + * nbcon_seq_force() was used or the current context no longer owns the + * console. In the later case, it will stop printing anyway. + */ +static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) +{ + unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq); + struct console *con = ctxt->console; + + if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq, + __seq_to_nbcon_seq(new_seq))) { + ctxt->seq = new_seq; + } else { + ctxt->seq = nbcon_seq_read(con); + } +} + +/** + * nbcon_context_try_acquire_direct - Try to acquire directly + * @ctxt: The context of the caller + * @cur: The current console state + * + * Acquire the console when it is released. Also acquire the console when + * the current owner has a lower priority and the console is in a safe state. + * + * Return: 0 on success. Otherwise, an error code on failure. Also @cur + * is updated to the latest state when failed to modify it. + * + * Errors: + * + * -EPERM: A panic is in progress and this is not the panic CPU. + * Or the current owner or waiter has the same or higher + * priority. No acquire method can be successful in + * this case. + * + * -EBUSY: The current owner has a lower priority but the console + * in an unsafe state. The caller should try using + * the handover acquire method. + */ +static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, + struct nbcon_state *cur) +{ + unsigned int cpu = smp_processor_id(); + struct console *con = ctxt->console; + struct nbcon_state new; + + do { + if (other_cpu_in_panic()) + return -EPERM; + + if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio) + return -EPERM; + + if (cur->unsafe) + return -EBUSY; + + /* + * The console should never be safe for a direct acquire + * if an unsafe hostile takeover has ever happened. + */ + WARN_ON_ONCE(cur->unsafe_takeover); + + new.atom = cur->atom; + new.prio = ctxt->prio; + new.req_prio = NBCON_PRIO_NONE; + new.unsafe = cur->unsafe_takeover; + new.cpu = cpu; + + } while (!nbcon_state_try_cmpxchg(con, cur, &new)); + + return 0; +} + +static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) +{ + /* + * The request context is well defined by the @req_prio because: + * + * - Only a context with a higher priority can take over the request. + * - There are only three priorities. + * - Only one CPU is allowed to request PANIC priority. + * - Lower priorities are ignored during panic() until reboot. + * + * As a result, the following scenario is *not* possible: + * + * 1. Another context with a higher priority directly takes ownership. + * 2. The higher priority context releases the ownership. + * 3. A lower priority context takes the ownership. + * 4. Another context with the same priority as this context + * creates a request and starts waiting. + */ + + return (cur->req_prio == expected_prio); +} + +/** + * nbcon_context_try_acquire_requested - Try to acquire after having + * requested a handover + * @ctxt: The context of the caller + * @cur: The current console state + * + * This is a helper function for nbcon_context_try_acquire_handover(). + * It is called when the console is in an unsafe state. The current + * owner will release the console on exit from the unsafe region. + * + * Return: 0 on success and @cur is updated to the new console state. + * Otherwise an error code on failure. + * + * Errors: + * + * -EPERM: A panic is in progress and this is not the panic CPU + * or this context is no longer the waiter. + * + * -EBUSY: The console is still locked. The caller should + * continue waiting. + * + * Note: The caller must still remove the request when an error has occurred + * except when this context is no longer the waiter. + */ +static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt, + struct nbcon_state *cur) +{ + unsigned int cpu = smp_processor_id(); + struct console *con = ctxt->console; + struct nbcon_state new; + + /* Note that the caller must still remove the request! */ + if (other_cpu_in_panic()) + return -EPERM; + + /* + * Note that the waiter will also change if there was an unsafe + * hostile takeover. + */ + if (!nbcon_waiter_matches(cur, ctxt->prio)) + return -EPERM; + + /* If still locked, caller should continue waiting. */ + if (cur->prio != NBCON_PRIO_NONE) + return -EBUSY; + + /* + * The previous owner should have never released ownership + * in an unsafe region. + */ + WARN_ON_ONCE(cur->unsafe); + + new.atom = cur->atom; + new.prio = ctxt->prio; + new.req_prio = NBCON_PRIO_NONE; + new.unsafe = cur->unsafe_takeover; + new.cpu = cpu; + + if (!nbcon_state_try_cmpxchg(con, cur, &new)) { + /* + * The acquire could fail only when it has been taken + * over by a higher priority context. + */ + WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio)); + return -EPERM; + } + + /* Handover success. This context now owns the console. */ + return 0; +} + +/** + * nbcon_context_try_acquire_handover - Try to acquire via handover + * @ctxt: The context of the caller + * @cur: The current console state + * + * The function must be called only when the context has higher priority + * than the current owner and the console is in an unsafe state. + * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY. + * + * The function sets "req_prio" field to make the current owner aware of + * the request. Then it waits until the current owner releases the console, + * or an even higher context takes over the request, or timeout expires. + * + * The current owner checks the "req_prio" field on exit from the unsafe + * region and releases the console. It does not touch the "req_prio" field + * so that the console stays reserved for the waiter. + * + * Return: 0 on success. Otherwise, an error code on failure. Also @cur + * is updated to the latest state when failed to modify it. + * + * Errors: + * + * -EPERM: A panic is in progress and this is not the panic CPU. + * Or a higher priority context has taken over the + * console or the handover request. + * + * -EBUSY: The current owner is on the same CPU so that the hand + * shake could not work. Or the current owner is not + * willing to wait (zero timeout). Or the console does + * not enter the safe state before timeout passed. The + * caller might still use the unsafe hostile takeover + * when allowed. + * + * -EAGAIN: @cur has changed when creating the handover request. + * The caller should retry with direct acquire. + */ +static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, + struct nbcon_state *cur) +{ + unsigned int cpu = smp_processor_id(); + struct console *con = ctxt->console; + struct nbcon_state new; + int timeout; + int request_err = -EBUSY; + + /* + * Check that the handover is called when the direct acquire failed + * with -EBUSY. + */ + WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); + WARN_ON_ONCE(!cur->unsafe); + + /* Handover is not possible on the same CPU. */ + if (cur->cpu == cpu) + return -EBUSY; + + /* + * Console stays unsafe after an unsafe takeover until re-initialized. + * Waiting is not going to help in this case. + */ + if (cur->unsafe_takeover) + return -EBUSY; + + /* Is the caller willing to wait? */ + if (ctxt->spinwait_max_us == 0) + return -EBUSY; + + /* + * Setup a request for the handover. The caller should try to acquire + * the console directly when the current state has been modified. + */ + new.atom = cur->atom; + new.req_prio = ctxt->prio; + if (!nbcon_state_try_cmpxchg(con, cur, &new)) + return -EAGAIN; + + cur->atom = new.atom; + + /* Wait until there is no owner and then acquire the console. */ + for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) { + /* On successful acquire, this request is cleared. */ + request_err = nbcon_context_try_acquire_requested(ctxt, cur); + if (!request_err) + return 0; + + /* + * If the acquire should be aborted, it must be ensured + * that the request is removed before returning to caller. + */ + if (request_err == -EPERM) + break; + + udelay(1); + + /* Re-read the state because some time has passed. */ + nbcon_state_read(con, cur); + } + + /* Timed out or aborted. Carefully remove handover request. */ + do { + /* + * No need to remove request if there is a new waiter. This + * can only happen if a higher priority context has taken over + * the console or the handover request. + */ + if (!nbcon_waiter_matches(cur, ctxt->prio)) + return -EPERM; + + /* Unset request for handover. */ + new.atom = cur->atom; + new.req_prio = NBCON_PRIO_NONE; + if (nbcon_state_try_cmpxchg(con, cur, &new)) { + /* + * Request successfully unset. Report failure of + * acquiring via handover. + */ + cur->atom = new.atom; + return request_err; + } + + /* + * Unable to remove request. Try to acquire in case + * the owner has released the lock. + */ + } while (nbcon_context_try_acquire_requested(ctxt, cur)); + + /* Lucky timing. The acquire succeeded while removing the request. */ + return 0; +} + +/** + * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover + * @ctxt: The context of the caller + * @cur: The current console state + * + * Acquire the console even in the unsafe state. + * + * It can be permitted by setting the 'allow_unsafe_takeover' field only + * by the final attempt to flush messages in panic(). + * + * Return: 0 on success. -EPERM when not allowed by the context. + */ +static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt, + struct nbcon_state *cur) +{ + unsigned int cpu = smp_processor_id(); + struct console *con = ctxt->console; + struct nbcon_state new; + + if (!ctxt->allow_unsafe_takeover) + return -EPERM; + + /* Ensure caller is allowed to perform unsafe hostile takeovers. */ + if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC)) + return -EPERM; + + /* + * Check that try_acquire_direct() and try_acquire_handover() returned + * -EBUSY in the right situation. + */ + WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); + WARN_ON_ONCE(cur->unsafe != true); + + do { + new.atom = cur->atom; + new.cpu = cpu; + new.prio = ctxt->prio; + new.unsafe |= cur->unsafe_takeover; + new.unsafe_takeover |= cur->unsafe; + + } while (!nbcon_state_try_cmpxchg(con, cur, &new)); + + return 0; +} + +static struct printk_buffers panic_nbcon_pbufs; + +/** + * nbcon_context_try_acquire - Try to acquire nbcon console + * @ctxt: The context of the caller + * + * Return: True if the console was acquired. False otherwise. + * + * If the caller allowed an unsafe hostile takeover, on success the + * caller should check the current console state to see if it is + * in an unsafe state. Otherwise, on success the caller may assume + * the console is not in an unsafe state. + */ +__maybe_unused +static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) +{ + unsigned int cpu = smp_processor_id(); + struct console *con = ctxt->console; + struct nbcon_state cur; + int err; + + nbcon_state_read(con, &cur); +try_again: + err = nbcon_context_try_acquire_direct(ctxt, &cur); + if (err != -EBUSY) + goto out; + + err = nbcon_context_try_acquire_handover(ctxt, &cur); + if (err == -EAGAIN) + goto try_again; + if (err != -EBUSY) + goto out; + + err = nbcon_context_try_acquire_hostile(ctxt, &cur); +out: + if (err) + return false; + + /* Acquire succeeded. */ + + /* Assign the appropriate buffer for this context. */ + if (atomic_read(&panic_cpu) == cpu) + ctxt->pbufs = &panic_nbcon_pbufs; + else + ctxt->pbufs = con->pbufs; + + /* Set the record sequence for this context to print. */ + ctxt->seq = nbcon_seq_read(ctxt->console); + + return true; +} + +static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu, + int expected_prio) +{ + /* + * Since consoles can only be acquired by higher priorities, + * owning contexts are uniquely identified by @prio. However, + * since contexts can unexpectedly lose ownership, it is + * possible that later another owner appears with the same + * priority. For this reason @cpu is also needed. + */ + + if (cur->prio != expected_prio) + return false; + + if (cur->cpu != expected_cpu) + return false; + + return true; +} + +/** + * nbcon_context_release - Release the console + * @ctxt: The nbcon context from nbcon_context_try_acquire() + */ +static void nbcon_context_release(struct nbcon_context *ctxt) +{ + unsigned int cpu = smp_processor_id(); + struct console *con = ctxt->console; + struct nbcon_state cur; + struct nbcon_state new; + + nbcon_state_read(con, &cur); + + do { + if (!nbcon_owner_matches(&cur, cpu, ctxt->prio)) + break; + + new.atom = cur.atom; + new.prio = NBCON_PRIO_NONE; + + /* + * If @unsafe_takeover is set, it is kept set so that + * the state remains permanently unsafe. + */ + new.unsafe |= cur.unsafe_takeover; + + } while (!nbcon_state_try_cmpxchg(con, &cur, &new)); + + ctxt->pbufs = NULL; +} + +/** + * nbcon_context_can_proceed - Check whether ownership can proceed + * @ctxt: The nbcon context from nbcon_context_try_acquire() + * @cur: The current console state + * + * Return: True if this context still owns the console. False if + * ownership was handed over or taken. + * + * Must be invoked when entering the unsafe state to make sure that it still + * owns the lock. Also must be invoked when exiting the unsafe context + * to eventually free the lock for a higher priority context which asked + * for the friendly handover. + * + * It can be called inside an unsafe section when the console is just + * temporary in safe state instead of exiting and entering the unsafe + * state. + * + * Also it can be called in the safe context before doing an expensive + * safe operation. It does not make sense to do the operation when + * a higher priority context took the lock. + * + * When this function returns false then the calling context no longer owns + * the console and is no longer allowed to go forward. In this case it must + * back out immediately and carefully. The buffer content is also no longer + * trusted since it no longer belongs to the calling context. + */ +static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur) +{ + unsigned int cpu = smp_processor_id(); + + /* Make sure this context still owns the console. */ + if (!nbcon_owner_matches(cur, cpu, ctxt->prio)) + return false; + + /* The console owner can proceed if there is no waiter. */ + if (cur->req_prio == NBCON_PRIO_NONE) + return true; + + /* + * A console owner within an unsafe region is always allowed to + * proceed, even if there are waiters. It can perform a handover + * when exiting the unsafe region. Otherwise the waiter will + * need to perform an unsafe hostile takeover. + */ + if (cur->unsafe) + return true; + + /* Waiters always have higher priorities than owners. */ + WARN_ON_ONCE(cur->req_prio <= cur->prio); + + /* + * Having a safe point for take over and eventually a few + * duplicated characters or a full line is way better than a + * hostile takeover. Post processing can take care of the garbage. + * Release and hand over. + */ + nbcon_context_release(ctxt); + + /* + * It is not clear whether the waiter really took over ownership. The + * outermost callsite must make the final decision whether console + * ownership is needed for it to proceed. If yes, it must reacquire + * ownership (possibly hostile) before carefully proceeding. + * + * The calling context no longer owns the console so go back all the + * way instead of trying to implement reacquire heuristics in tons of + * places. + */ + return false; +} + +/** + * nbcon_can_proceed - Check whether ownership can proceed + * @wctxt: The write context that was handed to the write function + * + * Return: True if this context still owns the console. False if + * ownership was handed over or taken. + * + * It is used in nbcon_enter_unsafe() to make sure that it still owns the + * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock + * for a higher priority context which asked for the friendly handover. + * + * It can be called inside an unsafe section when the console is just + * temporary in safe state instead of exiting and entering the unsafe state. + * + * Also it can be called in the safe context before doing an expensive safe + * operation. It does not make sense to do the operation when a higher + * priority context took the lock. + * + * When this function returns false then the calling context no longer owns + * the console and is no longer allowed to go forward. In this case it must + * back out immediately and carefully. The buffer content is also no longer + * trusted since it no longer belongs to the calling context. + */ +bool nbcon_can_proceed(struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + struct console *con = ctxt->console; + struct nbcon_state cur; + + nbcon_state_read(con, &cur); + + return nbcon_context_can_proceed(ctxt, &cur); +} +EXPORT_SYMBOL_GPL(nbcon_can_proceed); + +#define nbcon_context_enter_unsafe(c) __nbcon_context_update_unsafe(c, true) +#define nbcon_context_exit_unsafe(c) __nbcon_context_update_unsafe(c, false) + +/** + * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state + * @ctxt: The nbcon context from nbcon_context_try_acquire() + * @unsafe: The new value for the unsafe bit + * + * Return: True if the unsafe state was updated and this context still + * owns the console. Otherwise false if ownership was handed + * over or taken. + * + * This function allows console owners to modify the unsafe status of the + * console. + * + * When this function returns false then the calling context no longer owns + * the console and is no longer allowed to go forward. In this case it must + * back out immediately and carefully. The buffer content is also no longer + * trusted since it no longer belongs to the calling context. + * + * Internal helper to avoid duplicated code. + */ +static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe) +{ + struct console *con = ctxt->console; + struct nbcon_state cur; + struct nbcon_state new; + + nbcon_state_read(con, &cur); + + do { + /* + * The unsafe bit must not be cleared if an + * unsafe hostile takeover has occurred. + */ + if (!unsafe && cur.unsafe_takeover) + goto out; + + if (!nbcon_context_can_proceed(ctxt, &cur)) + return false; + + new.atom = cur.atom; + new.unsafe = unsafe; + } while (!nbcon_state_try_cmpxchg(con, &cur, &new)); + + cur.atom = new.atom; +out: + return nbcon_context_can_proceed(ctxt, &cur); +} + +/** + * nbcon_enter_unsafe - Enter an unsafe region in the driver + * @wctxt: The write context that was handed to the write function + * + * Return: True if this context still owns the console. False if + * ownership was handed over or taken. + * + * When this function returns false then the calling context no longer owns + * the console and is no longer allowed to go forward. In this case it must + * back out immediately and carefully. The buffer content is also no longer + * trusted since it no longer belongs to the calling context. + */ +bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + return nbcon_context_enter_unsafe(ctxt); +} +EXPORT_SYMBOL_GPL(nbcon_enter_unsafe); + +/** + * nbcon_exit_unsafe - Exit an unsafe region in the driver + * @wctxt: The write context that was handed to the write function + * + * Return: True if this context still owns the console. False if + * ownership was handed over or taken. + * + * When this function returns false then the calling context no longer owns + * the console and is no longer allowed to go forward. In this case it must + * back out immediately and carefully. The buffer content is also no longer + * trusted since it no longer belongs to the calling context. + */ +bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + return nbcon_context_exit_unsafe(ctxt); +} +EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); + +/** + * nbcon_emit_next_record - Emit a record in the acquired context + * @wctxt: The write context that will be handed to the write function + * + * Return: True if this context still owns the console. False if + * ownership was handed over or taken. + * + * When this function returns false then the calling context no longer owns + * the console and is no longer allowed to go forward. In this case it must + * back out immediately and carefully. The buffer content is also no longer + * trusted since it no longer belongs to the calling context. If the caller + * wants to do more it must reacquire the console first. + * + * When true is returned, @wctxt->ctxt.backlog indicates whether there are + * still records pending in the ringbuffer, + */ +__maybe_unused +static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + struct console *con = ctxt->console; + bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; + struct printk_message pmsg = { + .pbufs = ctxt->pbufs, + }; + unsigned long con_dropped; + struct nbcon_state cur; + unsigned long dropped; + bool done; + + /* + * The printk buffers are filled within an unsafe section. This + * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from + * clobbering each other. + */ + + if (!nbcon_context_enter_unsafe(ctxt)) + return false; + + ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true); + if (!ctxt->backlog) + return nbcon_context_exit_unsafe(ctxt); + + /* + * @con->dropped is not protected in case of an unsafe hostile + * takeover. In that situation the update can be racy so + * annotate it accordingly. + */ + con_dropped = data_race(READ_ONCE(con->dropped)); + + dropped = con_dropped + pmsg.dropped; + if (dropped && !is_extended) + console_prepend_dropped(&pmsg, dropped); + + if (!nbcon_context_exit_unsafe(ctxt)) + return false; + + /* For skipped records just update seq/dropped in @con. */ + if (pmsg.outbuf_len == 0) + goto update_con; + + /* Initialize the write context for driver callbacks. */ + wctxt->outbuf = &pmsg.pbufs->outbuf[0]; + wctxt->len = pmsg.outbuf_len; + nbcon_state_read(con, &cur); + wctxt->unsafe_takeover = cur.unsafe_takeover; + + if (con->write_atomic) { + done = con->write_atomic(con, wctxt); + } else { + nbcon_context_release(ctxt); + WARN_ON_ONCE(1); + done = false; + } + + /* If not done, the emit was aborted. */ + if (!done) + return false; + + /* + * Since any dropped message was successfully output, reset the + * dropped count for the console. + */ + dropped = 0; +update_con: + /* + * The dropped count and the sequence number are updated within an + * unsafe section. This limits update races to the panic context and + * allows the panic context to win. + */ + + if (!nbcon_context_enter_unsafe(ctxt)) + return false; + + if (dropped != con_dropped) { + /* Counterpart to the READ_ONCE() above. */ + WRITE_ONCE(con->dropped, dropped); + } + + nbcon_seq_try_update(ctxt, pmsg.seq + 1); + + return nbcon_context_exit_unsafe(ctxt); +} + +/** + * nbcon_alloc - Allocate buffers needed by the nbcon console + * @con: Console to allocate buffers for + * + * Return: True on success. False otherwise and the console cannot + * be used. + * + * This is not part of nbcon_init() because buffer allocation must + * be performed earlier in the console registration process. + */ +bool nbcon_alloc(struct console *con) +{ + if (con->flags & CON_BOOT) { + /* + * Boot console printing is synchronized with legacy console + * printing, so boot consoles can share the same global printk + * buffers. + */ + con->pbufs = &printk_shared_pbufs; + } else { + con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL); + if (!con->pbufs) { + con_printk(KERN_ERR, con, "failed to allocate printing buffer\n"); + return false; + } + } + + return true; +} + +/** + * nbcon_init - Initialize the nbcon console specific data + * @con: Console to initialize + * + * nbcon_alloc() *must* be called and succeed before this function + * is called. + * + * This function expects that the legacy @con->seq has been set. + */ +void nbcon_init(struct console *con) +{ + struct nbcon_state state = { }; + + /* nbcon_alloc() must have been called and successful! */ + BUG_ON(!con->pbufs); + + nbcon_seq_force(con, con->seq); + nbcon_state_set(con, &state); +} + +/** + * nbcon_free - Free and cleanup the nbcon console specific data + * @con: Console to free/cleanup nbcon data + */ +void nbcon_free(struct console *con) +{ + struct nbcon_state state = { }; + + nbcon_state_set(con, &state); + + /* Boot consoles share global printk buffers. */ + if (!(con->flags & CON_BOOT)) + kfree(con->pbufs); + + con->pbufs = NULL; +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 357a4d18f638..f2444b581e16 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -88,7 +88,7 @@ EXPORT_SYMBOL(oops_in_progress); static DEFINE_MUTEX(console_mutex); /* - * console_sem protects updates to console->seq and console_suspended, + * console_sem protects updates to console->seq * and also provides serialization for console printing. */ static DEFINE_SEMAPHORE(console_sem, 1); @@ -102,12 +102,6 @@ DEFINE_STATIC_SRCU(console_srcu); */ int __read_mostly suppress_printk; -/* - * During panic, heavy printk by other CPUs can delay the - * panic and risk deadlock on console resources. - */ -static int __read_mostly suppress_panic_printk; - #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" @@ -361,7 +355,7 @@ static bool panic_in_progress(void) * paths in the console code where we end up in places I want * locked without the console semaphore held). */ -static int console_locked, console_suspended; +static int console_locked; /* * Array of consoles built from command line options (console=) @@ -445,6 +439,12 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; static DEFINE_MUTEX(syslog_lock); #ifdef CONFIG_PRINTK +/* + * During panic, heavy printk by other CPUs can delay the + * panic and risk deadlock on console resources. + */ +static int __read_mostly suppress_panic_printk; + DECLARE_WAIT_QUEUE_HEAD(log_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ @@ -494,7 +494,7 @@ _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, static struct printk_ringbuffer printk_rb_dynamic; -static struct printk_ringbuffer *prb = &printk_rb_static; +struct printk_ringbuffer *prb = &printk_rb_static; /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before @@ -698,9 +698,6 @@ out: return len; } -static bool printk_get_next_message(struct printk_message *pmsg, u64 seq, - bool is_extended, bool may_supress); - /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { atomic64_t seq; @@ -1669,7 +1666,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); - len = 0; prb_for_each_record(seq, prb, seq, &r) { int textlen; @@ -2308,7 +2304,11 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - wake_up_klogd(); + if (in_sched) + defer_console_output(); + else + wake_up_klogd(); + return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -2345,22 +2345,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre static u64 syslog_seq; -static size_t record_print_text(const struct printk_record *r, - bool syslog, bool time) -{ - return 0; -} -static ssize_t info_print_ext_header(char *buf, size_t size, - struct printk_info *info) -{ - return 0; -} -static ssize_t msg_print_ext_body(char *buf, size_t size, - char *text, size_t text_len, - struct dev_printk_info *dev_info) { return 0; } -static void console_lock_spinning_enable(void) { } -static int console_lock_spinning_disable_and_check(int cookie) { return 0; } -static bool suppress_message_printing(int level) { return false; } static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } @@ -2400,13 +2384,21 @@ static void set_user_specified(struct console_cmdline *c, bool user_specified) console_set_on_cmdline = 1; } -static int __add_preferred_console(char *name, int idx, char *options, +static int __add_preferred_console(const char *name, const short idx, char *options, char *brl_options, bool user_specified) { struct console_cmdline *c; int i; /* + * We use a signed short index for struct console for device drivers to + * indicate a not yet assigned index or port. However, a negative index + * value is not valid for preferred console. + */ + if (idx < 0) + return -EINVAL; + + /* * See if this tty is not yet registered, and * if we have a slot free. */ @@ -2509,7 +2501,7 @@ __setup("console=", console_setup); * commonly to provide a default console (ie from PROM variables) when * the user has not supplied one. */ -int add_preferred_console(char *name, int idx, char *options) +int add_preferred_console(const char *name, const short idx, char *options) { return __add_preferred_console(name, idx, options, NULL, false); } @@ -2547,22 +2539,46 @@ MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to hig */ void suspend_console(void) { + struct console *con; + if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); pr_flush(1000, true); - console_lock(); - console_suspended = 1; - up_console_sem(); + + console_list_lock(); + for_each_console(con) + console_srcu_write_flags(con, con->flags | CON_SUSPENDED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All printing + * contexts must be able to see that they are suspended so that it + * is guaranteed that all printing has stopped when this function + * completes. + */ + synchronize_srcu(&console_srcu); } void resume_console(void) { + struct console *con; + if (!console_suspend_enabled) return; - down_console_sem(); - console_suspended = 0; - console_unlock(); + + console_list_lock(); + for_each_console(con) + console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All printing + * contexts must be able to see they are no longer suspended so + * that they are guaranteed to wake up and resume printing. + */ + synchronize_srcu(&console_srcu); + pr_flush(1000, true); } @@ -2585,6 +2601,26 @@ static int console_cpu_notify(unsigned int cpu) return 0; } +/* + * Return true if a panic is in progress on a remote CPU. + * + * On true, the local CPU should immediately release any printing resources + * that may be needed by the panic CPU. + */ +bool other_cpu_in_panic(void) +{ + if (!panic_in_progress()) + return false; + + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return atomic_read(&panic_cpu) != raw_smp_processor_id(); +} + /** * console_lock - block the console subsystem from printing * @@ -2597,9 +2633,11 @@ void console_lock(void) { might_sleep(); + /* On panic, the console_lock must be left to the panic cpu. */ + while (other_cpu_in_panic()) + msleep(1000); + down_console_sem(); - if (console_suspended) - return; console_locked = 1; console_may_schedule = 1; } @@ -2615,12 +2653,11 @@ EXPORT_SYMBOL(console_lock); */ int console_trylock(void) { - if (down_trylock_console_sem()) + /* On panic, the console_lock must be left to the panic cpu. */ + if (other_cpu_in_panic()) return 0; - if (console_suspended) { - up_console_sem(); + if (down_trylock_console_sem()) return 0; - } console_locked = 1; console_may_schedule = 0; return 1; @@ -2634,25 +2671,6 @@ int is_console_locked(void) EXPORT_SYMBOL(is_console_locked); /* - * Return true when this CPU should unlock console_sem without pushing all - * messages to the console. This reduces the chance that the console is - * locked when the panic CPU tries to use it. - */ -static bool abandon_console_lock_in_panic(void) -{ - if (!panic_in_progress()) - return false; - - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return atomic_read(&panic_cpu) != raw_smp_processor_id(); -} - -/* * Check if the given console is currently capable and allowed to print * records. * @@ -2665,6 +2683,9 @@ static inline bool console_is_usable(struct console *con) if (!(flags & CON_ENABLED)) return false; + if ((flags & CON_SUSPENDED)) + return false; + if (!con->write) return false; @@ -2685,6 +2706,8 @@ static void __console_unlock(void) up_console_sem(); } +#ifdef CONFIG_PRINTK + /* * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This * is achieved by shifting the existing message over and inserting the dropped @@ -2699,8 +2722,7 @@ static void __console_unlock(void) * * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ -#ifdef CONFIG_PRINTK -static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) +void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); @@ -2731,9 +2753,6 @@ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d memcpy(outbuf, scratchbuf, len); pmsg->outbuf_len += len; } -#else -#define console_prepend_dropped(pmsg, dropped) -#endif /* CONFIG_PRINTK */ /* * Read and format the specified record (or a later record if the specified @@ -2754,8 +2773,8 @@ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d * of @pmsg are valid. (See the documentation of struct printk_message * for information about the @pmsg fields.) */ -static bool printk_get_next_message(struct printk_message *pmsg, u64 seq, - bool is_extended, bool may_suppress) +bool printk_get_next_message(struct printk_message *pmsg, u64 seq, + bool is_extended, bool may_suppress) { static int panic_console_dropped; @@ -2814,6 +2833,13 @@ out: } /* + * Used as the printk buffers for non-panic, serialized console printing. + * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. + * Its usage requires the console_lock held. + */ +struct printk_buffers printk_shared_pbufs; + +/* * Print one record for the given console. The record printed is whatever * record is the next available record for the given console. * @@ -2830,12 +2856,10 @@ out: */ static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { - static struct printk_buffers pbufs; - bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; - char *outbuf = &pbufs.outbuf[0]; + char *outbuf = &printk_shared_pbufs.outbuf[0]; struct printk_message pmsg = { - .pbufs = &pbufs, + .pbufs = &printk_shared_pbufs, }; unsigned long flags; @@ -2886,6 +2910,16 @@ skip: return true; } +#else + +static bool console_emit_next_record(struct console *con, bool *handover, int cookie) +{ + *handover = false; + return false; +} + +#endif /* CONFIG_PRINTK */ + /* * Print out all remaining records to all consoles. * @@ -2948,7 +2982,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ - if (abandon_console_lock_in_panic()) + if (other_cpu_in_panic()) goto abandon; if (do_cond_resched) @@ -2983,11 +3017,6 @@ void console_unlock(void) bool flushed; u64 next_seq; - if (console_suspended) { - up_console_sem(); - return; - } - /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @@ -3045,10 +3074,28 @@ EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { + bool found_unblank = false; struct console *c; int cookie; /* + * First check if there are any consoles implementing the unblank() + * callback. If not, there is no reason to continue and take the + * console lock, which in particular can be dangerous if + * @oops_in_progress is set. + */ + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { + if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) { + found_unblank = true; + break; + } + } + console_srcu_read_unlock(cookie); + if (!found_unblank) + return; + + /* * Stop console printing because the unblank() callback may * assume the console is not within its write() callback. * @@ -3056,6 +3103,16 @@ void console_unblank(void) * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { + /* Semaphores are not NMI-safe. */ + if (in_nmi()) + return; + + /* + * Attempting to trylock the console lock can deadlock + * if another CPU was stopped while modifying the + * semaphore. "Hope and pray" that this is not the + * current situation. + */ if (down_trylock_console_sem() != 0) return; } else @@ -3085,18 +3142,29 @@ void console_unblank(void) */ void console_flush_on_panic(enum con_flush_mode mode) { + bool handover; + u64 next_seq; + + /* + * Ignore the console lock and flush out the messages. Attempting a + * trylock would not be useful because: + * + * - if it is contended, it must be ignored anyway + * - console_lock() and console_trylock() block and fail + * respectively in panic for non-panic CPUs + * - semaphores are not NMI-safe + */ + /* - * If someone else is holding the console lock, trylock will fail - * and may_schedule may be set. Ignore and proceed to unlock so - * that messages are flushed out. As this can be called from any - * context and we don't want to get preempted while flushing, - * ensure may_schedule is cleared. + * If another context is holding the console lock, + * @console_may_schedule might be set. Clear it so that + * this context does not call cond_resched() while flushing. */ - console_trylock(); console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) { struct console *c; + short flags; int cookie; u64 seq; @@ -3104,16 +3172,22 @@ void console_flush_on_panic(enum con_flush_mode mode) cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - /* - * If the above console_trylock() failed, this is an - * unsynchronized assignment. But in that case, the - * kernel is in "hope and pray" mode anyway. - */ - c->seq = seq; + flags = console_srcu_read_flags(c); + + if (flags & CON_NBCON) { + nbcon_seq_force(c, seq); + } else { + /* + * This is an unsynchronized assignment. On + * panic legacy consoles are only best effort. + */ + c->seq = seq; + } } console_srcu_read_unlock(cookie); } - console_unlock(); + + console_flush_all(false, &next_seq, &handover); } /* @@ -3260,11 +3334,6 @@ static void try_enable_default_console(struct console *newcon) newcon->flags |= CON_CONSDEV; } -#define con_printk(lvl, con, fmt, ...) \ - printk(lvl pr_fmt("%sconsole [%s%d] " fmt), \ - (con->flags & CON_BOOT) ? "boot" : "", \ - con->name, con->index, ##__VA_ARGS__) - static void console_init_seq(struct console *newcon, bool bootcon_registered) { struct console *con; @@ -3378,6 +3447,15 @@ void register_console(struct console *newcon) goto unlock; } + if (newcon->flags & CON_NBCON) { + /* + * Ensure the nbcon console buffers can be allocated + * before modifying any global data. + */ + if (!nbcon_alloc(newcon)) + goto unlock; + } + /* * See if we want to enable this console driver by default. * @@ -3405,8 +3483,11 @@ void register_console(struct console *newcon) err = try_enable_preferred_console(newcon, false); /* printk() messages are not printed to the Braille console. */ - if (err || newcon->flags & CON_BRL) + if (err || newcon->flags & CON_BRL) { + if (newcon->flags & CON_NBCON) + nbcon_free(newcon); goto unlock; + } /* * If we have a bootconsole, and are switching to a real console, @@ -3422,6 +3503,9 @@ void register_console(struct console *newcon) newcon->dropped = 0; console_init_seq(newcon, bootcon_registered); + if (newcon->flags & CON_NBCON) + nbcon_init(newcon); + /* * Put this console in the list - keep the * preferred driver at the head of the list. @@ -3513,6 +3597,9 @@ static int unregister_console_locked(struct console *console) */ synchronize_srcu(&console_srcu); + if (console->flags & CON_NBCON) + nbcon_free(console); + console_sysfs_notify(); if (console->exit) @@ -3662,10 +3749,12 @@ late_initcall(printk_late_init); /* If @con is specified, only wait for that console. Otherwise wait for all. */ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { - int remaining = timeout_ms; + unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms); + unsigned long remaining_jiffies = timeout_jiffies; struct console *c; u64 last_diff = 0; u64 printk_seq; + short flags; int cookie; u64 diff; u64 seq; @@ -3674,13 +3763,21 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre seq = prb_next_seq(prb); + /* Flush the consoles so that records up to @seq are printed. */ + console_lock(); + console_unlock(); + for (;;) { + unsigned long begin_jiffies; + unsigned long slept_jiffies; + diff = 0; /* * Hold the console_lock to guarantee safe access to - * console->seq and to prevent changes to @console_suspended - * until all consoles have been processed. + * console->seq. Releasing console_lock flushes more + * records in case @seq is still not printed on all + * usable consoles. */ console_lock(); @@ -3688,39 +3785,43 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre for_each_console_srcu(c) { if (con && con != c) continue; + + flags = console_srcu_read_flags(c); + + /* + * If consoles are not usable, it cannot be expected + * that they make forward progress, so only increment + * @diff for usable consoles. + */ if (!console_is_usable(c)) continue; - printk_seq = c->seq; + + if (flags & CON_NBCON) { + printk_seq = nbcon_seq_read(c); + } else { + printk_seq = c->seq; + } + if (printk_seq < seq) diff += seq - printk_seq; } console_srcu_read_unlock(cookie); - /* - * If consoles are suspended, it cannot be expected that they - * make forward progress, so timeout immediately. @diff is - * still used to return a valid flush status. - */ - if (console_suspended) - remaining = 0; - else if (diff != last_diff && reset_on_progress) - remaining = timeout_ms; + if (diff != last_diff && reset_on_progress) + remaining_jiffies = timeout_jiffies; console_unlock(); - if (diff == 0 || remaining == 0) + /* Note: @diff is 0 if there are no usable consoles. */ + if (diff == 0 || remaining_jiffies == 0) break; - if (remaining < 0) { - /* no timeout limit */ - msleep(100); - } else if (remaining < 100) { - msleep(remaining); - remaining = 0; - } else { - msleep(100); - remaining -= 100; - } + /* msleep(1) might sleep much longer. Check time by jiffies. */ + begin_jiffies = jiffies; + msleep(1); + slept_jiffies = jiffies - begin_jiffies; + + remaining_jiffies -= min(slept_jiffies, remaining_jiffies); last_diff = diff; } @@ -3741,7 +3842,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * printer has been seen to make some forward progress. * * Context: Process context. May sleep while acquiring console lock. - * Return: true if all enabled printers are caught up. + * Return: true if all usable printers are caught up. */ static bool pr_flush(int timeout_ms, bool reset_on_progress) { @@ -3798,11 +3899,33 @@ static void __wake_up_klogd(int val) preempt_enable(); } +/** + * wake_up_klogd - Wake kernel logging daemon + * + * Use this function when new records have been added to the ringbuffer + * and the console printing of those records has already occurred or is + * known to be handled by some other context. This function will only + * wake the logging daemon. + * + * Context: Any context. + */ void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } +/** + * defer_console_output - Wake kernel logging daemon and trigger + * console printing in a deferred context + * + * Use this function when new records have been added to the ringbuffer, + * this context is responsible for console printing those records, but + * the current context is not allowed to perform the console printing. + * Trigger an irq_work context to perform the console printing. This + * function also wakes the logging daemon. + * + * Context: Any context. + */ void defer_console_output(void) { /* @@ -3819,12 +3942,7 @@ void printk_trigger_flush(void) int vprintk_deferred(const char *fmt, va_list args) { - int r; - - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); - defer_console_output(); - - return r; + return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); } int _printk_deferred(const char *fmt, ...) @@ -4107,7 +4225,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, prb_rec_init_rd(&r, &info, buf, size); - len = 0; prb_for_each_record(seq, prb, seq, &r) { if (r.info->seq >= iter->next_seq) break; diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 2dc4d5a1f1ff..fde338606ce8 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -1735,7 +1735,7 @@ static bool copy_data(struct prb_data_ring *data_ring, if (!buf || !buf_size) return true; - data_size = min_t(u16, buf_size, len); + data_size = min_t(unsigned int, buf_size, len); memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ return true; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index ef0f9a2044da..6d10927a07d8 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -38,13 +38,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. */ - if (this_cpu_read(printk_context) || in_nmi()) { - int len; - - len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); - defer_console_output(); - return len; - } + if (this_cpu_read(printk_context) || in_nmi()) + return vprintk_deferred(fmt, args); /* No obstacles. */ return vprintk_default(fmt, args); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 443057bee87c..2fabd497d659 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -59,7 +59,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, return 0; } - ret = __access_remote_vm(mm, addr, buf, len, gup_flags); + ret = access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; @@ -145,20 +145,9 @@ void __ptrace_unlink(struct task_struct *child) */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || - child->signal->group_stop_count)) { + child->signal->group_stop_count)) child->jobctl |= JOBCTL_STOP_PENDING; - /* - * This is only possible if this thread was cloned by the - * traced task running in the stopped group, set the signal - * for the future reports. - * FIXME: we should change ptrace_init_task() to handle this - * case. - */ - if (!(child->jobctl & JOBCTL_STOP_SIGMASK)) - child->jobctl |= SIGSTOP; - } - /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick * @child in the butt. Note that @resume should be used iff @child @@ -386,6 +375,34 @@ static int check_ptrace_options(unsigned long data) return 0; } +static inline void ptrace_set_stopped(struct task_struct *task) +{ + guard(spinlock)(&task->sighand->siglock); + + /* + * If the task is already STOPPED, set JOBCTL_TRAP_STOP and + * TRAPPING, and kick it so that it transits to TRACED. TRAPPING + * will be cleared if the child completes the transition or any + * event which clears the group stop states happens. We'll wait + * for the transition to complete before returning from this + * function. + * + * This hides STOPPED -> RUNNING -> TRACED transition from the + * attaching thread but a different thread in the same group can + * still observe the transient RUNNING state. IOW, if another + * thread's WNOHANG wait(2) on the stopped tracee races against + * ATTACH, the wait(2) may fail due to the transient RUNNING. + * + * The following task_is_stopped() test is safe as both transitions + * in and out of STOPPED are protected by siglock. + */ + if (task_is_stopped(task) && + task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) { + task->jobctl &= ~JOBCTL_STOPPED; + signal_wake_up_state(task, __TASK_STOPPED); + } +} + static int ptrace_attach(struct task_struct *task, long request, unsigned long addr, unsigned long flags) @@ -393,17 +410,17 @@ static int ptrace_attach(struct task_struct *task, long request, bool seize = (request == PTRACE_SEIZE); int retval; - retval = -EIO; if (seize) { if (addr != 0) - goto out; + return -EIO; /* * This duplicates the check in check_ptrace_options() because * ptrace_attach() and ptrace_setoptions() have historically * used different error codes for unknown ptrace options. */ if (flags & ~(unsigned long)PTRACE_O_MASK) - goto out; + return -EIO; + retval = check_ptrace_options(flags); if (retval) return retval; @@ -414,88 +431,54 @@ static int ptrace_attach(struct task_struct *task, long request, audit_ptrace(task); - retval = -EPERM; if (unlikely(task->flags & PF_KTHREAD)) - goto out; + return -EPERM; if (same_thread_group(task, current)) - goto out; + return -EPERM; /* * Protect exec's credential calculations against our interference; * SUID, SGID and LSM creds get determined differently * under ptrace. */ - retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) - goto out; + scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR, + &task->signal->cred_guard_mutex) { - task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); - task_unlock(task); - if (retval) - goto unlock_creds; + scoped_guard (task_lock, task) { + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); + if (retval) + return retval; + } - write_lock_irq(&tasklist_lock); - retval = -EPERM; - if (unlikely(task->exit_state)) - goto unlock_tasklist; - if (task->ptrace) - goto unlock_tasklist; + scoped_guard (write_lock_irq, &tasklist_lock) { + if (unlikely(task->exit_state)) + return -EPERM; + if (task->ptrace) + return -EPERM; - task->ptrace = flags; + task->ptrace = flags; - ptrace_link(task, current); + ptrace_link(task, current); - /* SEIZE doesn't trap tracee on attach */ - if (!seize) - send_sig_info(SIGSTOP, SEND_SIG_PRIV, task); + /* SEIZE doesn't trap tracee on attach */ + if (!seize) + send_sig_info(SIGSTOP, SEND_SIG_PRIV, task); - spin_lock(&task->sighand->siglock); + ptrace_set_stopped(task); + } + } /* - * If the task is already STOPPED, set JOBCTL_TRAP_STOP and - * TRAPPING, and kick it so that it transits to TRACED. TRAPPING - * will be cleared if the child completes the transition or any - * event which clears the group stop states happens. We'll wait - * for the transition to complete before returning from this - * function. - * - * This hides STOPPED -> RUNNING -> TRACED transition from the - * attaching thread but a different thread in the same group can - * still observe the transient RUNNING state. IOW, if another - * thread's WNOHANG wait(2) on the stopped tracee races against - * ATTACH, the wait(2) may fail due to the transient RUNNING. - * - * The following task_is_stopped() test is safe as both transitions - * in and out of STOPPED are protected by siglock. + * We do not bother to change retval or clear JOBCTL_TRAPPING + * if wait_on_bit() was interrupted by SIGKILL. The tracer will + * not return to user-mode, it will exit and clear this bit in + * __ptrace_unlink() if it wasn't already cleared by the tracee; + * and until then nobody can ptrace this task. */ - if (task_is_stopped(task) && - task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) { - task->jobctl &= ~JOBCTL_STOPPED; - signal_wake_up_state(task, __TASK_STOPPED); - } - - spin_unlock(&task->sighand->siglock); - - retval = 0; -unlock_tasklist: - write_unlock_irq(&tasklist_lock); -unlock_creds: - mutex_unlock(&task->signal->cred_guard_mutex); -out: - if (!retval) { - /* - * We do not bother to change retval or clear JOBCTL_TRAPPING - * if wait_on_bit() was interrupted by SIGKILL. The tracer will - * not return to user-mode, it will exit and clear this bit in - * __ptrace_unlink() if it wasn't already cleared by the tracee; - * and until then nobody can ptrace this task. - */ - wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); - proc_ptrace_connector(task, PTRACE_ATTACH); - } + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); + proc_ptrace_connector(task, PTRACE_ATTACH); - return retval; + return 0; } /** diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 2984de629f74..9b0b52e1836f 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -105,6 +105,31 @@ config RCU_CPU_STALL_CPUTIME The boot option rcupdate.rcu_cpu_stall_cputime has the same function as this one, but will override this if it exists. +config RCU_CPU_STALL_NOTIFIER + bool "Provide RCU CPU-stall notifiers" + depends on RCU_STALL_COMMON + depends on DEBUG_KERNEL + depends on RCU_EXPERT + default n + help + WARNING: You almost certainly do not want this!!! + + Enable RCU CPU-stall notifiers, which are invoked just before + printing the RCU CPU stall warning. As such, bugs in notifier + callbacks can prevent stall warnings from being printed. + And the whole reason that a stall warning is being printed is + that something is hung up somewhere. Therefore, the notifier + callbacks must be written extremely carefully, preferably + containing only lockless code. After all, it is quite possible + that the whole reason that the RCU CPU stall is happening in + the first place is that someone forgot to release whatever lock + that you are thinking of acquiring. In which case, having your + notifier callback acquire that lock will hang, preventing the + RCU CPU stall warning from appearing. + + Say Y here if you want RCU CPU stall notifiers (you don't want them) + Say N if you are unsure. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 98e13be411af..f94f65877f2b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -10,6 +10,7 @@ #ifndef __LINUX_RCU_H #define __LINUX_RCU_H +#include <linux/slab.h> #include <trace/events/rcu.h> /* @@ -248,6 +249,12 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void debug_rcu_head_callback(struct rcu_head *rhp) +{ + if (unlikely(!rhp->func)) + kmem_dump_obj(rhp); +} + extern int rcu_cpu_stall_suppress_at_boot; static inline bool rcu_stall_is_suppressed_at_boot(void) @@ -255,6 +262,8 @@ static inline bool rcu_stall_is_suppressed_at_boot(void) return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended(); } +extern int rcu_cpu_stall_notifiers; + #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; @@ -493,6 +502,7 @@ static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } static inline void rcu_async_hurry(void) { } static inline void rcu_async_relax(void) { } +static inline bool rcu_cpu_online(int cpu) { return true; } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ @@ -502,6 +512,7 @@ void rcu_unexpedite_gp(void); void rcu_async_hurry(void); void rcu_async_relax(void); void rcupdate_announce_bootup_oddness(void); +bool rcu_cpu_online(int cpu); #ifdef CONFIG_TASKS_RCU_GENERIC void show_rcu_tasks_gp_kthreads(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ @@ -568,10 +579,6 @@ void do_trace_rcu_torture_read(const char *rcutorturename, static inline void rcu_gp_set_torture_wait(int duration) { } #endif -#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) -long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); -#endif - #ifdef CONFIG_TINY_SRCU static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, @@ -654,4 +661,10 @@ static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } bool rcu_cpu_beenfullyonline(int cpu); #endif +#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) +int rcu_stall_notifier_call_chain(unsigned long val, void *v); +#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) +static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; } +#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index f71fac422c8f..1693ea22ef1b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -368,7 +368,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, smp_mb(); /* Ensure counts are updated before callback is entrained. */ rhp->next = NULL; for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1]) + if (!rcu_segcblist_segempty(rsclp, i)) break; rcu_segcblist_inc_seglen(rsclp, i); WRITE_ONCE(*rsclp->tails[i], rhp); @@ -551,7 +551,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * as their ->gp_seq[] grace-period completion sequence number. */ for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1] && + if (!rcu_segcblist_segempty(rsclp, i) && ULONG_CMP_LT(rsclp->gp_seq[i], seq)) break; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ade42d6a9d9b..7567ca8e743c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -21,6 +21,7 @@ #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/rcupdate_wait.h> +#include <linux/rcu_notifier.h> #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <uapi/linux/sched/types.h> @@ -810,7 +811,7 @@ static void synchronize_rcu_trivial(void) int cpu; for_each_online_cpu(cpu) { - rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + torture_sched_setaffinity(current->pid, cpumask_of(cpu)); WARN_ON_ONCE(raw_smp_processor_id() != cpu); } } @@ -1149,7 +1150,7 @@ static int rcu_torture_boost(void *arg) mutex_unlock(&boost_mutex); break; } - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); } /* Go do the stutter. */ @@ -1160,7 +1161,7 @@ checkwait: if (stutter_wait("rcu_torture_boost")) /* Clean up and exit. */ while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); } torture_kthread_stopping("rcu_torture_boost"); return 0; @@ -1183,7 +1184,7 @@ rcu_torture_fqs(void *arg) fqs_resume_time = jiffies + fqs_stutter * HZ; while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(HZ / 20); } fqs_burst_remaining = fqs_duration; while (fqs_burst_remaining > 0 && @@ -2126,7 +2127,7 @@ static int rcu_nocb_toggle(void *arg) VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started"); while (!rcu_inkernel_boot_has_ended()) schedule_timeout_interruptible(HZ / 10); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); if (toggle_interval > ULONG_MAX) @@ -2428,6 +2429,16 @@ static int rcutorture_booster_init(unsigned int cpu) return 0; } +static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr) +{ + pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr); + return NOTIFY_OK; +} + +static struct notifier_block rcu_torture_stall_block = { + .notifier_call = rcu_torture_stall_nf, +}; + /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. @@ -2435,9 +2446,16 @@ static int rcutorture_booster_init(unsigned int cpu) static int rcu_torture_stall(void *args) { int idx; + int ret; unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); + if (rcu_cpu_stall_notifiers) { + ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", + __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); + } if (stall_cpu_holdoff > 0) { VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); @@ -2481,6 +2499,11 @@ static int rcu_torture_stall(void *args) cur_ops->readunlock(idx); } pr_alert("%s end.\n", __func__); + if (rcu_cpu_stall_notifiers && !ret) { + ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret); + } torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -2899,7 +2922,7 @@ static int rcu_torture_fwd_prog(void *args) WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); } else { while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(HZ / 20); oldseq = READ_ONCE(rcu_fwd_seq); } pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); @@ -3200,7 +3223,7 @@ static int rcu_torture_read_exit_child(void *trsp_in) set_user_nice(current, MAX_NICE); // Minimize time between reading and exiting. while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); (void)rcu_torture_one_read(trsp, -1); return 0; } @@ -3248,7 +3271,7 @@ static int rcu_torture_read_exit(void *unused) smp_mb(); // Store before wakeup. wake_up(&read_exit_wq); while (!torture_must_stop()) - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); torture_kthread_stopping("rcu_torture_read_exit"); return 0; } @@ -3851,7 +3874,9 @@ rcu_torture_init(void) } if (fqs_duration < 0) fqs_duration = 0; - if (fqs_duration) { + if (fqs_holdoff < 0) + fqs_holdoff = 0; + if (fqs_duration && fqs_holdoff) { /* Create the fqs thread */ firsterr = torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 91a0fd0d4d9a..2c2648a3ad30 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -655,12 +655,12 @@ retry: goto retry; } un_delay(udl, ndl); + b = READ_ONCE(rtsp->a); // Remember, seqlock read-side release can fail. if (!rts_release(rtsp, start)) { rcu_read_unlock(); goto retry; } - b = READ_ONCE(rtsp->a); WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b); b = rtsp->b; rcu_read_unlock(); @@ -1025,8 +1025,8 @@ static void ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, - verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); + "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); } static void diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 336af24e0fe3..c38e5933a5d6 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp) while (lh) { rhp = lh; lh = lh->next; + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 20d7a238d675..0351a4e83529 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -223,7 +223,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) snp->grplo = cpu; snp->grphi = cpu; } - sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); + sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo); } smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); return true; @@ -255,29 +255,31 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); - if (!ssp->sda) { - if (!is_static) - kfree(ssp->srcu_sup); - return -ENOMEM; - } + if (!ssp->sda) + goto err_free_sup; init_srcu_struct_data(ssp); ssp->srcu_sup->srcu_gp_seq_needed_exp = 0; ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { - if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { - if (!ssp->srcu_sup->sda_is_static) { - free_percpu(ssp->sda); - ssp->sda = NULL; - kfree(ssp->srcu_sup); - return -ENOMEM; - } - } else { - WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); - } + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) + goto err_free_sda; + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } ssp->srcu_sup->srcu_ssp = ssp; smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ return 0; + +err_free_sda: + if (!is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } +err_free_sup: + if (!is_static) { + kfree(ssp->srcu_sup); + ssp->srcu_sup = NULL; + } + return -ENOMEM; } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -770,21 +772,10 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); */ static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp; int state; - if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) - sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); - else - sdp = this_cpu_ptr(ssp->sda); lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)); - spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ - rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); - spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies); WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ @@ -833,7 +824,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp int cpu; for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - if (!(mask & (1 << (cpu - snp->grplo)))) + if (!(mask & (1UL << (cpu - snp->grplo)))) continue; srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); } @@ -1242,10 +1233,39 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); - rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + /* + * The snapshot for acceleration must be taken _before_ the read of the + * current gp sequence used for advancing, otherwise advancing may fail + * and acceleration may then fail too. + * + * This could happen if: + * + * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the + * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8). + * + * 2) The grace period for RCU_WAIT_TAIL is seen as started but not + * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1. + * + * 3) This value is passed to rcu_segcblist_advance() which can't move + * any segment forward and fails. + * + * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration. + * But then the call to rcu_seq_snap() observes the grace period for the + * RCU_WAIT_TAIL segment as completed and the subsequent one for the + * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1) + * so it returns a snapshot of the next grace period, which is X + 12. + * + * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the + * freshly enqueued callback in RCU_NEXT_TAIL can't move to + * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace + * period (gp_num = X + 8). So acceleration fails. + */ s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + if (rhp) { + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s)); + } if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; needgp = true; @@ -1692,8 +1712,14 @@ static void srcu_invoke_callbacks(struct work_struct *work) ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); + WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + /* + * Although this function is theoretically re-entrant, concurrent + * callbacks invocation is disallowed to avoid executing an SRCU barrier + * too early. + */ if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { spin_unlock_irq_rcu_node(sdp); @@ -1708,6 +1734,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); @@ -1720,11 +1747,10 @@ static void srcu_invoke_callbacks(struct work_struct *work) */ spin_lock_irq_rcu_node(sdp); rcu_segcblist_add_len(&sdp->srcu_cblist, -len); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp); + /* An SRCU barrier or callbacks from previous nesting work pending */ if (more) srcu_schedule_cbs_sdp(sdp, 0); } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8d65f7d576a3..732ad5b39946 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -432,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) { int cpu; + int dequeue_limit; unsigned long flags; bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq); long n; @@ -439,7 +440,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) long ncbsnz = 0; int needgpcb = 0; - for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) { + dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim); + for (cpu = 0; cpu < dequeue_limit; cpu++) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ @@ -538,6 +540,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); len = rcl.len; for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); @@ -892,10 +895,36 @@ static void rcu_tasks_pregp_step(struct list_head *hop) synchronize_rcu(); } +/* Check for quiescent states since the pregp's synchronize_rcu() */ +static bool rcu_tasks_is_holdout(struct task_struct *t) +{ + int cpu; + + /* Has the task been seen voluntarily sleeping? */ + if (!READ_ONCE(t->on_rq)) + return false; + + /* + * Idle tasks (or idle injection) within the idle loop are RCU-tasks + * quiescent states. But CPU boot code performed by the idle task + * isn't a quiescent state. + */ + if (is_idle_task(t)) + return false; + + cpu = task_cpu(t); + + /* Idle tasks on offline CPUs are RCU-tasks quiescent states. */ + if (t == idle_task(cpu) && !rcu_cpu_online(cpu)) + return false; + + return true; +} + /* Per-task initial processing. */ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) { - if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + if (t != current && rcu_tasks_is_holdout(t)) { get_task_struct(t); t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); WRITE_ONCE(t->rcu_tasks_holdout, true); @@ -944,9 +973,9 @@ static void check_holdout_task(struct task_struct *t, if (!READ_ONCE(t->rcu_tasks_holdout) || t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || + !rcu_tasks_is_holdout(t) || (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + !is_idle_task(t) && READ_ONCE(t->rcu_tasks_idle_cpu) >= 0)) { WRITE_ONCE(t->rcu_tasks_holdout, false); list_del_init(&t->rcu_tasks_holdout_list); put_task_struct(t); @@ -964,7 +993,7 @@ static void check_holdout_task(struct task_struct *t, t, ".I"[is_idle_task(t)], "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); + data_race(t->rcu_tasks_idle_cpu), cpu); sched_show_task(t); } @@ -1084,7 +1113,7 @@ void rcu_barrier_tasks(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); -int rcu_tasks_lazy_ms = -1; +static int rcu_tasks_lazy_ms = -1; module_param(rcu_tasks_lazy_ms, int, 0444); static int __init rcu_spawn_tasks_kthread(void) @@ -1522,7 +1551,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) } else { // The task is not running, so C-language access is safe. nesting = t->trc_reader_nesting; - WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t)); + WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t)))); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl) n_heavy_reader_ofl_updates++; } @@ -1979,20 +2008,22 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) static void rcu_tasks_initiate_self_tests(void) { - pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU + pr_info("Running RCU Tasks wait API self tests\n"); tests[0].runstart = jiffies; synchronize_rcu_tasks(); call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_RUDE_RCU + pr_info("Running RCU Tasks Rude wait API self tests\n"); tests[1].runstart = jiffies; synchronize_rcu_tasks_rude(); call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU + pr_info("Running RCU Tasks Trace wait API self tests\n"); tests[2].runstart = jiffies; synchronize_rcu_tasks_trace(); call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 42f7589e51e0..fec804b79080 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) trace_rcu_invoke_callback("", head); f = head->func; + debug_rcu_head_callback(head); WRITE_ONCE(head->func, (rcu_callback_t)0L); f(head); rcu_lock_release(&rcu_callback_map); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb1caefa8bd0..b2bccfd37c38 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -31,6 +31,7 @@ #include <linux/bitops.h> #include <linux/export.h> #include <linux/completion.h> +#include <linux/kmemleak.h> #include <linux/moduleparam.h> #include <linux/panic.h> #include <linux/panic_notifier.h> @@ -754,14 +755,19 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) } /* - * Return true if the specified CPU has passed through a quiescent - * state by virtue of being in or having passed through an dynticks - * idle state since the last call to dyntick_save_progress_counter() - * for this same CPU, or by virtue of having been offline. + * Returns positive if the specified CPU has passed through a quiescent state + * by virtue of being in or having passed through an dynticks idle state since + * the last call to dyntick_save_progress_counter() for this same CPU, or by + * virtue of having been offline. + * + * Returns negative if the specified CPU needs a force resched. + * + * Returns zero otherwise. */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { unsigned long jtsq; + int ret = 0; struct rcu_node *rnp = rdp->mynode; /* @@ -847,8 +853,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) || rcu_state.cbovld)) { WRITE_ONCE(rdp->rcu_urgent_qs, true); - resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); + ret = -1; } /* @@ -861,8 +867,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (time_after(jiffies, rcu_state.jiffies_resched)) { if (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq)) { - resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); + ret = -1; } if (IS_ENABLED(CONFIG_IRQ_WORK) && !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && @@ -891,7 +897,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } } - return 0; + return ret; } /* Trace-event wrapper function for trace_rcu_future_grace_period. */ @@ -1007,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) return needmore; } +static void swake_up_one_online_ipi(void *arg) +{ + struct swait_queue_head *wqh = arg; + + swake_up_one(wqh); +} + +static void swake_up_one_online(struct swait_queue_head *wqh) +{ + int cpu = get_cpu(); + + /* + * If called from rcutree_report_cpu_starting(), wake up + * is dangerous that late in the CPU-down hotplug process. The + * scheduler might queue an ignored hrtimer. Defer the wake up + * to an online CPU instead. + */ + if (unlikely(cpu_is_offline(cpu))) { + int target; + + target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU), + cpu_online_mask); + + smp_call_function_single(target, swake_up_one_online_ipi, + wqh, 0); + put_cpu(); + } else { + put_cpu(); + swake_up_one(wqh); + } +} + /* * Awaken the grace-period kthread. Don't do a self-awaken (unless in an * interrupt or softirq handler, in which case we just might immediately @@ -1031,7 +1069,7 @@ static void rcu_gp_kthread_wake(void) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); - swake_up_one(&rcu_state.gp_wq); + swake_up_one_online(&rcu_state.gp_wq); } /* @@ -1260,7 +1298,7 @@ EXPORT_SYMBOL_GPL(rcu_gp_slow_register); /* Unregister a counter, with NULL for not caring which. */ void rcu_gp_slow_unregister(atomic_t *rgssp) { - WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress); + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL); WRITE_ONCE(rcu_gp_slow_suppress, NULL); } @@ -1556,10 +1594,22 @@ static bool rcu_gp_fqs_check_wake(int *gfp) */ static void rcu_gp_fqs(bool first_time) { + int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall); struct rcu_node *rnp = rcu_get_root(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1); + + WARN_ON_ONCE(nr_fqs > 3); + /* Only countdown nr_fqs for stall purposes if jiffies moves. */ + if (nr_fqs) { + if (nr_fqs == 1) { + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); + } + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs); + } + if (first_time) { /* Collect dyntick-idle snapshots. */ force_qs_rnp(dyntick_save_progress_counter); @@ -2135,6 +2185,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_invoke_callback(rcu_state.name, rhp); f = rhp->func; + debug_rcu_head_callback(rhp); WRITE_ONCE(rhp->func, (rcu_callback_t)0L); f(rhp); @@ -2257,15 +2308,15 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) { int cpu; unsigned long flags; - unsigned long mask; - struct rcu_data *rdp; struct rcu_node *rnp; rcu_state.cbovld = rcu_state.cbovldnext; rcu_state.cbovldnext = false; rcu_for_each_leaf_node(rnp) { + unsigned long mask = 0; + unsigned long rsmask = 0; + cond_resched_tasks_rcu_qs(); - mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { @@ -2283,11 +2334,17 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) continue; } for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) { + struct rcu_data *rdp; + int ret; + rdp = per_cpu_ptr(&rcu_data, cpu); - if (f(rdp)) { + ret = f(rdp); + if (ret > 0) { mask |= rdp->grpmask; rcu_disable_urgency_upon_qs(rdp); } + if (ret < 0) + rsmask |= rdp->grpmask; } if (mask != 0) { /* Idle/offline CPUs, report (releases rnp->lock). */ @@ -2296,6 +2353,9 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + + for_each_leaf_node_cpu_mask(rnp, cpu, rsmask) + resched_cpu(cpu); } } @@ -2310,6 +2370,8 @@ void rcu_force_quiescent_state(void) struct rcu_node *rnp; struct rcu_node *rnp_old = NULL; + if (!rcu_gp_in_progress()) + return; /* Funnel through hierarchy to reduce memory contention. */ rnp = raw_cpu_read(rcu_data.mynode); for (; rnp != NULL; rnp = rnp->parent) { @@ -2713,7 +2775,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) */ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { - return __call_rcu_common(head, func, false); + __call_rcu_common(head, func, false); } EXPORT_SYMBOL_GPL(call_rcu_hurry); #endif @@ -2764,7 +2826,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); + __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); } EXPORT_SYMBOL_GPL(call_rcu); @@ -3388,6 +3450,14 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) success = true; } + /* + * The kvfree_rcu() caller considers the pointer freed at this point + * and likely removes any references to it. Since the actual slab + * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore + * this object (no scanning or false positives reporting). + */ + kmemleak_ignore(ptr); + // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) schedule_delayed_monitor_work(krcp); @@ -3449,13 +3519,6 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return freed == 0 ? SHRINK_STOP : freed; } -static struct shrinker kfree_rcu_shrinker = { - .count_objects = kfree_rcu_shrink_count, - .scan_objects = kfree_rcu_shrink_scan, - .batch = 0, - .seeks = DEFAULT_SEEKS, -}; - void __init kfree_rcu_scheduler_running(void) { int cpu; @@ -4083,6 +4146,82 @@ retry: } EXPORT_SYMBOL_GPL(rcu_barrier); +static unsigned long rcu_barrier_last_throttle; + +/** + * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second + * + * This can be thought of as guard rails around rcu_barrier() that + * permits unrestricted userspace use, at least assuming the hardware's + * try_cmpxchg() is robust. There will be at most one call per second to + * rcu_barrier() system-wide from use of this function, which means that + * callers might needlessly wait a second or three. + * + * This is intended for use by test suites to avoid OOM by flushing RCU + * callbacks from the previous test before starting the next. See the + * rcutree.do_rcu_barrier module parameter for more information. + * + * Why not simply make rcu_barrier() more scalable? That might be + * the eventual endpoint, but let's keep it simple for the time being. + * Note that the module parameter infrastructure serializes calls to a + * given .set() function, but should concurrent .set() invocation ever be + * possible, we are ready! + */ +static void rcu_barrier_throttled(void) +{ + unsigned long j = jiffies; + unsigned long old = READ_ONCE(rcu_barrier_last_throttle); + unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); + + while (time_in_range(j, old, old + HZ / 16) || + !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) { + schedule_timeout_idle(HZ / 16); + if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { + smp_mb(); /* caller's subsequent code after above check. */ + return; + } + j = jiffies; + old = READ_ONCE(rcu_barrier_last_throttle); + } + rcu_barrier(); +} + +/* + * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier + * request arrives. We insist on a true value to allow for possible + * future expansion. + */ +static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp) +{ + bool b; + int ret; + + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) + return -EAGAIN; + ret = kstrtobool(val, &b); + if (!ret && b) { + atomic_inc((atomic_t *)kp->arg); + rcu_barrier_throttled(); + atomic_dec((atomic_t *)kp->arg); + } + return ret; +} + +/* + * Output the number of outstanding rcutree.do_rcu_barrier requests. + */ +static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg)); +} + +static const struct kernel_param_ops do_rcu_barrier_ops = { + .set = param_set_do_rcu_barrier, + .get = param_get_do_rcu_barrier, +}; +static atomic_t do_rcu_barrier; +module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644); + /* * Compute the mask of online CPUs for the specified rcu_node structure. * This will not be stable unless the rcu_node structure's ->lock is @@ -4104,6 +4243,13 @@ static bool rcu_rdp_cpu_online(struct rcu_data *rdp) return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); } +bool rcu_cpu_online(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return rcu_rdp_cpu_online(rdp); +} + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* @@ -4130,7 +4276,7 @@ bool rcu_lockdep_current_cpu_online(void) rdp = this_cpu_ptr(&rcu_data); /* * Strictly, we care here about the case where the current CPU is - * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask + * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask * not being up to date. So arch_spin_is_locked() might have a * false positive if it's held by some *other* CPU, but that's * OK because that just means a false *negative* on the warning. @@ -4152,25 +4298,6 @@ static bool rcu_init_invoked(void) } /* - * Near the end of the offline process. Trace the fact that this CPU - * is going offline. - */ -int rcutree_dying_cpu(unsigned int cpu) -{ - bool blkd; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp = rdp->mynode; - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); - trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), - blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); - return 0; -} - -/* * All CPUs for the specified rcu_node structure have gone offline, * and all tasks that were preempted within an RCU read-side critical * section while running on one of those CPUs have since exited their RCU @@ -4216,23 +4343,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) } /* - * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup. - * There can only be one CPU hotplug operation at a time, so no need for - * explicit locking. - */ -int rcutree_dead_cpu(unsigned int cpu) -{ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); - // Stop-machine done, so allow nohz_full to disable tick. - tick_dep_clear(TICK_DEP_BIT_RCU); - return 0; -} - -/* * Propagate ->qsinitmask bits up the rcu_node tree to account for the * first CPU in a given leaf rcu_node structure coming online. The caller * must hold the corresponding leaf rcu_node ->lock with interrupts @@ -4385,29 +4495,6 @@ int rcutree_online_cpu(unsigned int cpu) } /* - * Near the beginning of the process. The CPU is still very much alive - * with pretty much all services enabled. - */ -int rcutree_offline_cpu(unsigned int cpu) -{ - unsigned long flags; - struct rcu_data *rdp; - struct rcu_node *rnp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->ffmask &= ~rdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - rcutree_affinity_setting(cpu, cpu); - - // nohz_full CPUs need the tick for stop-machine to work quickly - tick_dep_set(TICK_DEP_BIT_RCU); - return 0; -} - -/* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that * incoming CPUs are not allowed to use RCU read-side critical sections @@ -4418,8 +4505,10 @@ int rcutree_offline_cpu(unsigned int cpu) * from the incoming CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. * This incoming CPU must not have enabled interrupts yet. + * + * This mirrors the effects of rcutree_report_cpu_dead(). */ -void rcu_cpu_starting(unsigned int cpu) +void rcutree_report_cpu_starting(unsigned int cpu) { unsigned long mask; struct rcu_data *rdp; @@ -4473,14 +4562,21 @@ void rcu_cpu_starting(unsigned int cpu) * Note that this function is special in that it is invoked directly * from the outgoing CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * + * This mirrors the effect of rcutree_report_cpu_starting(). */ -void rcu_report_dead(unsigned int cpu) +void rcutree_report_cpu_dead(void) { - unsigned long flags, seq_flags; + unsigned long flags; unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + /* + * IRQS must be disabled from now on and until the CPU dies, or an interrupt + * may introduce a new READ-side while it is actually off the QS masks. + */ + lockdep_assert_irqs_disabled(); // Do any dangling deferred wakeups. do_nocb_deferred_wakeup(rdp); @@ -4488,7 +4584,6 @@ void rcu_report_dead(unsigned int cpu) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - local_irq_save(seq_flags); arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4502,8 +4597,6 @@ void rcu_report_dead(unsigned int cpu) WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(seq_flags); - rdp->cpu_started = false; } @@ -4558,7 +4651,60 @@ void rcutree_migrate_callbacks(int cpu) cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); } -#endif + +/* + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. + */ +int rcutree_dead_cpu(unsigned int cpu) +{ + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); + return 0; +} + +/* + * Near the end of the offline process. Trace the fact that this CPU + * is going offline. + */ +int rcutree_dying_cpu(unsigned int cpu) +{ + bool blkd; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_node *rnp = rdp->mynode; + + blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); + return 0; +} + +/* + * Near the beginning of the process. The CPU is still very much alive + * with pretty much all services enabled. + */ +int rcutree_offline_cpu(unsigned int cpu) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + rcutree_affinity_setting(cpu, cpu); + + // nohz_full CPUs need the tick for stop-machine to work quickly + tick_dep_set(TICK_DEP_BIT_RCU); + return 0; +} +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* * On non-huge systems, use expedited RCU grace periods to make suspend @@ -4931,6 +5077,7 @@ static void __init kfree_rcu_batch_init(void) { int cpu; int i, j; + struct shrinker *kfree_rcu_shrinker; /* Clamp it to [0:100] seconds interval. */ if (rcu_delay_page_cache_fill_msec < 0 || @@ -4962,8 +5109,17 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } - if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree")) - pr_err("Failed to register kfree_rcu() shrinker!\n"); + + kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree"); + if (!kfree_rcu_shrinker) { + pr_err("Failed to allocate kfree_rcu() shrinker!\n"); + return; + } + + kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count; + kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan; + + shrinker_register(kfree_rcu_shrinker); } void __init rcu_init(void) @@ -4990,7 +5146,7 @@ void __init rcu_init(void) pm_notifier(rcu_pm_notify, 0); WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); + rcutree_report_cpu_starting(cpu); rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 192536916f9a..e9821a8422db 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -386,6 +386,10 @@ struct rcu_state { /* in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ + int nr_fqs_jiffies_stall; /* Number of fqs loops after + * which read jiffies and set + * jiffies_stall. Stall + * warnings disabled if !0. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. */ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8239b39d945b..2ac440bc7e10 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -173,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) return ret; } - /* * Report the exit from RCU read-side critical section for the last task * that queued itself during or before the current expedited preemptible-RCU @@ -201,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - swake_up_one(&rcu_state.expedited_wq); + swake_up_one_online(&rcu_state.expedited_wq); } break; } @@ -621,10 +620,14 @@ static void synchronize_rcu_expedited_wait(void) } for (;;) { + unsigned long j; + if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; if (rcu_stall_is_suppressed()) continue; + j = jiffies; + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); @@ -647,7 +650,7 @@ static void synchronize_rcu_expedited_wait(void) } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rcu_state.expedited_sequence, + j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask), ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 5598212d1f27..4efbf7333d4e 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1396,13 +1396,6 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return count ? count : SHRINK_STOP; } - -static struct shrinker lazy_rcu_shrinker = { - .count_objects = lazy_rcu_shrink_count, - .scan_objects = lazy_rcu_shrink_scan, - .batch = 0, - .seeks = DEFAULT_SEEKS, -}; #endif // #ifdef CONFIG_RCU_LAZY void __init rcu_init_nohz(void) @@ -1410,6 +1403,7 @@ void __init rcu_init_nohz(void) int cpu; struct rcu_data *rdp; const struct cpumask *cpumask = NULL; + struct shrinker * __maybe_unused lazy_rcu_shrinker; #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) @@ -1436,8 +1430,15 @@ void __init rcu_init_nohz(void) return; #ifdef CONFIG_RCU_LAZY - if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy")) - pr_err("Failed to register lazy_rcu shrinker!\n"); + lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy"); + if (!lazy_rcu_shrinker) { + pr_err("Failed to allocate lazy_rcu shrinker!\n"); + } else { + lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count; + lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan; + + shrinker_register(lazy_rcu_shrinker); + } #endif // #ifdef CONFIG_RCU_LAZY if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6f06dc12904a..5d666428546b 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -8,6 +8,7 @@ */ #include <linux/kvm_para.h> +#include <linux/rcu_notifier.h> ////////////////////////////////////////////////////////////////////////////// // @@ -149,12 +150,17 @@ static void panic_on_rcu_stall(void) /** * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period * + * To perform the reset request from the caller, disable stall detection until + * 3 fqs loops have passed. This is required to ensure a fresh jiffies is + * loaded. It should be safe to do from the fqs loop as enough timer + * interrupts and context switches should have passed. + * * The caller must disable hard irqs. */ void rcu_cpu_stall_reset(void) { - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3); + WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX); } ////////////////////////////////////////////////////////////////////////////// @@ -170,6 +176,7 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0); WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); @@ -534,16 +541,16 @@ static void rcu_check_gp_kthread_starvation(void) data_race(READ_ONCE(rcu_state.gp_state)), gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu); if (gpk) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); - if (cpu >= 0) { - if (cpu_is_offline(cpu)) { - pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); - } else { - pr_err("Stack dump where RCU GP kthread last ran:\n"); - dump_cpu_task(cpu); - } + if (cpu_is_offline(cpu)) { + pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); + } else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) { + pr_err("Stack dump where RCU GP kthread last ran:\n"); + dump_cpu_task(cpu); } wake_up_process(gpk); } @@ -711,7 +718,7 @@ static void print_cpu_stall(unsigned long gps) static void check_cpu_stall(struct rcu_data *rdp) { - bool didstall = false; + bool self_detected; unsigned long gs1; unsigned long gs2; unsigned long gps; @@ -725,6 +732,16 @@ static void check_cpu_stall(struct rcu_data *rdp) !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); + + /* + * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS + * loop has to set jiffies to ensure a non-stale jiffies value. This + * is required to have good jiffies value after coming out of long + * breaks of jiffies updates. Not doing so can cause false positives. + */ + if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0) + return; + j = jiffies; /* @@ -758,10 +775,10 @@ static void check_cpu_stall(struct rcu_data *rdp) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; jn = jiffies + ULONG_MAX / 2; + self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask; if (rcu_gp_in_progress() && - (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - /* * If a virtual machine is stopped by the host it can look to * the watchdog like an RCU stall. Check to see if the host @@ -770,39 +787,28 @@ static void check_cpu_stall(struct rcu_data *rdp) if (kvm_check_and_clear_guest_paused()) return; - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(gps); - if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) - rcu_ftrace_dump(DUMP_ALL); - didstall = true; - - } else if (rcu_gp_in_progress() && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* - * If a virtual machine is stopped by the host it can look to - * the watchdog like an RCU stall. Check to see if the host - * stopped the vm. - */ - if (kvm_check_and_clear_guest_paused()) - return; + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); + if (self_detected) { + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(gps); + } else { + /* They had a few time units to dump stack, so complain. */ + print_other_cpu_stall(gs2, gps); + } - /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2, gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); - didstall = true; - } - if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) { - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - WRITE_ONCE(rcu_state.jiffies_stall, jn); + + if (READ_ONCE(rcu_state.jiffies_stall) == jn) { + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rcu_state.jiffies_stall, jn); + } } } ////////////////////////////////////////////////////////////////////////////// // -// RCU forward-progress mechanisms, including of callback invocation. +// RCU forward-progress mechanisms, including for callback invocation. /* @@ -1054,3 +1060,67 @@ static int __init rcu_sysrq_init(void) return 0; } early_initcall(rcu_sysrq_init); + +#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER + +////////////////////////////////////////////////////////////////////////////// +// +// RCU CPU stall-warning notifiers + +static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list); + +/** + * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier + * @n: Entry to add. + * + * Adds an RCU CPU stall notifier to an atomic notifier chain. + * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or + * friends. The @data will be the duration of the stalled grace period, + * in jiffies, coerced to a void* pointer. + * + * Returns 0 on success, %-EEXIST on error. + */ +int rcu_stall_chain_notifier_register(struct notifier_block *n) +{ + int rcsn = rcu_cpu_stall_notifiers; + + WARN(1, "Adding %pS() to RCU stall notifier list (%s).\n", n->notifier_call, + rcsn ? "possibly suppressing RCU CPU stall warnings" : "failed, so all is well"); + if (rcsn) + return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n); + return -EEXIST; +} +EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register); + +/** + * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier + * @n: Entry to add. + * + * Removes an RCU CPU stall notifier from an atomic notifier chain. + * + * Returns zero on success, %-ENOENT on failure. + */ +int rcu_stall_chain_notifier_unregister(struct notifier_block *n) +{ + return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n); +} +EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister); + +/* + * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in the RCU CPU stall notifier chain in turn, which + * is an atomic call chain. See atomic_notifier_call_chain() for more + * information. + * + * This is for use within RCU, hence the omission of the extra asterisk + * to indicate a non-kerneldoc format header comment. + */ +int rcu_stall_notifier_call_chain(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v); +} + +#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 19bf6fa3ee6a..46aaaa9fe339 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -25,6 +25,7 @@ #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> +#include <linux/torture.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/percpu.h> @@ -524,22 +525,28 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); do { } while (0) #endif -#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) /* Get rcutorture access to sched_setaffinity(). */ -long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { int ret; ret = sched_setaffinity(pid, in_mask); - WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret); + WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret); return ret; } -EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); +EXPORT_SYMBOL_GPL(torture_sched_setaffinity); #endif +int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful) +EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers); + #ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_ftrace_dump __read_mostly; module_param(rcu_cpu_stall_ftrace_dump, int, 0644); +#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER +module_param(rcu_cpu_stall_notifiers, int, 0444); +#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); diff --git a/kernel/reboot.c b/kernel/reboot.c index 3bba88c7ffc6..22c16e2564cc 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -55,9 +55,18 @@ struct sys_off_handler { enum sys_off_mode mode; bool blocking; void *list; + struct device *dev; }; /* + * This variable is used to indicate if a halt was initiated instead of a + * reboot when the reboot call was invoked with LINUX_REBOOT_CMD_POWER_OFF, but + * the system cannot be powered off. This allowes kernel_halt() to notify users + * of that. + */ +static bool poweroff_fallback_to_halt; + +/* * Temporary stub that prevents linkage failure while we're in process * of removing all uses of legacy pm_power_off() around the kernel. */ @@ -74,6 +83,7 @@ void __weak (*pm_power_off)(void); void emergency_restart(void) { kmsg_dump(KMSG_DUMP_EMERG); + system_state = SYSTEM_RESTART; machine_emergency_restart(); } EXPORT_SYMBOL_GPL(emergency_restart); @@ -295,7 +305,10 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); migrate_to_reboot_cpu(); syscore_shutdown(); - pr_emerg("System halted\n"); + if (poweroff_fallback_to_halt) + pr_emerg("Power off not available: System halted instead\n"); + else + pr_emerg("System halted\n"); kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_halt(); } @@ -323,6 +336,7 @@ static int sys_off_notify(struct notifier_block *nb, data.cb_data = handler->cb_data; data.mode = mode; data.cmd = cmd; + data.dev = handler->dev; return handler->sys_off_cb(&data); } @@ -510,6 +524,7 @@ int devm_register_sys_off_handler(struct device *dev, handler = register_sys_off_handler(mode, priority, callback, cb_data); if (IS_ERR(handler)) return PTR_ERR(handler); + handler->dev = dev; return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler, handler); @@ -728,8 +743,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) { + poweroff_fallback_to_halt = true; cmd = LINUX_REBOOT_CMD_HALT; + } mutex_lock(&system_transition_mutex); switch (cmd) { @@ -953,21 +970,24 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms) } /** - * hw_protection_shutdown - Trigger an emergency system poweroff + * __hw_protection_shutdown - Trigger an emergency system shutdown or reboot * - * @reason: Reason of emergency shutdown to be printed. - * @ms_until_forced: Time to wait for orderly shutdown before tiggering a - * forced shudown. Negative value disables the forced - * shutdown. + * @reason: Reason of emergency shutdown or reboot to be printed. + * @ms_until_forced: Time to wait for orderly shutdown or reboot before + * triggering it. Negative value disables the forced + * shutdown or reboot. + * @shutdown: If true, indicates that a shutdown will happen + * after the critical tempeature is reached. + * If false, indicates that a reboot will happen + * after the critical tempeature is reached. * - * Initiate an emergency system shutdown in order to protect hardware from - * further damage. Usage examples include a thermal protection or a voltage or - * current regulator failures. - * NOTE: The request is ignored if protection shutdown is already pending even - * if the previous request has given a large timeout for forced shutdown. - * Can be called from any context. + * Initiate an emergency system shutdown or reboot in order to protect + * hardware from further damage. Usage examples include a thermal protection. + * NOTE: The request is ignored if protection shutdown or reboot is already + * pending even if the previous request has given a large timeout for forced + * shutdown/reboot. */ -void hw_protection_shutdown(const char *reason, int ms_until_forced) +void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown) { static atomic_t allow_proceed = ATOMIC_INIT(1); @@ -982,9 +1002,12 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced) * orderly_poweroff failure */ hw_failure_emergency_poweroff(ms_until_forced); - orderly_poweroff(true); + if (shutdown) + orderly_poweroff(true); + else + orderly_reboot(); } -EXPORT_SYMBOL_GPL(hw_protection_shutdown); +EXPORT_SYMBOL_GPL(__hw_protection_shutdown); static int __init reboot_setup(char *str) { diff --git a/kernel/relay.c b/kernel/relay.c index 83fe0325cde1..a8e90e98bf2c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1073,167 +1073,6 @@ static ssize_t relay_file_read(struct file *filp, return written; } -static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) -{ - rbuf->bytes_consumed += bytes_consumed; - - if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) { - relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1); - rbuf->bytes_consumed %= rbuf->chan->subbuf_size; - } -} - -static void relay_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - struct rchan_buf *rbuf; - - rbuf = (struct rchan_buf *)page_private(buf->page); - relay_consume_bytes(rbuf, buf->private); -} - -static const struct pipe_buf_operations relay_pipe_buf_ops = { - .release = relay_pipe_buf_release, - .try_steal = generic_pipe_buf_try_steal, - .get = generic_pipe_buf_get, -}; - -static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) -{ -} - -/* - * subbuf_splice_actor - splice up to one subbuf's worth of data - */ -static ssize_t subbuf_splice_actor(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags, - int *nonpad_ret) -{ - unsigned int pidx, poff, total_len, subbuf_pages, nr_pages; - struct rchan_buf *rbuf = in->private_data; - unsigned int subbuf_size = rbuf->chan->subbuf_size; - uint64_t pos = (uint64_t) *ppos; - uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size; - size_t read_start = (size_t) do_div(pos, alloc_size); - size_t read_subbuf = read_start / subbuf_size; - size_t padding = rbuf->padding[read_subbuf]; - size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct splice_pipe_desc spd = { - .pages = pages, - .nr_pages = 0, - .nr_pages_max = PIPE_DEF_BUFFERS, - .partial = partial, - .ops = &relay_pipe_buf_ops, - .spd_release = relay_page_release, - }; - ssize_t ret; - - if (rbuf->subbufs_produced == rbuf->subbufs_consumed) - return 0; - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - /* - * Adjust read len, if longer than what is available - */ - if (len > (subbuf_size - read_start % subbuf_size)) - len = subbuf_size - read_start % subbuf_size; - - subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; - pidx = (read_start / PAGE_SIZE) % subbuf_pages; - poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max); - - for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { - unsigned int this_len, this_end, private; - unsigned int cur_pos = read_start + total_len; - - if (!len) - break; - - this_len = min_t(unsigned long, len, PAGE_SIZE - poff); - private = this_len; - - spd.pages[spd.nr_pages] = rbuf->page_array[pidx]; - spd.partial[spd.nr_pages].offset = poff; - - this_end = cur_pos + this_len; - if (this_end >= nonpad_end) { - this_len = nonpad_end - cur_pos; - private = this_len + padding; - } - spd.partial[spd.nr_pages].len = this_len; - spd.partial[spd.nr_pages].private = private; - - len -= this_len; - total_len += this_len; - poff = 0; - pidx = (pidx + 1) % subbuf_pages; - - if (this_end >= nonpad_end) { - spd.nr_pages++; - break; - } - } - - ret = 0; - if (!spd.nr_pages) - goto out; - - ret = *nonpad_ret = splice_to_pipe(pipe, &spd); - if (ret < 0 || ret < total_len) - goto out; - - if (read_start + ret == nonpad_end) - ret += padding; - -out: - splice_shrink_spd(&spd); - return ret; -} - -static ssize_t relay_file_splice_read(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - ssize_t spliced; - int ret; - int nonpad_ret = 0; - - ret = 0; - spliced = 0; - - while (len && !spliced) { - ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); - if (ret < 0) - break; - else if (!ret) { - if (flags & SPLICE_F_NONBLOCK) - ret = -EAGAIN; - break; - } - - *ppos += ret; - if (ret > len) - len = 0; - else - len -= ret; - spliced += nonpad_ret; - nonpad_ret = 0; - } - - if (spliced) - return spliced; - - return ret; -} const struct file_operations relay_file_operations = { .open = relay_file_open, @@ -1242,6 +1081,5 @@ const struct file_operations relay_file_operations = { .read = relay_file_read, .llseek = no_llseek, .release = relay_file_release, - .splice_read = relay_file_splice_read, }; EXPORT_SYMBOL_GPL(relay_file_operations); diff --git a/kernel/resource.c b/kernel/resource.c index b1763b2fd7ef..fcbca39dbc45 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -27,6 +27,8 @@ #include <linux/mount.h> #include <linux/resource_ext.h> #include <uapi/linux/magic.h> +#include <linux/string.h> +#include <linux/vmalloc.h> #include <asm/io.h> @@ -56,33 +58,17 @@ struct resource_constraint { static DEFINE_RWLOCK(resource_lock); -static struct resource *next_resource(struct resource *p) +static struct resource *next_resource(struct resource *p, bool skip_children) { - if (p->child) + if (!skip_children && p->child) return p->child; while (!p->sibling && p->parent) p = p->parent; return p->sibling; } -static struct resource *next_resource_skip_children(struct resource *p) -{ - while (!p->sibling && p->parent) - p = p->parent; - return p->sibling; -} - #define for_each_resource(_root, _p, _skip_children) \ - for ((_p) = (_root)->child; (_p); \ - (_p) = (_skip_children) ? next_resource_skip_children(_p) : \ - next_resource(_p)) - -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct resource *p = v; - (*pos)++; - return (void *)next_resource(p); -} + for ((_p) = (_root)->child; (_p); (_p) = next_resource(_p, _skip_children)) #ifdef CONFIG_PROC_FS @@ -91,14 +77,28 @@ enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { - struct resource *p = pde_data(file_inode(m->file)); - loff_t l = 0; + struct resource *root = pde_data(file_inode(m->file)); + struct resource *p; + loff_t l = *pos; + read_lock(&resource_lock); - for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) - ; + for_each_resource(root, p, false) { + if (l-- == 0) + break; + } + return p; } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + + (*pos)++; + + return (void *)next_resource(p, false); +} + static void r_stop(struct seq_file *m, void *v) __releases(resource_lock) { @@ -336,7 +336,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for (p = iomem_resource.child; p; p = next_resource(p)) { + for_each_resource(&iomem_resource, p, false) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -432,6 +432,61 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, } /* + * This function, being a variant of walk_system_ram_res(), calls the @func + * callback against all memory ranges of type System RAM which are marked as + * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from + * higher to lower. + */ +int walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + struct resource res, *rams; + int rams_size = 16, i; + unsigned long flags; + int ret = -1; + + /* create a list */ + rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL); + if (!rams) + return ret; + + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + i = 0; + while ((start < end) && + (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) { + if (i >= rams_size) { + /* re-alloc */ + struct resource *rams_new; + + rams_new = kvrealloc(rams, rams_size * sizeof(struct resource), + (rams_size + 16) * sizeof(struct resource), + GFP_KERNEL); + if (!rams_new) + goto out; + + rams = rams_new; + rams_size += 16; + } + + rams[i].start = res.start; + rams[i++].end = res.end; + + start = res.end + 1; + } + + /* go reverse */ + for (i--; i >= 0; i--) { + ret = (*func)(&rams[i], arg); + if (ret) + break; + } + +out: + kvfree(rams); + return ret; +} + +/* * This function calls the @func callback against all memory ranges, which * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY. */ @@ -1641,13 +1696,12 @@ __setup("reserve=", reserve_setup); */ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) { - struct resource *p = &iomem_resource; resource_size_t end = addr + size - 1; + struct resource *p; int err = 0; - loff_t l; read_lock(&resource_lock); - for (p = p->child; p ; p = r_next(NULL, p, &l)) { + for_each_resource(&iomem_resource, p, false) { /* * We can probably skip the resources without * IORESOURCE_IO attribute? @@ -1847,8 +1901,8 @@ get_free_mem_region(struct device *dev, struct resource *base, write_lock(&resource_lock); for (addr = gfr_start(base, size, align, flags); - gfr_continue(base, addr, size, flags); - addr = gfr_next(addr, size, flags)) { + gfr_continue(base, addr, align, flags); + addr = gfr_next(addr, align, flags)) { if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) != REGION_DISJOINT) continue; diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 99bdd96f454f..80a3df49ab47 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -34,7 +34,6 @@ #include <linux/nospec.h> #include <linux/proc_fs.h> #include <linux/psi.h> -#include <linux/psi.h> #include <linux/ptrace_api.h> #include <linux/sched_clock.h> #include <linux/security.h> diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2299a5cfbfb9..9116bcc90346 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -57,6 +57,7 @@ #include <linux/profile.h> #include <linux/psi.h> #include <linux/rcuwait_api.h> +#include <linux/rseq.h> #include <linux/sched/wake_q.h> #include <linux/scs.h> #include <linux/slab.h> @@ -85,7 +86,6 @@ #include "sched.h" #include "stats.h" -#include "autogroup.h" #include "autogroup.h" #include "pelt.h" @@ -114,6 +114,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -919,14 +920,13 @@ static bool set_nr_if_polling(struct task_struct *p) struct thread_info *ti = task_thread_info(p); typeof(ti->flags) val = READ_ONCE(ti->flags); - for (;;) { + do { if (!(val & _TIF_POLLING_NRFLAG)) return false; if (val & _TIF_NEED_RESCHED) return true; - if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)) - break; - } + } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)); + return true; } @@ -1132,6 +1132,28 @@ static void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; + /* + * Set TIF_NEED_RESCHED and send an IPI if in the non-polling + * part of the idle loop. This forces an exit from the idle loop + * and a round trip to schedule(). Now this could be optimized + * because a simple new idle loop iteration is enough to + * re-evaluate the next tick. Provided some re-ordering of tick + * nohz functions that would need to follow TIF_NR_POLLING + * clearing: + * + * - On most archs, a simple fetch_or on ti::flags with a + * "0" value would be enough to know if an IPI needs to be sent. + * + * - x86 needs to perform a last need_resched() check between + * monitor and mwait which doesn't take timers into account. + * There a dedicated TIF_TIMER flag would be required to + * fetch_or here and be checked along with TIF_NEED_RESCHED + * before mwait(). + * + * However, remote timer enqueue is not such a frequent event + * and testing of the above solutions didn't appear to report + * much benefits. + */ if (set_nr_and_not_polling(rq->idle)) smp_send_reschedule(cpu); else @@ -1480,16 +1502,12 @@ static void __uclamp_update_util_min_rt_default(struct task_struct *p) static void uclamp_update_util_min_rt_default(struct task_struct *p) { - struct rq_flags rf; - struct rq *rq; - if (!rt_task(p)) return; /* Protect updates to p->uclamp_* */ - rq = task_rq_lock(p, &rf); + guard(task_rq_lock)(p); __uclamp_update_util_min_rt_default(p); - task_rq_unlock(rq, p, &rf); } static inline struct uclamp_se @@ -1785,9 +1803,8 @@ static void uclamp_update_root_tg(void) uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); - rcu_read_lock(); + guard(rcu)(); cpu_util_update_eff(&root_task_group.css); - rcu_read_unlock(); } #else static void uclamp_update_root_tg(void) { } @@ -1814,10 +1831,9 @@ static void uclamp_sync_util_min_rt_default(void) smp_mb__after_spinlock(); read_unlock(&tasklist_lock); - rcu_read_lock(); + guard(rcu)(); for_each_process_thread(g, p) uclamp_update_util_min_rt_default(p); - rcu_read_unlock(); } static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, @@ -2131,12 +2147,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) enqueue_task(rq, p, flags); - p->on_rq = TASK_ON_RQ_QUEUED; + WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); } void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; + WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); dequeue_task(rq, p, flags); } @@ -2218,10 +2236,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio); } -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { if (p->sched_class == rq->curr->sched_class) - rq->curr->sched_class->check_preempt_curr(rq, p, flags); + rq->curr->sched_class->wakeup_preempt(rq, p, flags); else if (sched_class_above(p->sched_class, rq->curr->sched_class)) resched_curr(rq); @@ -2239,31 +2257,21 @@ int __task_state_match(struct task_struct *p, unsigned int state) if (READ_ONCE(p->__state) & state) return 1; -#ifdef CONFIG_PREEMPT_RT if (READ_ONCE(p->saved_state) & state) return -1; -#endif + return 0; } static __always_inline int task_state_match(struct task_struct *p, unsigned int state) { -#ifdef CONFIG_PREEMPT_RT - int match; - /* - * Serialize against current_save_and_set_rtlock_wait_state() and - * current_restore_rtlock_saved_state(). + * Serialize against current_save_and_set_rtlock_wait_state(), + * current_restore_rtlock_saved_state(), and __refrigerator(). */ - raw_spin_lock_irq(&p->pi_lock); - match = __task_state_match(p, state); - raw_spin_unlock_irq(&p->pi_lock); - - return match; -#else + guard(raw_spinlock_irq)(&p->pi_lock); return __task_state_match(p, state); -#endif } /* @@ -2417,10 +2425,9 @@ void migrate_disable(void) return; } - preempt_disable(); + guard(preempt)(); this_rq()->nr_pinned++; p->migration_disabled = 1; - preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_disable); @@ -2444,7 +2451,7 @@ void migrate_enable(void) * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). */ - preempt_disable(); + guard(preempt)(); if (p->cpus_ptr != &p->cpus_mask) __set_cpus_allowed_ptr(p, &ac); /* @@ -2455,7 +2462,6 @@ void migrate_enable(void) barrier(); p->migration_disabled = 0; this_rq()->nr_pinned--; - preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -2527,7 +2533,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq_lock(rq, rf); WARN_ON_ONCE(task_cpu(p) != new_cpu); activate_task(rq, p, 0); - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); return rq; } @@ -2664,9 +2670,11 @@ static int migration_cpu_stop(void *data) * it. */ WARN_ON_ONCE(!pending->stop_pending); + preempt_disable(); task_rq_unlock(rq, p, &rf); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); + preempt_enable(); return 0; } out: @@ -2986,12 +2994,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag complete = true; } + preempt_disable(); task_rq_unlock(rq, p, rf); - if (push_task) { stop_one_cpu_nowait(rq->cpu, push_cpu_stop, p, &rq->push_work); } + preempt_enable(); if (complete) complete_all(&pending->done); @@ -3057,12 +3066,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag if (flags & SCA_MIGRATE_ENABLE) p->migration_flags &= ~MDF_PUSH; + preempt_disable(); task_rq_unlock(rq, p, rf); - if (!stop_pending) { stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, &pending->arg, &pending->stop_work); } + preempt_enable(); if (flags & SCA_MIGRATE_ENABLE) return 0; @@ -3409,7 +3419,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); - check_preempt_curr(dst_rq, p, 0); + wakeup_preempt(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); rq_unpin_lock(src_rq, &srf); @@ -3516,13 +3526,11 @@ out: */ void kick_process(struct task_struct *p) { - int cpu; + guard(preempt)(); + int cpu = task_cpu(p); - preempt_disable(); - cpu = task_cpu(p); if ((cpu != smp_processor_id()) && task_curr(p)) smp_send_reschedule(cpu); - preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); @@ -3785,7 +3793,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, } activate_task(rq, p, en_flags); - check_preempt_curr(rq, p, wake_flags); + wakeup_preempt(rq, p, wake_flags); ttwu_do_wakeup(p); @@ -3809,12 +3817,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (rq->avg_idle > max) rq->avg_idle = max; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle / 2; - rq->idle_stamp = 0; } #endif + + p->dl_server = NULL; } /* @@ -3856,7 +3863,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) * it should preempt the task that is current now. */ update_rq_clock(rq); - check_preempt_curr(rq, p, wake_flags); + wakeup_preempt(rq, p, wake_flags); } ttwu_do_wakeup(p); ret = 1; @@ -3956,6 +3963,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } +/* + * Whether CPUs are share cache resources, which means LLC on non-cluster + * machines and LLC tag or L2 on machines with clusters. + */ +bool cpus_share_resources(int this_cpu, int that_cpu) +{ + if (this_cpu == that_cpu) + return true; + + return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); +} + static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* @@ -4036,13 +4055,17 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * The caller holds p::pi_lock if p != current or has preemption * disabled when p == current. * - * The rules of PREEMPT_RT saved_state: + * The rules of saved_state: * * The related locking code always holds p::pi_lock when updating * p::saved_state, which means the code is fully serialized in both cases. * - * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other - * bits set. This allows to distinguish all wakeup scenarios. + * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. + * No other bits set. This allows to distinguish all wakeup scenarios. + * + * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This + * allows us to prevent early wakeup of tasks before they can be run on + * asymmetric ISA architectures (eg ARMv9). */ static __always_inline bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) @@ -4056,13 +4079,13 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) *success = !!(match = __task_state_match(p, state)); -#ifdef CONFIG_PREEMPT_RT /* * Saved state preserves the task state across blocking on - * an RT lock. If the state matches, set p::saved_state to - * TASK_RUNNING, but do not wake the task because it waits - * for a lock wakeup. Also indicate success because from - * the regular waker's point of view this has succeeded. + * an RT lock or TASK_FREEZABLE tasks. If the state matches, + * set p::saved_state to TASK_RUNNING, but do not wake the task + * because it waits for a lock wakeup or __thaw_task(). Also + * indicate success because from the regular waker's point of + * view this has succeeded. * * After acquiring the lock the task will restore p::__state * from p::saved_state which ensures that the regular @@ -4072,7 +4095,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) */ if (match < 0) p->saved_state = TASK_RUNNING; -#endif + return match > 0; } @@ -4254,7 +4277,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). * - * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). + * A similar smp_rmb() lives in __task_needs_rq_lock(). */ smp_rmb(); if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) @@ -4513,10 +4536,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) memset(&p->stats, 0, sizeof(p->stats)); #endif - RB_CLEAR_NODE(&p->dl.rb_node); - init_dl_task_timer(&p->dl); - init_dl_inactive_task_timer(&p->dl); - __dl_clear_params(p); + init_dl_entity(&p->dl); INIT_LIST_HEAD(&p->rt.run_list); p->rt.timeout = 0; @@ -4871,7 +4891,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); - check_preempt_curr(rq, p, WF_FORK); + wakeup_preempt(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* @@ -5374,8 +5394,6 @@ context_switch(struct rq *rq, struct task_struct *prev, /* switch_mm_cid() requires the memory barriers above. */ switch_mm_cid(rq, prev, next); - rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the stack. */ @@ -5916,8 +5934,7 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); - if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) - && in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); } @@ -6011,12 +6028,27 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = pick_next_task_idle(rq); } + /* + * This is the fast path; it cannot be a DL server pick; + * therefore even if @p == @prev, ->dl_server must be NULL. + */ + if (p->dl_server) + p->dl_server = NULL; + return p; } restart: put_prev_task_balance(rq, prev, rf); + /* + * We've updated @prev and no longer need the server link, clear it. + * Must be done before ->pick_next_task() because that can (re)set + * ->dl_server. + */ + if (prev->dl_server) + prev->dl_server = NULL; + for_each_class(class) { p = class->pick_next_task(rq); if (p) @@ -6368,8 +6400,9 @@ static void sched_core_balance(struct rq *rq) struct sched_domain *sd; int cpu = cpu_of(rq); - preempt_disable(); - rcu_read_lock(); + guard(preempt)(); + guard(rcu)(); + raw_spin_rq_unlock_irq(rq); for_each_domain(cpu, sd) { if (need_resched()) @@ -6379,8 +6412,6 @@ static void sched_core_balance(struct rq *rq) break; } raw_spin_rq_lock_irq(rq); - rcu_read_unlock(); - preempt_enable(); } static DEFINE_PER_CPU(struct balance_callback, core_balance_head); @@ -6615,6 +6646,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; update_rq_clock(rq); + rq->clock_update_flags = RQCF_UPDATED; switch_count = &prev->nivcsw; @@ -6694,8 +6726,6 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { - rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unpin_lock(rq, &rf); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); @@ -6720,22 +6750,24 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { + static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG); unsigned int task_flags; - if (task_is_running(tsk)) - return; + /* + * Establish LD_WAIT_CONFIG context to ensure none of the code called + * will use a blocking primitive -- which would lead to recursion. + */ + lock_map_acquire_try(&sched_map); task_flags = tsk->flags; /* * If a worker goes to sleep, notify and ask workqueue whether it * wants to wake up a task to maintain concurrency. */ - if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { - if (task_flags & PF_WQ_WORKER) - wq_worker_sleeping(tsk); - else - io_wq_worker_sleeping(tsk); - } + if (task_flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else if (task_flags & PF_IO_WORKER) + io_wq_worker_sleeping(tsk); /* * spinlock and rwlock must not flush block requests. This will @@ -6749,6 +6781,8 @@ static inline void sched_submit_work(struct task_struct *tsk) * make sure to submit it to avoid deadlocks. */ blk_flush_plug(tsk->plug, true); + + lock_map_release(&sched_map); } static void sched_update_worker(struct task_struct *tsk) @@ -6761,16 +6795,26 @@ static void sched_update_worker(struct task_struct *tsk) } } -asmlinkage __visible void __sched schedule(void) +static __always_inline void __schedule_loop(unsigned int sched_mode) { - struct task_struct *tsk = current; - - sched_submit_work(tsk); do { preempt_disable(); - __schedule(SM_NONE); + __schedule(sched_mode); sched_preempt_enable_no_resched(); } while (need_resched()); +} + +asmlinkage __visible void __sched schedule(void) +{ + struct task_struct *tsk = current; + +#ifdef CONFIG_RT_MUTEXES + lockdep_assert(!tsk->sched_rt_mutex); +#endif + + if (!task_is_running(tsk)) + sched_submit_work(tsk); + __schedule_loop(SM_NONE); sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); @@ -6834,11 +6878,7 @@ void __sched schedule_preempt_disabled(void) #ifdef CONFIG_PREEMPT_RT void __sched notrace schedule_rtlock(void) { - do { - preempt_disable(); - __schedule(SM_RTLOCK_WAIT); - sched_preempt_enable_no_resched(); - } while (need_resched()); + __schedule_loop(SM_RTLOCK_WAIT); } NOKPROBE_SYMBOL(schedule_rtlock); #endif @@ -7034,6 +7074,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio) #ifdef CONFIG_RT_MUTEXES +/* + * Would be more useful with typeof()/auto_type but they don't mix with + * bit-fields. Since it's a local thing, use int. Keep the generic sounding + * name such that if someone were to implement this function we get to compare + * notes. + */ +#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; }) + +void rt_mutex_pre_schedule(void) +{ + lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1)); + sched_submit_work(current); +} + +void rt_mutex_schedule(void) +{ + lockdep_assert(current->sched_rt_mutex); + __schedule_loop(SM_NONE); +} + +void rt_mutex_post_schedule(void) +{ + sched_update_worker(current); + lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0)); +} + static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) { if (pi_task) @@ -7187,9 +7253,8 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { bool queued, running; - int old_prio; - struct rq_flags rf; struct rq *rq; + int old_prio; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) return; @@ -7197,7 +7262,9 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = task_rq_lock(p, &rf); + CLASS(task_rq_lock, rq_guard)(p); + rq = rq_guard.rq; + update_rq_clock(rq); /* @@ -7208,8 +7275,9 @@ void set_user_nice(struct task_struct *p, long nice) */ if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; + return; } + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -7232,9 +7300,6 @@ void set_user_nice(struct task_struct *p, long nice) * lowered its priority, then reschedule its CPU: */ p->sched_class->prio_changed(rq, p, old_prio); - -out_unlock: - task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@ -7403,18 +7468,13 @@ int sched_core_idle_cpu(int cpu) * required to meet deadlines. */ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - enum cpu_util_type type, - struct task_struct *p) + unsigned long *min, + unsigned long *max) { - unsigned long dl_util, util, irq, max; + unsigned long util, irq, scale; struct rq *rq = cpu_rq(cpu); - max = arch_scale_cpu_capacity(cpu); - - if (!uclamp_is_used() && - type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { - return max; - } + scale = arch_scale_cpu_capacity(cpu); /* * Early check to see if IRQ/steal time saturates the CPU, can be @@ -7422,45 +7482,49 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, * update_irq_load_avg(). */ irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) - return max; + if (unlikely(irq >= scale)) { + if (min) + *min = scale; + if (max) + *max = scale; + return scale; + } + + if (min) { + /* + * The minimum utilization returns the highest level between: + * - the computed DL bandwidth needed with the IRQ pressure which + * steals time to the deadline task. + * - The minimum performance requirement for CFS and/or RT. + */ + *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); + + /* + * When an RT task is runnable and uclamp is not used, we must + * ensure that the task will run at maximum compute capacity. + */ + if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) + *min = max(*min, scale); + } /* * Because the time spend on RT/DL tasks is visible as 'lost' time to * CFS tasks and we use the same metric to track the effective * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization. - * - * CFS and RT utilization can be boosted or capped, depending on - * utilization clamp constraints requested by currently RUNNABLE - * tasks. - * When there are no CFS RUNNABLE tasks, clamps are released and - * frequency will be gracefully reduced with the utilization decay. */ util = util_cfs + cpu_util_rt(rq); - if (type == FREQUENCY_UTIL) - util = uclamp_rq_util_with(rq, util, p); - - dl_util = cpu_util_dl(rq); + util += cpu_util_dl(rq); /* - * For frequency selection we do not make cpu_util_dl() a permanent part - * of this sum because we want to use cpu_bw_dl() later on, but we need - * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such - * that we select f_max when there is no idle time. - * - * NOTE: numerical errors or stop class might cause us to not quite hit - * saturation when we should -- something for later. + * The maximum hint is a soft bandwidth requirement, which can be lower + * than the actual utilization because of uclamp_max requirements. */ - if (util + dl_util >= max) - return max; + if (max) + *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); - /* - * OTOH, for energy computation we need the estimated running time, so - * include util_dl and ignore dl_bw. - */ - if (type == ENERGY_UTIL) - util += dl_util; + if (util >= scale) + return scale; /* * There is still idle time; further improve the number by using the @@ -7471,28 +7535,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, * U' = irq + --------- * U * max */ - util = scale_irq_capacity(util, irq, max); + util = scale_irq_capacity(util, irq, scale); util += irq; - /* - * Bandwidth required by DEADLINE must always be granted while, for - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism - * to gracefully reduce the frequency when no tasks show up for longer - * periods of time. - * - * Ideally we would like to set bw_dl as min/guaranteed freq and util + - * bw_dl as requested freq. However, cpufreq is not yet ready for such - * an interface. So, we only do the latter for now. - */ - if (type == FREQUENCY_UTIL) - util += cpu_bw_dl(rq); - - return min(max, util); + return min(scale, util); } unsigned long sched_cpu_util(int cpu) { - return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL); + return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); } #endif /* CONFIG_SMP */ @@ -7507,6 +7558,21 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } +static struct task_struct *find_get_task(pid_t pid) +{ + struct task_struct *p; + guard(rcu)(); + + p = find_process_by_pid(pid); + if (likely(p)) + get_task_struct(p); + + return p; +} + +DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T), + find_get_task(pid), pid_t pid) + /* * sched_setparam() passes in -1 for its policy, to let the functions * it calls know not to change it. @@ -7544,14 +7610,11 @@ static void __setscheduler_params(struct task_struct *p, static bool check_same_owner(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred; - bool match; + guard(rcu)(); - rcu_read_lock(); pcred = __task_cred(p); - match = (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); - rcu_read_unlock(); - return match; + return (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); } /* @@ -7963,27 +8026,17 @@ static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lparam; - struct task_struct *p; - int retval; if (!param || pid < 0) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); - - if (likely(p)) { - retval = sched_setscheduler(p, policy, &lparam); - put_task_struct(p); - } + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; - return retval; + return sched_setscheduler(p, policy, &lparam); } /* @@ -8079,7 +8132,6 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, flags) { struct sched_attr attr; - struct task_struct *p; int retval; if (!uattr || pid < 0 || flags) @@ -8094,21 +8146,14 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) attr.sched_policy = SETPARAM_POLICY; - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; - if (likely(p)) { - if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) - get_params(p, &attr); - retval = sched_setattr(p, &attr); - put_task_struct(p); - } + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); - return retval; + return sched_setattr(p, &attr); } /** @@ -8126,16 +8171,17 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (pid < 0) return -EINVAL; - retval = -ESRCH; - rcu_read_lock(); + guard(rcu)(); p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy - | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (!retval) { + retval = p->policy; + if (p->sched_reset_on_fork) + retval |= SCHED_RESET_ON_FORK; } - rcu_read_unlock(); return retval; } @@ -8156,30 +8202,23 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (!param || pid < 0) return -EINVAL; - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - if (task_has_rt_policy(p)) - lp.sched_priority = p->rt_priority; - rcu_read_unlock(); + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; + } /* * This one might sleep, we cannot do it with a spinlock held ... */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; - - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; + return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; } /* @@ -8239,46 +8278,38 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, usize < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - kattr.sched_policy = p->policy; - if (p->sched_reset_on_fork) - kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - get_params(p, &kattr); - kattr.sched_flags &= SCHED_FLAG_ALL; + kattr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK - /* - * This could race with another potential updater, but this is fine - * because it'll correctly read the old or the new value. We don't need - * to guarantee who wins the race as long as it doesn't return garbage. - */ - kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; - kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ + kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif - - rcu_read_unlock(); + } return sched_attr_copy_to_user(uattr, &kattr, usize); - -out_unlock: - rcu_read_unlock(); - return retval; } #ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { - int ret = 0; - /* * If the task isn't a deadline task or admission control is * disabled then we don't care about affinity changes. @@ -8292,11 +8323,11 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) * tasks allowed to run on all the CPUs in the task's * root_domain. */ - rcu_read_lock(); + guard(rcu)(); if (!cpumask_subset(task_rq(p)->rd->span, mask)) - ret = -EBUSY; - rcu_read_unlock(); - return ret; + return -EBUSY; + + return 0; } #endif @@ -8366,39 +8397,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { struct affinity_context ac; struct cpumask *user_mask; - struct task_struct *p; int retval; - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); + CLASS(find_get_task, p)(pid); + if (!p) return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; - } + if (p->flags & PF_NO_SETAFFINITY) + return -EINVAL; if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - retval = -EPERM; - goto out_put_task; - } - rcu_read_unlock(); + guard(rcu)(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) + return -EPERM; } retval = security_task_setscheduler(p); if (retval) - goto out_put_task; + return retval; /* * With non-SMP configs, user_cpus_ptr/user_mask isn't used and @@ -8408,8 +8424,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (user_mask) { cpumask_copy(user_mask, in_mask); } else if (IS_ENABLED(CONFIG_SMP)) { - retval = -ENOMEM; - goto out_put_task; + return -ENOMEM; } ac = (struct affinity_context){ @@ -8421,8 +8436,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) retval = __sched_setaffinity(p, &ac); kfree(ac.user_mask); -out_put_task: - put_task_struct(p); return retval; } @@ -8464,28 +8477,21 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; - unsigned long flags; int retval; - rcu_read_lock(); - - retval = -ESRCH; + guard(rcu)(); p = find_process_by_pid(pid); if (!p) - goto out_unlock; + return -ESRCH; retval = security_task_getscheduler(p); if (retval) - goto out_unlock; + return retval; - raw_spin_lock_irqsave(&p->pi_lock, flags); + guard(raw_spinlock_irqsave)(&p->pi_lock); cpumask_and(mask, &p->cpus_mask, cpu_active_mask); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - -out_unlock: - rcu_read_unlock(); - return retval; + return 0; } /** @@ -8932,55 +8938,46 @@ int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; - unsigned long flags; int yielded = 0; - local_irq_save(flags); - rq = this_rq(); + scoped_guard (irqsave) { + rq = this_rq(); again: - p_rq = task_rq(p); - /* - * If we're the only runnable task on the rq and target rq also - * has only one task, there's absolutely no point in yielding. - */ - if (rq->nr_running == 1 && p_rq->nr_running == 1) { - yielded = -ESRCH; - goto out_irq; - } + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) + return -ESRCH; - double_rq_lock(rq, p_rq); - if (task_rq(p) != p_rq) { - double_rq_unlock(rq, p_rq); - goto again; - } + guard(double_rq_lock)(rq, p_rq); + if (task_rq(p) != p_rq) + goto again; - if (!curr->sched_class->yield_to_task) - goto out_unlock; + if (!curr->sched_class->yield_to_task) + return 0; - if (curr->sched_class != p->sched_class) - goto out_unlock; + if (curr->sched_class != p->sched_class) + return 0; - if (task_on_cpu(p_rq, p) || !task_is_running(p)) - goto out_unlock; + if (task_on_cpu(p_rq, p) || !task_is_running(p)) + return 0; - yielded = curr->sched_class->yield_to_task(rq, p); - if (yielded) { - schedstat_inc(rq->yld_count); - /* - * Make p's CPU reschedule; pick_next_entity takes care of - * fairness. - */ - if (preempt && rq != p_rq) - resched_curr(p_rq); + yielded = curr->sched_class->yield_to_task(rq, p); + if (yielded) { + schedstat_inc(rq->yld_count); + /* + * Make p's CPU reschedule; pick_next_entity + * takes care of fairness. + */ + if (preempt && rq != p_rq) + resched_curr(p_rq); + } } -out_unlock: - double_rq_unlock(rq, p_rq); -out_irq: - local_irq_restore(flags); - - if (yielded > 0) + if (yielded) schedule(); return yielded; @@ -9083,38 +9080,30 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { - struct task_struct *p; - unsigned int time_slice; - struct rq_flags rf; - struct rq *rq; + unsigned int time_slice = 0; int retval; if (pid < 0) return -EINVAL; - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; + scoped_guard (rcu) { + struct task_struct *p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - rq = task_rq_lock(p, &rf); - time_slice = 0; - if (p->sched_class->get_rr_interval) - time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &rf); + scoped_guard (task_rq_lock, p) { + struct rq *rq = scope.rq; + if (p->sched_class->get_rr_interval) + time_slice = p->sched_class->get_rr_interval(rq, p); + } + } - rcu_read_unlock(); jiffies_to_timespec64(time_slice, t); return 0; - -out_unlock: - rcu_read_unlock(); - return retval; } /** @@ -9173,9 +9162,9 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", - free, task_pid_nr(p), ppid, - read_task_thread_flags(p)); + pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n", + free, task_pid_nr(p), task_tgid_nr(p), + ppid, read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); @@ -9269,7 +9258,7 @@ void __init init_idle(struct task_struct *idle, int cpu) * PF_KTHREAD should already be set at this point; regardless, make it * look like a proper per-CPU kthread. */ - idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; + idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); #ifdef CONFIG_SMP @@ -9505,9 +9494,11 @@ static void balance_push(struct rq *rq) * Temporarily drop rq->lock such that we can wake-up the stop task. * Both preemption and IRQs are still disabled. */ + preempt_disable(); raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, this_cpu_ptr(&push_work)); + preempt_enable(); /* * At this point need_resched() is true and we'll take the loop in * schedule(). The next pick is obviously going to be the stop task @@ -9903,7 +9894,7 @@ struct task_group root_task_group; LIST_HEAD(task_groups); /* Cacheline aligned slab cache for task_group */ -static struct kmem_cache *task_group_cache __read_mostly; +static struct kmem_cache *task_group_cache __ro_after_init; #endif void __init sched_init(void) @@ -10013,7 +10004,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->balance_callback = &balance_push_callback; rq->active_balance = 0; rq->next_balance = jiffies; @@ -10022,8 +10013,6 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -10289,9 +10278,9 @@ void normalize_rt_tasks(void) #endif /* CONFIG_MAGIC_SYSRQ */ -#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) +#if defined(CONFIG_KGDB_KDB) /* - * These functions are only useful for the IA64 MCA handling, or kdb. + * These functions are only useful for kdb. * * They can only be called when the whole system has been * stopped - every CPU needs to be quiescent, and no scheduling @@ -10313,30 +10302,7 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } -#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ - -#ifdef CONFIG_IA64 -/** - * ia64_set_curr_task - set the current task for a given CPU. - * @cpu: the processor in question. - * @p: the task pointer to set. - * - * Description: This function must only be used when non-maskable interrupts - * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a CPU in a non-blocking manner. This function - * must be called with all CPU's synchronized, and interrupts disabled, the - * and caller must save the original value of the current task (see - * curr_task() above) and restore that value before reenabling interrupts and - * re-starting the system. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - */ -void ia64_set_curr_task(int cpu, struct task_struct *p) -{ - cpu_curr(cpu) = p; -} - -#endif +#endif /* defined(CONFIG_KGDB_KDB) */ #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ @@ -10498,17 +10464,18 @@ void sched_move_task(struct task_struct *tsk) int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct task_group *group; - struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &rf); + CLASS(task_rq_lock, rq_guard)(tsk); + rq = rq_guard.rq; + /* * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous * group changes. */ group = sched_get_task_group(tsk); if (group == tsk->sched_task_group) - goto unlock; + return; update_rq_clock(rq); @@ -10533,9 +10500,6 @@ void sched_move_task(struct task_struct *tsk) */ resched_curr(rq); } - -unlock: - task_rq_unlock(rq, tsk, &rf); } static inline struct task_group *css_tg(struct cgroup_subsys_state *css) @@ -10572,11 +10536,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new group */ - mutex_lock(&uclamp_mutex); - rcu_read_lock(); + guard(mutex)(&uclamp_mutex); + guard(rcu)(); cpu_util_update_eff(css); - rcu_read_unlock(); - mutex_unlock(&uclamp_mutex); #endif return 0; @@ -10727,8 +10689,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, static_branch_enable(&sched_uclamp_used); - mutex_lock(&uclamp_mutex); - rcu_read_lock(); + guard(mutex)(&uclamp_mutex); + guard(rcu)(); tg = css_tg(of_css(of)); if (tg->uclamp_req[clamp_id].value != req.util) @@ -10743,9 +10705,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, /* Update effective clamps to track the most restrictive value */ cpu_util_update_eff(of_css(of)); - rcu_read_unlock(); - mutex_unlock(&uclamp_mutex); - return nbytes; } @@ -10771,10 +10730,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf, u64 percent; u32 rem; - rcu_read_lock(); - tg = css_tg(seq_css(sf)); - util_clamp = tg->uclamp_req[clamp_id].value; - rcu_read_unlock(); + scoped_guard (rcu) { + tg = css_tg(seq_css(sf)); + util_clamp = tg->uclamp_req[clamp_id].value; + } if (util_clamp == SCHED_CAPACITY_SCALE) { seq_puts(sf, "max\n"); @@ -10865,11 +10824,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ - cpus_read_lock(); - mutex_lock(&cfs_constraints_mutex); + guard(cpus_read_lock)(); + guard(mutex)(&cfs_constraints_mutex); + ret = __cfs_schedulable(tg, period, quota); if (ret) - goto out_unlock; + return ret; runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; @@ -10879,39 +10839,38 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, */ if (runtime_enabled && !runtime_was_enabled) cfs_bandwidth_usage_inc(); - raw_spin_lock_irq(&cfs_b->lock); - cfs_b->period = ns_to_ktime(period); - cfs_b->quota = quota; - cfs_b->burst = burst; - __refill_cfs_bandwidth_runtime(cfs_b); + scoped_guard (raw_spinlock_irq, &cfs_b->lock) { + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; + cfs_b->burst = burst; - /* Restart the period timer (if active) to handle new period expiry: */ - if (runtime_enabled) - start_cfs_bandwidth(cfs_b); + __refill_cfs_bandwidth_runtime(cfs_b); - raw_spin_unlock_irq(&cfs_b->lock); + /* + * Restart the period timer (if active) to handle new + * period expiry: + */ + if (runtime_enabled) + start_cfs_bandwidth(cfs_b); + } for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; - struct rq_flags rf; - rq_lock_irq(rq, &rf); + guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - rq_unlock_irq(rq, &rf); } + if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); -out_unlock: - mutex_unlock(&cfs_constraints_mutex); - cpus_read_unlock(); - return ret; + return 0; } static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) @@ -11096,7 +11055,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) { - int ret; struct cfs_schedulable_data data = { .tg = tg, .period = period, @@ -11108,11 +11066,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) do_div(data.quota, NSEC_PER_USEC); } - rcu_read_lock(); - ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); } static int cpu_cfs_stat_show(struct seq_file *sf, void *v) @@ -11717,14 +11672,12 @@ int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, * are not the last task to be migrated from this cpu for this mm, so * there is no need to move src_cid to the destination cpu. */ - rcu_read_lock(); + guard(rcu)(); src_task = rcu_dereference(src_rq->curr); if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - rcu_read_unlock(); t->last_mm_cid = -1; return -1; } - rcu_read_unlock(); return src_cid; } @@ -11768,18 +11721,17 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, * the lazy-put flag, this task will be responsible for transitioning * from lazy-put flag set to MM_CID_UNSET. */ - rcu_read_lock(); - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - rcu_read_unlock(); - /* - * We observed an active task for this mm, there is therefore - * no point in moving this cid to the destination cpu. - */ - t->last_mm_cid = -1; - return -1; + scoped_guard (rcu) { + src_task = rcu_dereference(src_rq->curr); + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { + /* + * We observed an active task for this mm, there is therefore + * no point in moving this cid to the destination cpu. + */ + t->last_mm_cid = -1; + return -1; + } } - rcu_read_unlock(); /* * The src_cid is unused, so it can be unset. @@ -11852,7 +11804,6 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_ { struct rq *rq = cpu_rq(cpu); struct task_struct *t; - unsigned long flags; int cid, lazy_cid; cid = READ_ONCE(pcpu_cid->cid); @@ -11887,23 +11838,21 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_ * the lazy-put flag, that task will be responsible for transitioning * from lazy-put flag set to MM_CID_UNSET. */ - rcu_read_lock(); - t = rcu_dereference(rq->curr); - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) { - rcu_read_unlock(); - return; + scoped_guard (rcu) { + t = rcu_dereference(rq->curr); + if (READ_ONCE(t->mm_cid_active) && t->mm == mm) + return; } - rcu_read_unlock(); /* * The cid is unused, so it can be unset. * Disable interrupts to keep the window of cid ownership without rq * lock small. */ - local_irq_save(flags); - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - __mm_cid_put(mm, cid); - local_irq_restore(flags); + scoped_guard (irqsave) { + if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) + __mm_cid_put(mm, cid); + } } static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) @@ -11925,14 +11874,13 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) * snapshot associated with this cid if an active task using the mm is * observed on this rq. */ - rcu_read_lock(); - curr = rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - rcu_read_unlock(); - return; + scoped_guard (rcu) { + curr = rcu_dereference(rq->curr); + if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { + WRITE_ONCE(pcpu_cid->time, rq_clock); + return; + } } - rcu_read_unlock(); if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) return; @@ -12026,7 +11974,6 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) void sched_mm_cid_exit_signals(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12034,7 +11981,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); preempt_enable_no_resched(); /* holding spinlock */ WRITE_ONCE(t->mm_cid_active, 0); /* @@ -12044,13 +11991,11 @@ void sched_mm_cid_exit_signals(struct task_struct *t) smp_mb(); mm_cid_put(mm); t->last_mm_cid = t->mm_cid = -1; - rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_before_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12058,7 +12003,7 @@ void sched_mm_cid_before_execve(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); preempt_enable_no_resched(); /* holding spinlock */ WRITE_ONCE(t->mm_cid_active, 0); /* @@ -12068,13 +12013,11 @@ void sched_mm_cid_before_execve(struct task_struct *t) smp_mb(); mm_cid_put(mm); t->last_mm_cid = t->mm_cid = -1; - rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_after_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12082,16 +12025,16 @@ void sched_mm_cid_after_execve(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); - rq_unlock_irqrestore(rq, &rf); + scoped_guard (rq_lock_irqsave, rq) { + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 1); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + } rseq_set_notify_resume(t); } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 57c92d751bcd..95baa12a1029 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -131,7 +131,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (!dl_task_fits_capacity(p, cpu)) { cpumask_clear_cpu(cpu, later_mask); - cap = capacity_orig_of(cpu); + cap = arch_scale_cpu_capacity(cpu); if (cap > max_cap || (cpu == task_cpu(p) && cap == max_cap)) { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 4492608b7d7f..eece6244f9d2 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -47,7 +47,7 @@ struct sugov_cpu { u64 last_update; unsigned long util; - unsigned long bw_dl; + unsigned long bw_min; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -115,6 +115,32 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy) } /** + * get_capacity_ref_freq - get the reference frequency that has been used to + * correlate frequency and compute capacity for a given cpufreq policy. We use + * the CPU managing it for the arch_scale_freq_ref() call in the function. + * @policy: the cpufreq policy of the CPU in question. + * + * Return: the reference CPU frequency to compute a capacity. + */ +static __always_inline +unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy) +{ + unsigned int freq = arch_scale_freq_ref(policy->cpu); + + if (freq) + return freq; + + if (arch_scale_freq_invariant()) + return policy->cpuinfo.max_freq; + + /* + * Apply a 25% margin so that we select a higher frequency than + * the current one before the CPU is fully busy: + */ + return policy->cur + (policy->cur >> 2); +} + +/** * get_next_freq - Compute a new frequency for a given cpufreq policy. * @sg_policy: schedutil policy object to compute the new frequency for. * @util: Current CPU utilization. @@ -140,10 +166,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned long util, unsigned long max) { struct cpufreq_policy *policy = sg_policy->policy; - unsigned int freq = arch_scale_freq_invariant() ? - policy->cpuinfo.max_freq : policy->cur; + unsigned int freq; - util = map_util_perf(util); + freq = get_capacity_ref_freq(policy); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) @@ -153,14 +178,31 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(struct sugov_cpu *sg_cpu) +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long max) { - unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu); - struct rq *rq = cpu_rq(sg_cpu->cpu); + /* Add dvfs headroom to actual utilization */ + actual = map_util_perf(actual); + /* Actually we don't need to target the max performance */ + if (actual < max) + max = actual; - sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util, - FREQUENCY_UTIL, NULL); + /* + * Ensure at least minimum performance while providing more compute + * capacity when possible. + */ + return max(min, max); +} + +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) +{ + unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); + + util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); + util = max(util, boost); + sg_cpu->bw_min = min; + sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max); } /** @@ -251,18 +293,16 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, unsigned long max_cap) { - unsigned long boost; - /* No boost currently required */ if (!sg_cpu->iowait_boost) - return; + return 0; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return; + return 0; if (!sg_cpu->iowait_boost_pending) { /* @@ -271,7 +311,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return; + return 0; } } @@ -281,10 +321,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; - boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); - if (sg_cpu->util < boost) - sg_cpu->util = boost; + return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; } #ifdef CONFIG_NO_HZ_COMMON @@ -306,7 +343,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } */ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) sg_cpu->sg_policy->limits_changed = true; } @@ -314,6 +351,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, u64 time, unsigned long max_cap, unsigned int flags) { + unsigned long boost; + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -322,8 +361,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, if (!sugov_should_update_freq(sg_cpu->sg_policy, time)) return false; - sugov_get_util(sg_cpu); - sugov_iowait_apply(sg_cpu, time, max_cap); + boost = sugov_iowait_apply(sg_cpu, time, max_cap); + sugov_get_util(sg_cpu, boost); return true; } @@ -350,7 +389,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, * Except when the rq is capped by uclamp_max. */ if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { + sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && + !sg_policy->need_freq_update) { next_f = sg_policy->next_freq; /* Restore cached freq as next_freq has changed */ @@ -406,8 +446,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; - cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), - map_util_perf(sg_cpu->util), max_cap); + cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, + sg_cpu->util, max_cap); sg_cpu->sg_policy->last_freq_update_time = time; } @@ -423,9 +463,10 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); + unsigned long boost; - sugov_get_util(j_sg_cpu); - sugov_iowait_apply(j_sg_cpu, time, max_cap); + boost = sugov_iowait_apply(j_sg_cpu, time, max_cap); + sugov_get_util(j_sg_cpu, boost); util = max(j_sg_cpu->util, util); } @@ -555,6 +596,31 @@ static const struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ +#ifdef CONFIG_ENERGY_MODEL +static void rebuild_sd_workfn(struct work_struct *work) +{ + rebuild_sched_domains_energy(); +} + +static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); + +/* + * EAS shouldn't be attempted without sugov, so rebuild the sched_domains + * on governor changes to make sure the scheduler knows about it. + */ +static void sugov_eas_rebuild_sd(void) +{ + /* + * When called from the cpufreq_register_driver() path, the + * cpu_hotplug_lock is already held, so use a work item to + * avoid nested locking in rebuild_sched_domains(). + */ + schedule_work(&rebuild_sd_work); +} +#else +static inline void sugov_eas_rebuild_sd(void) { }; +#endif + struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) @@ -709,6 +775,8 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; + sugov_eas_rebuild_sd(); + out: mutex_unlock(&global_tunables_lock); return 0; @@ -750,6 +818,8 @@ static void sugov_exit(struct cpufreq_policy *policy) sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); cpufreq_disable_fast_switch(policy); + + sugov_eas_rebuild_sd(); } static int sugov_start(struct cpufreq_policy *policy) @@ -767,14 +837,6 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); - for_each_cpu(cpu, policy->cpus) { - struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); - - memset(sg_cpu, 0, sizeof(*sg_cpu)); - sg_cpu->cpu = cpu; - sg_cpu->sg_policy = sg_policy; - } - if (policy_is_shared(policy)) uu = sugov_update_shared; else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf()) @@ -785,6 +847,9 @@ static int sugov_start(struct cpufreq_policy *policy) for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + memset(sg_cpu, 0, sizeof(*sg_cpu)); + sg_cpu->cpu = cpu; + sg_cpu->sg_policy = sg_policy; cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu); } return 0; @@ -838,29 +903,3 @@ struct cpufreq_governor *cpufreq_default_governor(void) #endif cpufreq_governor_init(schedutil_gov); - -#ifdef CONFIG_ENERGY_MODEL -static void rebuild_sd_workfn(struct work_struct *work) -{ - rebuild_sched_domains_energy(); -} -static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); - -/* - * EAS shouldn't be attempted without sugov, so rebuild the sched_domains - * on governor changes to make sure the scheduler knows about it. - */ -void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - struct cpufreq_governor *old_gov) -{ - if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) { - /* - * When called from the cpufreq_register_driver() path, the - * cpu_hotplug_lock is already held, so use a work item to - * avoid nested locking in rebuild_sched_domains(). - */ - schedule_work(&rebuild_sd_work); - } - -} -#endif diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index a286e726eb4b..42c40cfdf836 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -101,6 +101,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); + cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 58b542bf2893..a04a436af8cc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -54,8 +54,14 @@ static int __init sched_dl_sysctl_init(void) late_initcall(sched_dl_sysctl_init); #endif +static bool dl_server(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server; +} + static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { + BUG_ON(dl_server(dl_se)); return container_of(dl_se, struct task_struct, dl); } @@ -64,12 +70,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) return container_of(dl_rq, struct rq, dl); } -static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = task_rq(p); + struct rq *rq = dl_se->rq; - return &rq->dl; + if (!dl_server(dl_se)) + rq = task_rq(dl_task_of(dl_se)); + + return rq; +} + +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ + return &rq_of_dl_se(dl_se)->dl; } static inline int on_dl_rq(struct sched_dl_entity *dl_se) @@ -132,7 +145,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) int i; for_each_cpu_and(i, mask, cpu_active_mask) - cap += capacity_orig_of(i); + cap += arch_scale_cpu_capacity(i); return cap; } @@ -144,7 +157,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) static inline unsigned long dl_bw_capacity(int i) { if (!sched_asym_cpucap_active() && - capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { + arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) { return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; } else { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), @@ -335,6 +348,8 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) __add_rq_bw(new_bw, &rq->dl); } +static void __dl_clear_params(struct sched_dl_entity *dl_se); + /* * The utilization of a task cannot be immediately removed from * the rq active utilization (running_bw) when the task blocks. @@ -389,12 +404,11 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). */ -static void task_non_contending(struct task_struct *p) +static void task_non_contending(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->inactive_timer; - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct rq *rq = rq_of_dl_se(dl_se); + struct dl_rq *dl_rq = &rq->dl; s64 zerolag_time; /* @@ -424,24 +438,33 @@ static void task_non_contending(struct task_struct *p) * utilization now, instead of starting a timer */ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { - if (dl_task(p)) + if (dl_server(dl_se)) { sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - - if (READ_ONCE(p->__state) == TASK_DEAD) - sub_rq_bw(&p->dl, &rq->dl); - raw_spin_lock(&dl_b->lock); - __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); - raw_spin_unlock(&dl_b->lock); - __dl_clear_params(p); + } else { + struct task_struct *p = dl_task_of(dl_se); + + if (dl_task(p)) + sub_running_bw(dl_se, dl_rq); + + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (READ_ONCE(p->__state) == TASK_DEAD) + sub_rq_bw(dl_se, &rq->dl); + raw_spin_lock(&dl_b->lock); + __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); + raw_spin_unlock(&dl_b->lock); + __dl_clear_params(dl_se); + } } return; } dl_se->dl_non_contending = 1; - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD); } @@ -468,8 +491,10 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) - put_task_struct(dl_task_of(dl_se)); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } else { /* * Since "dl_non_contending" is not set, the @@ -482,10 +507,8 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) } } -static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - struct sched_dl_entity *dl_se = &p->dl; - return rb_first_cached(&dl_rq->root) == &dl_se->rb_node; } @@ -509,7 +532,6 @@ void init_dl_rq(struct dl_rq *dl_rq) /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; - dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else @@ -553,39 +575,6 @@ static inline void dl_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); } -static void update_dl_migration(struct dl_rq *dl_rq) -{ - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { - if (!dl_rq->overloaded) { - dl_set_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 1; - } - } else if (dl_rq->overloaded) { - dl_clear_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 0; - } -} - -static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory++; - - update_dl_migration(dl_rq); -} - -static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory--; - - update_dl_migration(dl_rq); -} - #define __node_2_pdl(node) \ rb_entry((node), struct task_struct, pushable_dl_tasks) @@ -594,6 +583,11 @@ static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b) return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); } +static inline int has_pushable_dl_tasks(struct rq *rq) +{ + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); +} + /* * The list of pushable -deadline task is not a plist, like in * sched_rt.c, it is an rb-tree with tasks ordered by deadline. @@ -609,6 +603,11 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) __pushable_less); if (leftmost) rq->dl.earliest_dl.next = p->dl.deadline; + + if (!rq->dl.overloaded) { + dl_set_overload(rq); + rq->dl.overloaded = 1; + } } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -625,11 +624,11 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; RB_CLEAR_NODE(&p->pushable_dl_tasks); -} -static inline int has_pushable_dl_tasks(struct rq *rq) -{ - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); + if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) { + dl_clear_overload(rq); + rq->dl.overloaded = 0; + } } static int push_dl_task(struct rq *rq); @@ -761,9 +760,11 @@ static inline void deadline_queue_pull_task(struct rq *rq) } #endif /* CONFIG_SMP */ +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags); +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags); static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, struct rq *rq) @@ -1010,8 +1011,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) */ static void update_dl_entity(struct sched_dl_entity *dl_se) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, rq_clock(rq))) { @@ -1042,11 +1042,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se) * actually started or not (i.e., the replenishment instant is in * the future or in the past). */ -static int start_dl_timer(struct task_struct *p) +static int start_dl_timer(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->dl_timer; - struct rq *rq = task_rq(p); + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); ktime_t now, act; s64 delta; @@ -1080,13 +1080,33 @@ static int start_dl_timer(struct task_struct *p) * and observe our state. */ if (!hrtimer_is_queued(timer)) { - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD); } return 1; } +static void __push_dl_task(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, check if we need + * to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + rq_unpin_lock(rq, rf); + push_dl_task(rq); + rq_repin_lock(rq, rf); + } +#endif +} + /* * This is the bandwidth enforcement timer callback. If here, we know * a task is not on its dl_rq, since the fact that the timer was running @@ -1105,10 +1125,34 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, dl_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p; struct rq_flags rf; struct rq *rq; + if (dl_server(dl_se)) { + struct rq *rq = rq_of_dl_se(dl_se); + struct rq_flags rf; + + rq_lock(rq, &rf); + if (dl_se->dl_throttled) { + sched_clock_tick(); + update_rq_clock(rq); + + if (dl_se->server_has_tasks(dl_se)) { + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + resched_curr(rq); + __push_dl_task(rq, &rf); + } else { + replenish_dl_entity(dl_se); + } + + } + rq_unlock(rq, &rf); + + return HRTIMER_NORESTART; + } + + p = dl_task_of(dl_se); rq = task_rq_lock(p, &rf); /* @@ -1175,25 +1219,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); -#ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, check if we need - * to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) { - /* - * Nothing relies on rq->lock after this, so its safe to drop - * rq->lock. - */ - rq_unpin_lock(rq, &rf); - push_dl_task(rq); - rq_repin_lock(rq, &rf); - } -#endif + __push_dl_task(rq, &rf); unlock: task_rq_unlock(rq, p, &rf); @@ -1207,7 +1237,7 @@ unlock: return HRTIMER_NORESTART; } -void init_dl_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; @@ -1235,12 +1265,11 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) */ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1291,44 +1320,19 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) return (delta * u_act) >> BW_SHIFT; } -/* - * Update the current task's runtime statistics (provided it is still - * a -deadline task and has not been removed from the dl_rq). - */ -static void update_curr_dl(struct rq *rq) +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags); +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; - u64 delta_exec, scaled_delta_exec; - int cpu = cpu_of(rq); - u64 now; + s64 scaled_delta_exec; - if (!dl_task(curr) || !on_dl_rq(dl_se)) - return; - - /* - * Consumed budget is computed considering the time as - * observed by schedulable tasks (excluding time spent - * in hardirq context, etc.). Deadlines are instead - * computed using hard walltime. This seems to be the more - * natural solution, but the full ramifications of this - * approach need further study. - */ - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) { + if (unlikely(delta_exec <= 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; return; } - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - trace_sched_stat_runtime(curr, delta_exec, 0); - - update_current_exec_runtime(curr, now, delta_exec); - if (dl_entity_is_special(dl_se)) return; @@ -1340,10 +1344,9 @@ static void update_curr_dl(struct rq *rq) * according to current frequency and CPU maximum capacity. */ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { - scaled_delta_exec = grub_reclaim(delta_exec, - rq, - &curr->dl); + scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se); } else { + int cpu = cpu_of(rq); unsigned long scale_freq = arch_scale_freq_capacity(cpu); unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); @@ -1362,11 +1365,20 @@ throttle: (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) dl_se->dl_overrun = 1; - __dequeue_task_dl(rq, curr, 0); - if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) - enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + dequeue_dl_entity(dl_se, 0); + if (!dl_server(dl_se)) { + update_stats_dequeue_dl(&rq->dl, dl_se, 0); + dequeue_pushable_dl_task(rq, dl_task_of(dl_se)); + } - if (!is_leftmost(curr, &rq->dl)) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { + if (dl_server(dl_se)) + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + else + enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH); + } + + if (!is_leftmost(dl_se, &rq->dl)) resched_curr(rq); } @@ -1396,20 +1408,82 @@ throttle: } } +void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) +{ + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); +} + +void dl_server_start(struct sched_dl_entity *dl_se) +{ + if (!dl_server(dl_se)) { + dl_se->dl_server = 1; + setup_new_dl_entity(dl_se); + } + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); +} + +void dl_server_stop(struct sched_dl_entity *dl_se) +{ + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); +} + +void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick) +{ + dl_se->rq = rq; + dl_se->server_has_tasks = has_tasks; + dl_se->server_pick = pick; +} + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_dl_entity *dl_se = &curr->dl; + s64 delta_exec; + + if (!dl_task(curr) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. + */ + delta_exec = update_curr_common(rq); + update_curr_dl_se(rq, dl_se, delta_exec); +} + static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) { struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, inactive_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p = NULL; struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(p, &rf); + if (!dl_server(dl_se)) { + p = dl_task_of(dl_se); + rq = task_rq_lock(p, &rf); + } else { + rq = dl_se->rq; + rq_lock(rq, &rf); + } sched_clock_tick(); update_rq_clock(rq); + if (dl_server(dl_se)) + goto no_task; + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); @@ -1422,23 +1496,30 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); - __dl_clear_params(p); + __dl_clear_params(dl_se); goto unlock; } + +no_task: if (dl_se->dl_non_contending == 0) goto unlock; sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: - task_rq_unlock(rq, p, &rf); - put_task_struct(p); + + if (!dl_server(dl_se)) { + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + } else { + rq_unlock(rq, &rf); + } return HRTIMER_NORESTART; } -void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; @@ -1496,29 +1577,22 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; u64 deadline = dl_se->deadline; - WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); - inc_dl_migration(dl_se, dl_rq); } static inline void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; - - WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); - dec_dl_migration(dl_se, dl_rq); } static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) @@ -1635,6 +1709,41 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); /* + * Check if a constrained deadline task was activated + * after the deadline but before the next period. + * If that is the case, the task will be throttled and + * the replenishment timer will be set to the next period. + */ + if (!dl_se->dl_throttled && !dl_is_implicit(dl_se)) + dl_check_constrained_dl(dl_se); + + if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + add_rq_bw(dl_se, dl_rq); + add_running_bw(dl_se, dl_rq); + } + + /* + * If p is throttled, we do not enqueue it. In fact, if it exhausted + * its budget it needs a replenishment and, since it now is on + * its rq, the bandwidth timer callback (which clearly has not + * run yet) will take care of this. + * However, the active utilization does not depend on the fact + * that the task is on the runqueue or not (but depends on the + * task's state - in GRUB parlance, "inactive" vs "active contending"). + * In other words, even if a task is throttled its utilization must + * be counted in the active utilization; hence, we need to call + * add_running_bw(). + */ + if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(dl_se, flags); + + return; + } + + /* * If this is a wakeup or a new instance, the scheduling * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. @@ -1645,17 +1754,35 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && - dl_time_before(dl_se->deadline, - rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { + dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { setup_new_dl_entity(dl_se); } __enqueue_dl_entity(dl_se); } -static void dequeue_dl_entity(struct sched_dl_entity *dl_se) +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) { __dequeue_dl_entity(dl_se); + + if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + sub_running_bw(dl_se, dl_rq); + sub_rq_bw(dl_se, dl_rq); + } + + /* + * This check allows to start the inactive timer (or to immediately + * decrease the active utilization, if needed) in two cases: + * when the task blocks and when it is terminating + * (p->state == TASK_DEAD). We can handle the two cases in the same + * way, because from GRUB's point of view the same thing is happening + * (the task moves from "active contending" to "active non contending" + * or "inactive") + */ + if (flags & DEQUEUE_SLEEP) + task_non_contending(dl_se); } static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -1700,76 +1827,31 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - /* - * Check if a constrained deadline task was activated - * after the deadline but before the next period. - * If that is the case, the task will be throttled and - * the replenishment timer will be set to the next period. - */ - if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) - dl_check_constrained_dl(&p->dl); - - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(&p->dl, &rq->dl); - add_running_bw(&p->dl, &rq->dl); - } - - /* - * If p is throttled, we do not enqueue it. In fact, if it exhausted - * its budget it needs a replenishment and, since it now is on - * its rq, the bandwidth timer callback (which clearly has not - * run yet) will take care of this. - * However, the active utilization does not depend on the fact - * that the task is on the runqueue or not (but depends on the - * task's state - in GRUB parlance, "inactive" vs "active contending"). - * In other words, even if a task is throttled its utilization must - * be counted in the active utilization; hence, we need to call - * add_running_bw(). - */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { - if (flags & ENQUEUE_WAKEUP) - task_contending(&p->dl, flags); - - return; - } - check_schedstat_required(); update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl); + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= ENQUEUE_MIGRATING; + enqueue_dl_entity(&p->dl, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) - enqueue_pushable_dl_task(rq, p); -} + if (dl_server(&p->dl)) + return; -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) -{ - update_stats_dequeue_dl(&rq->dl, &p->dl, flags); - dequeue_dl_entity(&p->dl); - dequeue_pushable_dl_task(rq, p); + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); } static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); - __dequeue_task_dl(rq, p, flags); - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(&p->dl, &rq->dl); - sub_rq_bw(&p->dl, &rq->dl); - } + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= DEQUEUE_MIGRATING; - /* - * This check allows to start the inactive timer (or to immediately - * decrease the active utilization, if needed) in two cases: - * when the task blocks and when it is terminating - * (p->state == TASK_DEAD). We can handle the two cases in the same - * way, because from GRUB's point of view the same thing is happening - * (the task moves from "active contending" to "active non contending" - * or "inactive") - */ - if (flags & DEQUEUE_SLEEP) - task_non_contending(p); + dequeue_dl_entity(&p->dl, flags); + if (!p->dl.dl_throttled && !dl_server(&p->dl)) + dequeue_pushable_dl_task(rq, p); } /* @@ -1939,7 +2021,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * Only called when both the current and waking task are -deadline * tasks. */ -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { @@ -1959,12 +2041,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SCHED_HRTICK -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, dl_se->runtime); } #else /* !CONFIG_SCHED_HRTICK */ -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } #endif @@ -1984,9 +2066,6 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled_dl(rq)) - start_hrtick_dl(rq, p); - if (rq->curr->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); @@ -2009,12 +2088,25 @@ static struct task_struct *pick_task_dl(struct rq *rq) struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; +again: if (!sched_dl_runnable(rq)) return NULL; dl_se = pick_next_dl_entity(dl_rq); WARN_ON_ONCE(!dl_se); - p = dl_task_of(dl_se); + + if (dl_server(dl_se)) { + p = dl_se->server_pick(dl_se); + if (!p) { + WARN_ON_ONCE(1); + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + goto again; + } + p->dl_server = dl_se; + } else { + p = dl_task_of(dl_se); + } return p; } @@ -2024,9 +2116,15 @@ static struct task_struct *pick_next_task_dl(struct rq *rq) struct task_struct *p; p = pick_task_dl(rq); - if (p) + if (!p) + return p; + + if (!p->dl_server) set_next_task_dl(rq, p, true); + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, &p->dl); + return p; } @@ -2064,8 +2162,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * be set and schedule() will start a new hrtick for the next task. */ if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && - is_leftmost(p, &rq->dl)) - start_hrtick_dl(rq, p); + is_leftmost(&p->dl, &rq->dl)) + start_hrtick_dl(rq, &p->dl); } static void task_fork_dl(struct task_struct *p) @@ -2291,9 +2389,6 @@ static int push_dl_task(struct rq *rq) struct rq *later_rq; int ret = 0; - if (!rq->dl.overloaded) - return 0; - next_task = pick_next_pushable_dl_task(rq); if (!next_task) return 0; @@ -2449,9 +2544,11 @@ skip: double_unlock_balance(this_rq, src_rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(this_rq); stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, push_task, &src_rq->push_work); + preempt_enable(); raw_spin_rq_lock(this_rq); } } @@ -2585,7 +2682,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(p); + task_non_contending(&p->dl); /* * In case a task is setscheduled out from SCHED_DEADLINE we need to @@ -2652,7 +2749,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) deadline_queue_push_tasks(rq); #endif if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); } else { @@ -2721,7 +2818,7 @@ DEFINE_SCHED_CLASS(dl) = { .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, - .check_preempt_curr = check_preempt_curr_dl, + .wakeup_preempt = wakeup_preempt_dl, .pick_next_task = pick_next_task_dl, .put_prev_task = put_prev_task_dl, @@ -2993,10 +3090,8 @@ bool __checkparam_dl(const struct sched_attr *attr) /* * This function clears the sched_dl_entity static params. */ -void __dl_clear_params(struct task_struct *p) +static void __dl_clear_params(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; - dl_se->dl_runtime = 0; dl_se->dl_deadline = 0; dl_se->dl_period = 0; @@ -3008,12 +3103,21 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + dl_se->dl_server = 0; #ifdef CONFIG_RT_MUTEXES dl_se->pi_se = dl_se; #endif } +void init_dl_entity(struct sched_dl_entity *dl_se) +{ + RB_CLEAR_NODE(&dl_se->rb_node); + init_dl_task_timer(dl_se); + init_dl_inactive_task_timer(dl_se); + __dl_clear_params(dl_se); +} + bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4c3d0d9f3db6..8d5d98a5834d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -8,7 +8,7 @@ */ /* - * This allows printing both to /proc/sched_debug and + * This allows printing both to /sys/kernel/debug/sched/debug and * to the console */ #define SEQ_printf(m, x...) \ @@ -628,8 +628,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread; - struct sched_entity *last, *first; + s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread; + struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -644,15 +644,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); + root = __pick_root_entity(cfs_rq); + if (root) + left_vruntime = root->min_vruntime; first = __pick_first_entity(cfs_rq); if (first) - left_vruntime = first->vruntime; + left_deadline = first->deadline; last = __pick_last_entity(cfs_rq); if (last) right_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; raw_spin_rq_unlock_irqrestore(rq, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", + SPLIT_NS(left_deadline)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", SPLIT_NS(left_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", @@ -679,8 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->avg.runnable_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); - SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", - cfs_rq->avg.util_est.enqueued); + SEQ_printf(m, " .%-30s: %u\n", "util_est", + cfs_rq->avg.util_est); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", @@ -724,9 +729,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) PU(rt_nr_running); -#ifdef CONFIG_SMP - PU(rt_nr_migratory); -#endif P(rt_throttled); PN(rt_time); PN(rt_runtime); @@ -748,7 +750,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) PU(dl_nr_running); #ifdef CONFIG_SMP - PU(dl_nr_migratory); dl_bw = &cpu_rq(cpu)->rd->dl_bw; #else dl_bw = &dl_rq->dl_bw; @@ -864,7 +865,6 @@ static void sched_debug_header(struct seq_file *m) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_base_slice); - P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN #undef P @@ -1075,8 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.runnable_avg); P(se.avg.util_avg); P(se.avg.last_update_time); - P(se.avg.util_est.ewma); - PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED); + PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED); #endif #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8dbff6e7ad4f..533547e3c90a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,8 +51,6 @@ #include <asm/switch_to.h> -#include <linux/sched/cond_resched.h> - #include "sched.h" #include "stats.h" #include "autogroup.h" @@ -78,12 +76,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; unsigned int sysctl_sched_base_slice = 750000ULL; static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; -/* - * After fork, child runs first. If set to 0 (default) then - * parent will (try to) run first. - */ -unsigned int sysctl_sched_child_runs_first __read_mostly; - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; int sched_thermal_decay_shift; @@ -145,13 +137,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", @@ -566,7 +551,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) static inline bool entity_before(const struct sched_entity *a, const struct sched_entity *b) { - return (s64)(a->vruntime - b->vruntime) < 0; + /* + * Tiebreak on vruntime seems unnecessary since it can + * hardly happen. + */ + return (s64)(a->deadline - b->deadline) < 0; } static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -664,6 +653,10 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; } +/* + * Specifically: avg_runtime() + 0 must result in entity_eligible() := true + * For this to be so, the result of this function must have a left bias. + */ u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; @@ -677,8 +670,12 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) load += weight; } - if (load) + if (load) { + /* sign flips effective floor / ceil */ + if (avg < 0) + avg -= (load - 1); avg = div_s64(avg, load); + } return cfs_rq->min_vruntime + avg; } @@ -727,7 +724,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * Note: using 'avg_vruntime() > se->vruntime' is inacurate due * to the loss in precision caused by the division. */ -int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; s64 avg = cfs_rq->avg_vruntime; @@ -740,7 +737,12 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) load += weight; } - return avg >= entity_key(cfs_rq, se) * load; + return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; +} + +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return vruntime_eligible(cfs_rq, se->vruntime); } static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) @@ -759,9 +761,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) static void update_min_vruntime(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_first_entity(cfs_rq); + struct sched_entity *se = __pick_root_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; - u64 vruntime = cfs_rq->min_vruntime; if (curr) { @@ -773,9 +774,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) if (se) { if (!curr) - vruntime = se->vruntime; + vruntime = se->min_vruntime; else - vruntime = min_vruntime(vruntime, se->vruntime); + vruntime = min_vruntime(vruntime, se->min_vruntime); } /* ensure we never gain time by being placed backwards. */ @@ -788,34 +789,34 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } -#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) +#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) -static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node) +static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) { if (node) { struct sched_entity *rse = __node_2_se(node); - if (deadline_gt(min_deadline, se, rse)) - se->min_deadline = rse->min_deadline; + if (vruntime_gt(min_vruntime, se, rse)) + se->min_vruntime = rse->min_vruntime; } } /* - * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) + * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ -static inline bool min_deadline_update(struct sched_entity *se, bool exit) +static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { - u64 old_min_deadline = se->min_deadline; + u64 old_min_vruntime = se->min_vruntime; struct rb_node *node = &se->run_node; - se->min_deadline = se->deadline; - __update_min_deadline(se, node->rb_right); - __update_min_deadline(se, node->rb_left); + se->min_vruntime = se->vruntime; + __min_vruntime_update(se, node->rb_right); + __min_vruntime_update(se, node->rb_left); - return se->min_deadline == old_min_deadline; + return se->min_vruntime == old_min_vruntime; } -RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, - run_node, min_deadline, min_deadline_update); +RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, + run_node, min_vruntime, min_vruntime_update); /* * Enqueue an entity into the rb-tree: @@ -823,18 +824,28 @@ RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { avg_vruntime_add(cfs_rq, se); - se->min_deadline = se->deadline; + se->min_vruntime = se->vruntime; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, - __entity_less, &min_deadline_cb); + __entity_less, &min_vruntime_cb); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, - &min_deadline_cb); + &min_vruntime_cb); avg_vruntime_sub(cfs_rq, se); } +struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node; + + if (!root) + return NULL; + + return __node_2_se(root); +} + struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); @@ -857,19 +868,27 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) * with the earliest virtual deadline. * * We can do this in O(log n) time due to an augmented RB-tree. The - * tree keeps the entries sorted on service, but also functions as a - * heap based on the deadline by keeping: + * tree keeps the entries sorted on deadline, but also functions as a + * heap based on the vruntime by keeping: * - * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline) + * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime) * - * Which allows an EDF like search on (sub)trees. + * Which allows tree pruning through eligibility. */ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; struct sched_entity *best = NULL; + /* + * We can safely skip eligibility check if there is only one entity + * in this cfs_rq, saving some cycles. + */ + if (cfs_rq->nr_running == 1) + return curr && curr->on_rq ? curr : se; + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; @@ -880,52 +899,44 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) return curr; + /* Pick the leftmost entity if it's eligible */ + if (se && entity_eligible(cfs_rq, se)) { + best = se; + goto found; + } + + /* Heap search for the EEVD entity */ while (node) { - struct sched_entity *se = __node_2_se(node); + struct rb_node *left = node->rb_left; /* - * If this entity is not eligible, try the left subtree. + * Eligible entities in left subtree are always better + * choices, since they have earlier deadlines. */ - if (!entity_eligible(cfs_rq, se)) { - node = node->rb_left; + if (left && vruntime_eligible(cfs_rq, + __node_2_se(left)->min_vruntime)) { + node = left; continue; } - /* - * If this entity has an earlier deadline than the previous - * best, take this one. If it also has the earliest deadline - * of its subtree, we're done. - */ - if (!best || deadline_gt(deadline, best, se)) { - best = se; - if (best->deadline == best->min_deadline) - break; - } + se = __node_2_se(node); /* - * If the earlest deadline in this subtree is in the fully - * eligible left half of our space, go there. + * The left subtree either is empty or has no eligible + * entity, so check the current node since it is the one + * with earliest deadline that might be eligible. */ - if (node->rb_left && - __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { - node = node->rb_left; - continue; + if (entity_eligible(cfs_rq, se)) { + best = se; + break; } node = node->rb_right; } - - if (!best || (curr && deadline_gt(deadline, best, curr))) +found: + if (!best || (curr && entity_before(curr, best))) best = curr; - if (unlikely(!best)) { - struct sched_entity *left = __pick_first_entity(cfs_rq); - if (left) { - pr_err("EEVDF scheduling fail, picking leftmost\n"); - return left; - } - } - return best; } @@ -1092,23 +1103,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ -/* - * Update the current task's runtime statistics. - */ -static void update_curr(struct cfs_rq *cfs_rq) +static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) { - struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_clock_task(rq_of(cfs_rq)); - u64 delta_exec; - - if (unlikely(!curr)) - return; + u64 now = rq_clock_task(rq); + s64 delta_exec; delta_exec = now - curr->exec_start; - if (unlikely((s64)delta_exec <= 0)) - return; + if (unlikely(delta_exec <= 0)) + return delta_exec; curr->exec_start = now; + curr->sum_exec_runtime += delta_exec; if (schedstat_enabled()) { struct sched_statistics *stats; @@ -1118,20 +1123,54 @@ static void update_curr(struct cfs_rq *cfs_rq) max(delta_exec, stats->exec_max)); } - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq->exec_clock, delta_exec); + return delta_exec; +} + +static inline void update_curr_task(struct task_struct *p, s64 delta_exec) +{ + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); + cgroup_account_cputime(p, delta_exec); + if (p->dl_server) + dl_server_update(p->dl_server, delta_exec); +} + +/* + * Used by other classes to account runtime. + */ +s64 update_curr_common(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + s64 delta_exec; + + delta_exec = update_curr_se(rq, &curr->se); + if (likely(delta_exec > 0)) + update_curr_task(curr, delta_exec); + + return delta_exec; +} + +/* + * Update the current task's runtime statistics. + */ +static void update_curr(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + s64 delta_exec; + + if (unlikely(!curr)) + return; + + delta_exec = update_curr_se(rq_of(cfs_rq), curr); + if (unlikely(delta_exec <= 0)) + return; curr->vruntime += calc_delta_fair(delta_exec, curr); update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); - if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); - - trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cgroup_account_cputime(curtask, delta_exec); - account_group_exec_runtime(curtask, delta_exec); - } + if (entity_is_task(curr)) + update_curr_task(task_of(curr), delta_exec); account_cfs_rq_runtime(cfs_rq, delta_exec); } @@ -1722,12 +1761,12 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat) * The smaller the hint page fault latency, the higher the possibility * for the page to be hot. */ -static int numa_hint_fault_latency(struct page *page) +static int numa_hint_fault_latency(struct folio *folio) { int last_time, time; time = jiffies_to_msecs(jiffies); - last_time = xchg_page_access_time(page, time); + last_time = folio_xchg_access_time(folio, time); return (time - last_time) & PAGE_ACCESS_TIME_MASK; } @@ -1784,7 +1823,7 @@ static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, } } -bool should_numa_migrate_memory(struct task_struct *p, struct page * page, +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int src_nid, int dst_cpu) { struct numa_group *ng = deref_curr_numa_group(p); @@ -1814,16 +1853,16 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); th = pgdat->nbp_threshold ? : def_th; - latency = numa_hint_fault_latency(page); + latency = numa_hint_fault_latency(folio); if (latency >= th) return false; return !numa_promotion_rate_limit(pgdat, rate_limit, - thp_nr_pages(page)); + folio_nr_pages(folio)); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid); if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) @@ -2847,19 +2886,7 @@ static void task_numa_placement(struct task_struct *p) } /* Cannot migrate task to CPU-less node */ - if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) { - int near_nid = max_nid; - int distance, near_distance = INT_MAX; - - for_each_node_state(nid, N_CPU) { - distance = node_distance(max_nid, nid); - if (distance < near_distance) { - near_nid = nid; - near_distance = distance; - } - } - max_nid = near_nid; - } + max_nid = numa_nearest_node(max_nid, N_CPU); if (ng) { numa_group_count_active_nodes(ng); @@ -3130,7 +3157,7 @@ static void reset_ptenuma_scan(struct task_struct *p) p->mm->numa_scan_offset = 0; } -static bool vma_is_accessed(struct vm_area_struct *vma) +static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long pids; /* @@ -3139,11 +3166,23 @@ static bool vma_is_accessed(struct vm_area_struct *vma) * This is also done to avoid any side effect of task scanning * amplifying the unfairness of disjoint set of VMAs' access. */ - if (READ_ONCE(current->mm->numa_scan_seq) < 2) + if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2) return true; - pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1]; - return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids); + pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; + if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) + return true; + + /* + * Complete a scan that has already started regardless of PID access, or + * some VMAs may never be scanned in multi-threaded applications: + */ + if (mm->numa_scan_offset > vma->vm_start) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); + return true; + } + + return false; } #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) @@ -3163,6 +3202,8 @@ static void task_numa_work(struct callback_head *work) unsigned long nr_pte_updates = 0; long pages, virtpages; struct vma_iterator vmi; + bool vma_pids_skipped; + bool vma_pids_forced = false; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -3205,7 +3246,6 @@ static void task_numa_work(struct callback_head *work) */ p->node_stamp += 2 * TICK_NSEC; - start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ virtpages = pages * 8; /* Scan up to this much virtual space */ @@ -3215,6 +3255,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; + + /* + * VMAs are skipped if the current PID has not trapped a fault within + * the VMA recently. Allow scanning to be forced if there is no + * suitable VMA remaining. + */ + vma_pids_skipped = false; + +retry_pids: + start = mm->numa_scan_offset; vma_iter_init(&vmi, mm, start); vma = vma_next(&vmi); if (!vma) { @@ -3227,6 +3277,7 @@ static void task_numa_work(struct callback_head *work) do { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); continue; } @@ -3237,15 +3288,19 @@ static void task_numa_work(struct callback_head *work) * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); continue; + } /* * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!vma_is_accessible(vma)) + if (!vma_is_accessible(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); continue; + } /* Initialise new per-VMA NUMAB state. */ if (!vma->numab_state) { @@ -3254,12 +3309,21 @@ static void task_numa_work(struct callback_head *work) if (!vma->numab_state) continue; + vma->numab_state->start_scan_seq = mm->numa_scan_seq; + vma->numab_state->next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); /* Reset happens after 4 times scan delay of scan start */ - vma->numab_state->next_pid_reset = vma->numab_state->next_scan + + vma->numab_state->pids_active_reset = vma->numab_state->next_scan + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + + /* + * Ensure prev_scan_seq does not match numa_scan_seq, + * to prevent VMAs being skipped prematurely on the + * first scan: + */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1; } /* @@ -3267,23 +3331,35 @@ static void task_numa_work(struct callback_head *work) * delay the scan for new VMAs. */ if (mm->numa_scan_seq && time_before(jiffies, - vma->numab_state->next_scan)) + vma->numab_state->next_scan)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); continue; + } + + /* RESET access PIDs regularly for old VMAs. */ + if (mm->numa_scan_seq && + time_after(jiffies, vma->numab_state->pids_active_reset)) { + vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset + + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]); + vma->numab_state->pids_active[1] = 0; + } - /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(vma)) + /* Do not rescan VMAs twice within the same sequence. */ + if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) { + mm->numa_scan_offset = vma->vm_end; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); continue; + } /* - * RESET access PIDs regularly for old VMAs. Resetting after checking - * vma for recent access to avoid clearing PID info before access.. + * Do not scan the VMA if task has not accessed it, unless no other + * VMA candidate exists. */ - if (mm->numa_scan_seq && - time_after(jiffies, vma->numab_state->next_pid_reset)) { - vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset + - msecs_to_jiffies(VMA_PID_RESET_PERIOD); - vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]); - vma->numab_state->access_pids[1] = 0; + if (!vma_pids_forced && !vma_is_accessed(mm, vma)) { + vma_pids_skipped = true; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); + continue; } do { @@ -3310,8 +3386,28 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); + + /* VMA scan is complete, do not scan until next sequence. */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq; + + /* + * Only force scan within one VMA at a time, to limit the + * cost of scanning a potentially uninteresting VMA. + */ + if (vma_pids_forced) + break; } for_each_vma(vmi, vma); + /* + * If no VMAs are remaining and VMAs were skipped due to the PID + * not accessing the VMA previously, then force a scan to ensure + * forward progress: + */ + if (!vma && !vma_pids_forced && vma_pids_skipped) { + vma_pids_forced = true; + goto retry_pids; + } + out: /* * It is possible to reach the end of the VMA list but the last few @@ -3574,39 +3670,140 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif +static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + unsigned long old_weight = se->load.weight; + u64 avruntime = avg_vruntime(cfs_rq); + s64 vlag, vslice; + + /* + * VRUNTIME + * ======== + * + * COROLLARY #1: The virtual runtime of the entity needs to be + * adjusted if re-weight at !0-lag point. + * + * Proof: For contradiction assume this is not true, so we can + * re-weight without changing vruntime at !0-lag point. + * + * Weight VRuntime Avg-VRuntime + * before w v V + * after w' v' V' + * + * Since lag needs to be preserved through re-weight: + * + * lag = (V - v)*w = (V'- v')*w', where v = v' + * ==> V' = (V - v)*w/w' + v (1) + * + * Let W be the total weight of the entities before reweight, + * since V' is the new weighted average of entities: + * + * V' = (WV + w'v - wv) / (W + w' - w) (2) + * + * by using (1) & (2) we obtain: + * + * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v + * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v + * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v + * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) + * + * Since we are doing at !0-lag point which means V != v, we + * can simplify (3): + * + * ==> W / (W + w' - w) = w / w' + * ==> Ww' = Ww + ww' - ww + * ==> W * (w' - w) = w * (w' - w) + * ==> W = w (re-weight indicates w' != w) + * + * So the cfs_rq contains only one entity, hence vruntime of + * the entity @v should always equal to the cfs_rq's weighted + * average vruntime @V, which means we will always re-weight + * at 0-lag point, thus breach assumption. Proof completed. + * + * + * COROLLARY #2: Re-weight does NOT affect weighted average + * vruntime of all the entities. + * + * Proof: According to corollary #1, Eq. (1) should be: + * + * (V - v)*w = (V' - v')*w' + * ==> v' = V' - (V - v)*w/w' (4) + * + * According to the weighted average formula, we have: + * + * V' = (WV - wv + w'v') / (W - w + w') + * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') + * = (WV - wv + w'V' - Vw + wv) / (W - w + w') + * = (WV + w'V' - Vw) / (W - w + w') + * + * ==> V'*(W - w + w') = WV + w'V' - Vw + * ==> V' * (W - w) = (W - w) * V (5) + * + * If the entity is the only one in the cfs_rq, then reweight + * always occurs at 0-lag point, so V won't change. Or else + * there are other entities, hence W != w, then Eq. (5) turns + * into V' = V. So V won't change in either case, proof done. + * + * + * So according to corollary #1 & #2, the effect of re-weight + * on vruntime should be: + * + * v' = V' - (V - v) * w / w' (4) + * = V - (V - v) * w / w' + * = V - vl * w / w' + * = V - vl' + */ + if (avruntime != se->vruntime) { + vlag = (s64)(avruntime - se->vruntime); + vlag = div_s64(vlag * old_weight, weight); + se->vruntime = avruntime - vlag; + } + + /* + * DEADLINE + * ======== + * + * When the weight changes, the virtual time slope changes and + * we should adjust the relative virtual deadline accordingly. + * + * d' = v' + (d - v)*w/w' + * = V' - (V - v)*w/w' + (d - v)*w/w' + * = V - (V - v)*w/w' + (d - v)*w/w' + * = V + (d - V)*w/w' + */ + vslice = (s64)(se->deadline - avruntime); + vslice = div_s64(vslice * old_weight, weight); + se->deadline = avruntime + vslice; +} + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { - unsigned long old_weight = se->load.weight; + bool curr = cfs_rq->curr == se; if (se->on_rq) { /* commit outstanding execution time */ - if (cfs_rq->curr == se) + if (curr) update_curr(cfs_rq); else - avg_vruntime_sub(cfs_rq, se); + __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); - update_load_set(&se->load, weight); - if (!se->on_rq) { /* * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), * we need to scale se->vlag when w_i changes. */ - se->vlag = div_s64(se->vlag * old_weight, weight); + se->vlag = div_s64(se->vlag * se->load.weight, weight); } else { - s64 deadline = se->deadline - se->vruntime; - /* - * When the weight changes, the virtual time slope changes and - * we should adjust the relative virtual deadline accordingly. - */ - deadline = div_s64(deadline * old_weight, weight); - se->deadline = se->vruntime + deadline; + reweight_eevdf(cfs_rq, se, weight); } + update_load_set(&se->load, weight); + #ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); @@ -3618,8 +3815,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); - if (cfs_rq->curr != se) - avg_vruntime_add(cfs_rq, se); + if (!curr) + __enqueue_entity(cfs_rq, se); + + /* + * The entity's vruntime has been adjusted, so let's check + * whether the rq-wide min_vruntime needs updated too. Since + * the calculations above require stable min_vruntime rather + * than up-to-date one, we do the update at the end of the + * reweight process. + */ + update_min_vruntime(cfs_rq); } } @@ -3763,14 +3969,11 @@ static void update_cfs_group(struct sched_entity *se) #ifndef CONFIG_SMP shares = READ_ONCE(gcfs_rq->tg->shares); - - if (likely(se->load.weight == shares)) - return; #else - shares = calc_group_shares(gcfs_rq); + shares = calc_group_shares(gcfs_rq); #endif - - reweight_entity(cfs_rq_of(se), se, shares); + if (unlikely(se->load.weight != shares)) + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -3888,7 +4091,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { - long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + long delta; + u64 now; /* * No need to update load_avg for root_task_group as it is not used. @@ -3896,10 +4100,67 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) if (cfs_rq->tg == &root_task_group) return; + /* rq has been offline and doesn't contribute to the share anymore: */ + if (!cpu_active(cpu_of(rq_of(cfs_rq)))) + return; + + /* + * For migration heavy workloads, access to tg->load_avg can be + * unbound. Limit the update rate to at most once per ms. + */ + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC) + return; + + delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; + cfs_rq->last_update_tg_load_avg = now; + } +} + +static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq) +{ + long delta; + u64 now; + + /* + * No need to update load_avg for root_task_group, as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + delta = 0 - cfs_rq->tg_load_avg_contrib; + atomic_long_add(delta, &cfs_rq->tg->load_avg); + cfs_rq->tg_load_avg_contrib = 0; + cfs_rq->last_update_tg_load_avg = now; +} + +/* CPU offline callback: */ +static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) +{ + struct task_group *tg; + + lockdep_assert_rq_held(rq); + + /* + * The rq clock has already been updated in + * set_rq_offline(), so we should skip updating + * the rq clock again in unthrottle_cfs_rq(). + */ + rq_clock_start_loop_update(rq); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + clear_tg_load_avg(cfs_rq); } + rcu_read_unlock(); + + rq_clock_stop_loop_update(rq); } /* @@ -4198,6 +4459,8 @@ static inline bool skip_blocked_update(struct sched_entity *se) static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} +static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {} + static inline int propagate_entity_load_avg(struct sched_entity *se) { return 0; @@ -4560,33 +4823,20 @@ static inline unsigned long task_util(struct task_struct *p) return READ_ONCE(p->se.avg.util_avg); } -static inline unsigned long _task_util_est(struct task_struct *p) +static inline unsigned long task_runnable(struct task_struct *p) { - struct util_est ue = READ_ONCE(p->se.avg.util_est); - - return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)); + return READ_ONCE(p->se.avg.runnable_avg); } -static inline unsigned long task_util_est(struct task_struct *p) +static inline unsigned long _task_util_est(struct task_struct *p) { - return max(task_util(p), _task_util_est(p)); + return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED; } -#ifdef CONFIG_UCLAMP_TASK -static inline unsigned long uclamp_task_util(struct task_struct *p, - unsigned long uclamp_min, - unsigned long uclamp_max) -{ - return clamp(task_util_est(p), uclamp_min, uclamp_max); -} -#else -static inline unsigned long uclamp_task_util(struct task_struct *p, - unsigned long uclamp_min, - unsigned long uclamp_max) +static inline unsigned long task_util_est(struct task_struct *p) { - return task_util_est(p); + return max(task_util(p), _task_util_est(p)); } -#endif static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) @@ -4597,9 +4847,9 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, return; /* Update root cfs_rq's estimated utilization */ - enqueued = cfs_rq->avg.util_est.enqueued; + enqueued = cfs_rq->avg.util_est; enqueued += _task_util_est(p); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); trace_sched_util_est_cfs_tp(cfs_rq); } @@ -4613,34 +4863,20 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq, return; /* Update root cfs_rq's estimated utilization */ - enqueued = cfs_rq->avg.util_est.enqueued; + enqueued = cfs_rq->avg.util_est; enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); trace_sched_util_est_cfs_tp(cfs_rq); } #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) -/* - * Check if a (signed) value is within a specified (unsigned) margin, - * based on the observation that: - * - * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) - * - * NOTE: this only works when value + margin < INT_MAX. - */ -static inline bool within_margin(int value, int margin) -{ - return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); -} - static inline void util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) { - long last_ewma_diff, last_enqueued_diff; - struct util_est ue; + unsigned int ewma, dequeued, last_ewma_diff; if (!sched_feat(UTIL_EST)) return; @@ -4652,71 +4888,73 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, if (!task_sleep) return; + /* Get current estimate of utilization */ + ewma = READ_ONCE(p->se.avg.util_est); + /* * If the PELT values haven't changed since enqueue time, * skip the util_est update. */ - ue = p->se.avg.util_est; - if (ue.enqueued & UTIL_AVG_UNCHANGED) + if (ewma & UTIL_AVG_UNCHANGED) return; - last_enqueued_diff = ue.enqueued; + /* Get utilization at dequeue */ + dequeued = task_util(p); /* * Reset EWMA on utilization increases, the moving average is used only * to smooth utilization decreases. */ - ue.enqueued = task_util(p); - if (sched_feat(UTIL_EST_FASTUP)) { - if (ue.ewma < ue.enqueued) { - ue.ewma = ue.enqueued; - goto done; - } + if (ewma <= dequeued) { + ewma = dequeued; + goto done; } /* * Skip update of task's estimated utilization when its members are * already ~1% close to its last activation value. */ - last_ewma_diff = ue.enqueued - ue.ewma; - last_enqueued_diff -= ue.enqueued; - if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) { - if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN)) - goto done; - - return; - } + last_ewma_diff = ewma - dequeued; + if (last_ewma_diff < UTIL_EST_MARGIN) + goto done; /* * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) + if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) return; /* + * To avoid underestimate of task utilization, skip updates of EWMA if + * we cannot grant that thread got all CPU time it wanted. + */ + if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) + goto done; + + + /* * Update Task's estimated utilization * * When *p completes an activation we can consolidate another sample - * of the task size. This is done by storing the current PELT value - * as ue.enqueued and by using this value to update the Exponential - * Weighted Moving Average (EWMA): + * of the task size. This is done by using this value to update the + * Exponential Weighted Moving Average (EWMA): * * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) - * = w * ( last_ewma_diff ) + ewma(t-1) - * = w * (last_ewma_diff + ewma(t-1) / w) + * = w * ( -last_ewma_diff ) + ewma(t-1) + * = w * (-last_ewma_diff + ewma(t-1) / w) * * Where 'w' is the weight of new samples, which is configured to be * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) */ - ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; - ue.ewma += last_ewma_diff; - ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; + ewma <<= UTIL_EST_WEIGHT_SHIFT; + ewma -= last_ewma_diff; + ewma >>= UTIL_EST_WEIGHT_SHIFT; done: - ue.enqueued |= UTIL_AVG_UNCHANGED; - WRITE_ONCE(p->se.avg.util_est, ue); + ewma |= UTIL_AVG_UNCHANGED; + WRITE_ONCE(p->se.avg.util_est, ewma); trace_sched_util_est_se_tp(&p->se); } @@ -4739,14 +4977,14 @@ static inline int util_fits_cpu(unsigned long util, return fits; /* - * We must use capacity_orig_of() for comparing against uclamp_min and + * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and * uclamp_max. We only care about capacity pressure (by using * capacity_of()) for comparing against the real util. * * If a task is boosted to 1024 for example, we don't want a tiny * pressure to skew the check whether it fits a CPU or not. * - * Similarly if a task is capped to capacity_orig_of(little_cpu), it + * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it * should fit a little cpu even if there's some pressure. * * Only exception is for thermal pressure since it has a direct impact @@ -4758,7 +4996,7 @@ static inline int util_fits_cpu(unsigned long util, * For uclamp_max, we can tolerate a drop in performance level as the * goal is to cap the task. So it's okay if it's getting less. */ - capacity_orig = capacity_orig_of(cpu); + capacity_orig = arch_scale_cpu_capacity(cpu); capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* @@ -4878,7 +5116,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { - return true; + return !cfs_rq->nr_running; } #define UPDATE_TG 0x0 @@ -4919,10 +5157,12 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vslice = calc_delta_fair(se->slice, se); - u64 vruntime = avg_vruntime(cfs_rq); + u64 vslice, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; + se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); + /* * Due to how V is constructed as the weighted average of entities, * adding tasks with positive lag, or removing tasks with negative lag @@ -5211,7 +5451,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * -pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +pick_next_entity(struct cfs_rq *cfs_rq) { /* * Enabling NEXT_BUDDY will affect latency but not fairness. @@ -5755,13 +5995,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { - struct cfs_rq *local_unthrottle = NULL; int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; bool throttled = false; - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *tmp; struct rq_flags rf; struct rq *rq; + LIST_HEAD(local_unthrottle); rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -5777,11 +6017,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) if (!cfs_rq_throttled(cfs_rq)) goto next; -#ifdef CONFIG_SMP /* Already queued for async unthrottle */ if (!list_empty(&cfs_rq->throttled_csd_list)) goto next; -#endif /* By the above checks, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); @@ -5798,11 +6036,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) { - if (cpu_of(rq) != this_cpu || - SCHED_WARN_ON(local_unthrottle)) + if (cpu_of(rq) != this_cpu) { unthrottle_cfs_rq_async(cfs_rq); - else - local_unthrottle = cfs_rq; + } else { + /* + * We currently only expect to be unthrottling + * a single cfs_rq locally. + */ + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + list_add_tail(&cfs_rq->throttled_csd_list, + &local_unthrottle); + } } else { throttled = true; } @@ -5810,15 +6054,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) next: rq_unlock_irqrestore(rq, &rf); } - rcu_read_unlock(); - if (local_unthrottle) { - rq = cpu_rq(this_cpu); + list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle, + throttled_csd_list) { + struct rq *rq = rq_of(cfs_rq); + rq_lock_irqsave(rq, &rf); - if (cfs_rq_throttled(local_unthrottle)) - unthrottle_cfs_rq(local_unthrottle); + + list_del_init(&cfs_rq->throttled_csd_list); + + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + rq_unlock_irqrestore(rq, &rf); } + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + + rcu_read_unlock(); return throttled; } @@ -6148,9 +6400,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); -#ifdef CONFIG_SMP INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); -#endif } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -6619,6 +6869,7 @@ dequeue_throttle: /* Working cpumask for: load_balance, load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); +static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); #ifdef CONFIG_NO_HZ_COMMON @@ -7107,45 +7358,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; struct sched_domain_shared *sd_share; - struct rq *this_rq = this_rq(); - int this = smp_processor_id(); - struct sched_domain *this_sd = NULL; - u64 time = 0; cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP) && !has_idle_core) { - u64 avg_cost, avg_idle, span_avg; - unsigned long now = jiffies; - - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; - - /* - * If we're busy, the assumption that the last idle period - * predicts the future is flawed; age away the remaining - * predicted idle time. - */ - if (unlikely(this_rq->wake_stamp < now)) { - while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) { - this_rq->wake_stamp++; - this_rq->wake_avg_idle >>= 1; - } - } - - avg_idle = this_rq->wake_avg_idle; - avg_cost = this_sd->avg_scan_cost + 1; - - span_avg = sd->span_weight * avg_idle; - if (span_avg > 4*avg_cost) - nr = div_u64(span_avg, avg_cost); - else - nr = 4; - - time = cpu_clock(this); - } - if (sched_feat(SIS_UTIL)) { sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); if (sd_share) { @@ -7157,6 +7372,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } } + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + + if (sg->flags & SD_CLUSTER) { + for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_group_span(sg)); + } + } + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -7164,7 +7403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return i; } else { - if (!--nr) + if (--nr <= 0) return -1; idle_cpu = __select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) @@ -7175,18 +7414,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (has_idle_core) set_idle_cores(target, false); - if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { - time = cpu_clock(this) - time; - - /* - * Account for the scan cost of wakeups against the average - * idle time. - */ - this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time); - - update_avg(&this_sd->avg_scan_cost, time); - } - return idle_cpu; } @@ -7226,7 +7453,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) * Look for the CPU with best capacity. */ else if (fits < 0) - cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); + cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu)); /* * First, select CPU which fits better (-1 being better than 0). @@ -7266,7 +7493,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) bool has_idle_core = false; struct sched_domain *sd; unsigned long task_util, util_min, util_max; - int i, recent_used_cpu; + int i, recent_used_cpu, prev_aff = -1; /* * On asymmetric system, update task utilization because we will check @@ -7293,8 +7520,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_cpu(task_util, util_min, util_max, prev)) - return prev; + asym_fits_cpu(task_util, util_min, util_max, prev)) { + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(prev, target)) + return prev; + + prev_aff = prev; + } /* * Allow a per-cpu kthread to stack with the wakee if the @@ -7321,7 +7554,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { - return recent_used_cpu; + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(recent_used_cpu, target)) + return recent_used_cpu; + + } else { + recent_used_cpu = -1; } /* @@ -7362,6 +7601,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster + * first. But prev_cpu or recent_used_cpu may also be a good candidate, + * use them if possible when no idle CPU found in select_idle_cpu(). + */ + if ((unsigned int)prev_aff < nr_cpumask_bits) + return prev_aff; + if ((unsigned int)recent_used_cpu < nr_cpumask_bits) + return recent_used_cpu; + return target; } @@ -7432,16 +7682,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) if (sched_feat(UTIL_EST)) { unsigned long util_est; - util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + util_est = READ_ONCE(cfs_rq->avg.util_est); /* * During wake-up @p isn't enqueued yet and doesn't contribute - * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. + * to any cpu_rq(cpu)->cfs.avg.util_est. * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p * has been enqueued. * * During exec (@dst_cpu = -1) @p is enqueued and does - * contribute to cpu_rq(cpu)->cfs.util_est.enqueued. + * contribute to cpu_rq(cpu)->cfs.util_est. * Remove it to "simulate" cpu_util without @p's contribution. * * Despite the task_on_rq_queued(@p) check there is still a @@ -7468,7 +7718,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) util = max(util, util_est); } - return min(util, capacity_orig_of(cpu)); + return min(util, arch_scale_cpu_capacity(cpu)); } unsigned long cpu_util_cfs(int cpu) @@ -7570,7 +7820,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv, for_each_cpu(cpu, pd_cpus) { unsigned long util = cpu_util(cpu, p, -1, 0); - busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); + busy_time += effective_cpu_util(cpu, util, NULL, NULL); } eenv->pd_busy_time = min(eenv->pd_cap, busy_time); @@ -7593,7 +7843,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, for_each_cpu(cpu, pd_cpus) { struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; unsigned long util = cpu_util(cpu, p, dst_cpu, 1); - unsigned long eff_util; + unsigned long eff_util, min, max; /* * Performance domain frequency: utilization clamping @@ -7602,7 +7852,23 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); + eff_util = effective_cpu_util(cpu, util, &min, &max); + + /* Task's uclamp can modify min and max value */ + if (tsk && uclamp_is_used()) { + min = max(min, uclamp_eff_value(p, UCLAMP_MIN)); + + /* + * If there is no active max uclamp constraint, + * directly use task's one, otherwise keep max. + */ + if (uclamp_rq_is_idle(cpu_rq(cpu))) + max = uclamp_eff_value(p, UCLAMP_MAX); + else + max = max(max, uclamp_eff_value(p, UCLAMP_MAX)); + } + + eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max); max_util = max(max_util, eff_util); } @@ -7620,11 +7886,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd, { unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); unsigned long busy_time = eenv->pd_busy_time; + unsigned long energy; if (dst_cpu >= 0) busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); - return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); + energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); + + trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time); + + return energy; } /* @@ -7699,7 +7970,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) target = prev_cpu; sync_entity_load_avg(&p->se); - if (!uclamp_task_util(p, p_util_min, p_util_max)) + if (!task_util_est(p) && p_util_min == 0) goto unlock; eenv_task_busy_time(&eenv, p, prev_cpu); @@ -7707,11 +7978,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long util_min = p_util_min, util_max = p_util_max; unsigned long cpu_cap, cpu_thermal_cap, util; - unsigned long cur_delta, max_spare_cap = 0; + long prev_spare_cap = -1, max_spare_cap = -1; unsigned long rq_util_min, rq_util_max; - unsigned long prev_spare_cap = 0; + unsigned long cur_delta, base_energy; int max_spare_cap_cpu = -1; - unsigned long base_energy; int fits, max_fits = -1; cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -7774,7 +8044,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) prev_spare_cap = cpu_cap; prev_fits = fits; } else if ((fits > max_fits) || - ((fits == max_fits) && (cpu_cap > max_spare_cap))) { + ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -7786,7 +8056,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } } - if (max_spare_cap_cpu < 0 && prev_spare_cap == 0) + if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) continue; eenv_pd_busy_time(&eenv, cpus, p); @@ -7794,7 +8064,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) base_energy = compute_energy(&eenv, pd, cpus, p, -1); /* Evaluate the energy impact of using prev_cpu. */ - if (prev_spare_cap > 0) { + if (prev_spare_cap > -1) { prev_delta = compute_energy(&eenv, pd, cpus, p, prev_cpu); /* CPU utilization has changed */ @@ -7995,12 +8265,11 @@ static void set_next_buddy(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int next_buddy_marked = 0; int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) @@ -8008,7 +8277,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ /* * This is possible from callers such as attach_tasks(), in which we - * unconditionally check_preempt_curr() after an enqueue (which may have + * unconditionally wakeup_preempt() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ @@ -8017,7 +8286,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { set_next_buddy(pse); - next_buddy_marked = 1; } /* @@ -8100,7 +8368,7 @@ again: goto again; } - se = pick_next_entity(cfs_rq, curr); + se = pick_next_entity(cfs_rq); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -8163,7 +8431,7 @@ again: } } - se = pick_next_entity(cfs_rq, curr); + se = pick_next_entity(cfs_rq); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -8202,7 +8470,7 @@ simple: put_prev_task(rq, prev); do { - se = pick_next_entity(cfs_rq, NULL); + se = pick_next_entity(cfs_rq); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -8850,7 +9118,7 @@ static int detach_tasks(struct lb_env *env) case migrate_util: util = task_util_est(p); - if (util > env->imbalance) + if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= util; @@ -8915,7 +9183,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) WARN_ON_ONCE(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } /* @@ -9255,8 +9523,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); - if (!capacity) capacity = 1; @@ -9332,7 +9598,7 @@ static inline int check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { return ((rq->cpu_capacity * sd->imbalance_pct) < - (rq->cpu_capacity_orig * 100)); + (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } /* @@ -9343,7 +9609,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) { return rq->misfit_task_load && - (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || + (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || check_cpu_capacity(rq, sd)); } @@ -9495,7 +9761,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) * can only do it if @group is an SMT group and has exactly on busy CPU. Larger * imbalances in the number of CPUS are dealt with in find_busiest_group(). * - * If we are balancing load within an SMT core, or at DIE domain level, always + * If we are balancing load within an SMT core, or at PKG domain level, always * proceed. * * Return: true if @env::dst_cpu can do with asym_packing load balance. False @@ -9579,7 +9845,7 @@ static inline long sibling_imbalance(struct lb_env *env, imbalance /= ncores_local + ncores_busiest; /* Take advantage of resource in an empty sched group */ - if (imbalance == 0 && local->sum_nr_running == 0 && + if (imbalance <= 1 && local->sum_nr_running == 0 && busiest->sum_nr_running > 1) imbalance = 2; @@ -9767,6 +10033,15 @@ static bool update_sd_pick_busiest(struct lb_env *env, break; case group_smt_balance: + /* + * Check if we have spare CPUs on either SMT group to + * choose has spare or fully busy handling. + */ + if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) + goto has_spare; + + fallthrough; + case group_fully_busy: /* * Select the fully busy group with highest avg_load. In @@ -9806,6 +10081,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, else return true; } +has_spare: /* * Select not overloaded group with lowest number of idle cpus @@ -10917,6 +11193,7 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { + struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask); struct sched_group *sg = env->sd->groups; int cpu, idle_smt = -1; @@ -10940,8 +11217,9 @@ static int should_we_balance(struct lb_env *env) return 1; } + cpumask_copy(swb_cpus, group_balance_mask(sg)); /* Try to find first idle CPU */ - for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { + for_each_cpu_and(cpu, swb_cpus, env->cpus) { if (!idle_cpu(cpu)) continue; @@ -10953,15 +11231,27 @@ static int should_we_balance(struct lb_env *env) if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { if (idle_smt == -1) idle_smt = cpu; + /* + * If the core is not idle, and first SMT sibling which is + * idle has been found, then its not needed to check other + * SMT siblings for idleness: + */ +#ifdef CONFIG_SCHED_SMT + cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu)); +#endif continue; } - /* Are we the first idle CPU? */ + /* + * Are we the first idle core in a non-SMT domain or higher, + * or the first idle CPU in a SMT domain? + */ return cpu == env->dst_cpu; } - if (idle_smt == env->dst_cpu) - return true; + /* Are we the first idle CPU with busy siblings? */ + if (idle_smt != -1) + return idle_smt == env->dst_cpu; /* Are we the first CPU of this group ? */ return group_balance_cpu(sg) == env->dst_cpu; @@ -11174,13 +11464,15 @@ more_balance: busiest->push_cpu = this_cpu; active_balance = 1; } - raw_spin_rq_unlock_irqrestore(busiest, flags); + preempt_disable(); + raw_spin_rq_unlock_irqrestore(busiest, flags); if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); } + preempt_enable(); } } else { sd->nr_balance_failed = 0; @@ -11488,36 +11780,39 @@ static inline int on_null_domain(struct rq *rq) #ifdef CONFIG_NO_HZ_COMMON /* - * idle load balancing details - * - When one of the busy CPUs notice that there may be an idle rebalancing + * NOHZ idle load balancing (ILB) details: + * + * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set + * + * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set * anywhere yet. */ - static inline int find_new_ilb(void) { - int ilb; const struct cpumask *hk_mask; + int ilb_cpu; hk_mask = housekeeping_cpumask(HK_TYPE_MISC); - for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { + for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { - if (ilb == smp_processor_id()) + if (ilb_cpu == smp_processor_id()) continue; - if (idle_cpu(ilb)) - return ilb; + if (idle_cpu(ilb_cpu)) + return ilb_cpu; } - return nr_cpu_ids; + return -1; } /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). + * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU + * SMP function call (IPI). + * + * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -11531,8 +11826,7 @@ static void kick_ilb(unsigned int flags) nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); - - if (ilb_cpu >= nr_cpu_ids) + if (ilb_cpu < 0) return; /* @@ -11545,7 +11839,7 @@ static void kick_ilb(unsigned int flags) /* * This way we generate an IPI on the target CPU which - * is idle. And the softirq performing nohz idle load balance + * is idle, and the softirq performing NOHZ idle load balancing * will be run before returning from the IPI. */ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); @@ -11574,7 +11868,7 @@ static void nohz_balancer_kick(struct rq *rq) /* * None are in tickless mode and hence no need for NOHZ idle load - * balancing. + * balancing: */ if (likely(!atomic_read(&nohz.nr_cpus))) return; @@ -11596,9 +11890,8 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(rq->sd); if (sd) { /* - * If there's a CFS task and the current CPU has reduced - * capacity; kick the ILB to see if there's a better CPU to run - * on. + * If there's a runnable CFS task and the current CPU has reduced + * capacity, kick the ILB to see if there's a better CPU to run on: */ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; @@ -11650,11 +11943,11 @@ static void nohz_balancer_kick(struct rq *rq) if (sds) { /* * If there is an imbalance between LLC domains (IOW we could - * increase the overall cache use), we need some less-loaded LLC - * domain to pull some load. Likewise, we may need to spread + * increase the overall cache utilization), we need a less-loaded LLC + * domain to pull some load from. Likewise, we may need to spread * load within the current LLC domain (e.g. packed SMT cores but * other CPUs are idle). We can't really know from here how busy - * the others are - so just get a nohz balance going if it looks + * the others are - so just get a NOHZ balance going if it looks * like this LLC domain has tasks we could move. */ nr_busy = atomic_read(&sds->nr_busy_cpus); @@ -11924,8 +12217,19 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) } /* - * Check if we need to run the ILB for updating blocked load before entering - * idle state. + * Check if we need to directly run the ILB for updating blocked load before + * entering idle state. Here we run ILB directly without issuing IPIs. + * + * Note that when this function is called, the tick may not yet be stopped on + * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and + * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates + * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle + * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is + * called from this function on (this) CPU that's not yet in the mask. That's + * OK because the goal of nohz_run_idle_balance() is to run ILB only for + * updating the blocked load of already idle CPUs without waking up one of + * those idle CPUs and outside the preempt disable / irq off phase of the local + * cpu about to enter idle, because it can take a long time. */ void nohz_run_idle_balance(int cpu) { @@ -12167,6 +12471,9 @@ static void rq_offline_fair(struct rq *rq) /* Ensure any throttled groups are reachable by pick_next_task */ unthrottle_offline_cfs_rqs(rq); + + /* Ensure that we remove rq contribution to group share: */ + clear_tg_offline_cfs_rqs(rq); } #endif /* CONFIG_SMP */ @@ -12370,7 +12677,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (p->prio > oldprio) resched_curr(rq); } else - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -12472,7 +12779,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) if (task_current(rq, p)) resched_curr(rq); else - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } } @@ -12790,19 +13097,6 @@ next_cpu: return 0; } -#else /* CONFIG_FAIR_GROUP_SCHED */ - -void free_fair_sched_group(struct task_group *tg) { } - -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -void online_fair_sched_group(struct task_group *tg) { } - -void unregister_fair_sched_group(struct task_group *tg) { } - #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -12831,7 +13125,7 @@ DEFINE_SCHED_CLASS(fair) = { .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .check_preempt_curr = check_preempt_wakeup, + .wakeup_preempt = check_preempt_wakeup_fair, .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, @@ -12918,6 +13212,8 @@ __init void init_sched_fair_class(void) for_each_possible_cpu(i) { zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i), + GFP_KERNEL, cpu_to_node(i)); #ifdef CONFIG_CFS_BANDWIDTH INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index f770168230ae..143f55df890b 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true) /* @@ -84,7 +83,6 @@ SCHED_FEAT(WA_BIAS, true) * UtilEstimation. Use estimated CPU utilization. */ SCHED_FEAT(UTIL_EST, true) -SCHED_FEAT(UTIL_EST_FASTUP, true) SCHED_FEAT(LATENCY_WARN, false) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 342f58a329f5..31231925f1ec 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -258,6 +258,36 @@ static void do_idle(void) while (!need_resched()) { rmb(); + /* + * Interrupts shouldn't be re-enabled from that point on until + * the CPU sleeping instruction is reached. Otherwise an interrupt + * may fire and queue a timer that would be ignored until the CPU + * wakes from the sleeping instruction. And testing need_resched() + * doesn't tell about pending needed timer reprogram. + * + * Several cases to consider: + * + * - SLEEP-UNTIL-PENDING-INTERRUPT based instructions such as + * "wfi" or "mwait" are fine because they can be entered with + * interrupt disabled. + * + * - sti;mwait() couple is fine because the interrupts are + * re-enabled only upon the execution of mwait, leaving no gap + * in-between. + * + * - ROLLBACK based idle handlers with the sleeping instruction + * called with interrupts enabled are NOT fine. In this scheme + * when the interrupt detects it has interrupted an idle handler, + * it rolls back to its beginning which performs the + * need_resched() check before re-executing the sleeping + * instruction. This can leak a pending needed timer reprogram. + * If such a scheme is really mandatory due to the lack of an + * appropriate CPU sleeping instruction, then a FAST-FORWARD + * must instead be applied: when the interrupt detects it has + * interrupted an idle handler, it must resume to the end of + * this idle handler so that the generic idle loop is iterated + * again to reprogram the tick. + */ local_irq_disable(); if (cpu_is_offline(cpu)) { @@ -373,6 +403,7 @@ EXPORT_SYMBOL_GPL(play_idle_precise); void cpu_startup_entry(enum cpuhp_state state) { + current->flags |= PF_IDLE; arch_cpu_idle_prepare(); cpuhp_online_idle(state); while (1) @@ -400,7 +431,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) +static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) { resched_curr(rq); } @@ -481,7 +512,7 @@ DEFINE_SCHED_CLASS(idle) = { /* dequeue is not valid, we print a debug message there: */ .dequeue_task = dequeue_task_idle, - .check_preempt_curr = check_preempt_curr_idle, + .wakeup_preempt = wakeup_preempt_idle, .pick_next_task = pick_next_task_idle, .put_prev_task = put_prev_task_idle, diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 0f310768260c..63b6cf898220 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Per Entity Load Tracking + * Per Entity Load Tracking (PELT) * * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 3a0e0dc28721..9e1083465fbc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -52,13 +52,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg) return; /* Avoid store if the flag has been already reset */ - enqueued = avg->util_est.enqueued; + enqueued = avg->util_est; if (!(enqueued & UTIL_AVG_UNCHANGED)) return; /* Reset flag to report util_avg has been updated */ enqueued &= ~UTIL_AVG_UNCHANGED; - WRITE_ONCE(avg->util_est.enqueued, enqueued); + WRITE_ONCE(avg->util_est, enqueued); } static inline u64 rq_clock_pelt(struct rq *rq) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1d0f634725a6..7b4aa5809c0f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -434,14 +434,13 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value) return growth; } -static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, +static void update_triggers(struct psi_group *group, u64 now, enum psi_aggregators aggregator) { struct psi_trigger *t; u64 *total = group->total[aggregator]; struct list_head *triggers; u64 *aggregator_total; - *update_total = false; if (aggregator == PSI_AVGS) { triggers = &group->avg_triggers; @@ -471,14 +470,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, * events without dropping any). */ if (new_stall) { - /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. - */ - *update_total = true; - /* Calculate growth since last update */ growth = window_update(&t->win, now, total[t->state]); if (!t->pending_event) { @@ -503,8 +494,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, /* Reset threshold breach flag once event got generated */ t->pending_event = false; } - - return now + group->rtpoll_min_period; } static u64 update_averages(struct psi_group *group, u64 now) @@ -565,7 +554,6 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; u32 changed_states; - bool update_total; u64 now; dwork = to_delayed_work(work); @@ -584,7 +572,7 @@ static void psi_avgs_work(struct work_struct *work) * go - see calc_avgs() and missed_periods. */ if (now >= group->avg_next_update) { - update_triggers(group, now, &update_total, PSI_AVGS); + update_triggers(group, now, PSI_AVGS); group->avg_next_update = update_averages(group, now); } @@ -608,7 +596,7 @@ static void init_rtpoll_triggers(struct psi_group *group, u64 now) group->rtpoll_next_update = now + group->rtpoll_min_period; } -/* Schedule polling if it's not already scheduled or forced. */ +/* Schedule rtpolling if it's not already scheduled or forced. */ static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay, bool force) { @@ -640,7 +628,6 @@ static void psi_rtpoll_work(struct psi_group *group) { bool force_reschedule = false; u32 changed_states; - bool update_total; u64 now; mutex_lock(&group->rtpoll_trigger_lock); @@ -649,37 +636,37 @@ static void psi_rtpoll_work(struct psi_group *group) if (now > group->rtpoll_until) { /* - * We are either about to start or might stop polling if no - * state change was recorded. Resetting poll_scheduled leaves + * We are either about to start or might stop rtpolling if no + * state change was recorded. Resetting rtpoll_scheduled leaves * a small window for psi_group_change to sneak in and schedule - * an immediate poll_work before we get to rescheduling. One - * potential extra wakeup at the end of the polling window - * should be negligible and polling_next_update still keeps + * an immediate rtpoll_work before we get to rescheduling. One + * potential extra wakeup at the end of the rtpolling window + * should be negligible and rtpoll_next_update still keeps * updates correctly on schedule. */ atomic_set(&group->rtpoll_scheduled, 0); /* - * A task change can race with the poll worker that is supposed to + * A task change can race with the rtpoll worker that is supposed to * report on it. To avoid missing events, ensure ordering between - * poll_scheduled and the task state accesses, such that if the poll - * worker misses the state update, the task change is guaranteed to - * reschedule the poll worker: + * rtpoll_scheduled and the task state accesses, such that if the + * rtpoll worker misses the state update, the task change is + * guaranteed to reschedule the rtpoll worker: * - * poll worker: - * atomic_set(poll_scheduled, 0) + * rtpoll worker: + * atomic_set(rtpoll_scheduled, 0) * smp_mb() * LOAD states * * task change: * STORE states - * if atomic_xchg(poll_scheduled, 1) == 0: - * schedule poll worker + * if atomic_xchg(rtpoll_scheduled, 1) == 0: + * schedule rtpoll worker * * The atomic_xchg() implies a full barrier. */ smp_mb(); } else { - /* Polling window is not over, keep rescheduling */ + /* The rtpolling window is not over, keep rescheduling */ force_reschedule = true; } @@ -687,7 +674,7 @@ static void psi_rtpoll_work(struct psi_group *group) collect_percpu_times(group, PSI_POLL, &changed_states); if (changed_states & group->rtpoll_states) { - /* Initialize trigger windows when entering polling mode */ + /* Initialize trigger windows when entering rtpolling mode */ if (now > group->rtpoll_until) init_rtpoll_triggers(group, now); @@ -706,10 +693,12 @@ static void psi_rtpoll_work(struct psi_group *group) } if (now >= group->rtpoll_next_update) { - group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL); - if (update_total) + if (changed_states & group->rtpoll_states) { + update_triggers(group, now, PSI_POLL); memcpy(group->rtpoll_total, group->total[PSI_POLL], sizeof(group->rtpoll_total)); + } + group->rtpoll_next_update = now + group->rtpoll_min_period; } psi_schedule_rtpoll_work(group, @@ -1009,6 +998,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) struct psi_group_cpu *groupc; u64 now; + if (static_branch_likely(&psi_disabled)) + return; + if (!task->pid) return; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0597ba0f85ff..3261b067b67e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -16,7 +16,7 @@ struct rt_bandwidth def_rt_bandwidth; * period over which we measure -rt task CPU usage in us. * default: 1s */ -unsigned int sysctl_sched_rt_period = 1000000; +int sysctl_sched_rt_period = 1000000; /* * part of the period that we allow rt tasks to run in us. @@ -34,9 +34,11 @@ static struct ctl_table sched_rt_sysctls[] = { { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(int), .mode = 0644, .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "sched_rt_runtime_us", @@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_NEG_ONE, + .extra2 = (void *)&sysctl_sched_rt_period, }, { .procname = "sched_rr_timeslice_ms", @@ -143,7 +147,6 @@ void init_rt_rq(struct rt_rq *rt_rq) #if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->highest_prio.next = MAX_RT_PRIO-1; - rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); #endif /* CONFIG_SMP */ @@ -358,53 +361,6 @@ static inline void rt_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } -static void update_rt_migration(struct rt_rq *rt_rq) -{ - if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { - if (!rt_rq->overloaded) { - rt_set_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 1; - } - } else if (rt_rq->overloaded) { - rt_clear_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 0; - } -} - -static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - return; - - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory++; - - update_rt_migration(rt_rq); -} - -static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - return; - - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory--; - - update_rt_migration(rt_rq); -} - static inline int has_pushable_tasks(struct rq *rq) { return !plist_head_empty(&rq->rt.pushable_tasks); @@ -438,6 +394,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) /* Update the highest prio pushable task */ if (p->prio < rq->rt.highest_prio.next) rq->rt.highest_prio.next = p->prio; + + if (!rq->rt.overloaded) { + rt_set_overload(rq); + rq->rt.overloaded = 1; + } } static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) @@ -451,6 +412,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) rq->rt.highest_prio.next = p->prio; } else { rq->rt.highest_prio.next = MAX_RT_PRIO-1; + + if (rq->rt.overloaded) { + rt_clear_overload(rq); + rq->rt.overloaded = 0; + } } } @@ -464,16 +430,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) { } -static inline -void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - -static inline -void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - static inline void rt_queue_push_tasks(struct rq *rq) { } @@ -515,7 +471,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) min_cap = uclamp_eff_value(p, UCLAMP_MIN); max_cap = uclamp_eff_value(p, UCLAMP_MAX); - cpu_cap = capacity_orig_of(cpu); + cpu_cap = arch_scale_cpu_capacity(cpu); return cpu_cap >= min(min_cap, max_cap); } @@ -953,7 +909,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) /* * When we're idle and a woken (rt) task is - * throttled check_preempt_curr() will set + * throttled wakeup_preempt() will set * skip_update and the time between the wakeup * and this unthrottle will get accounted as * 'runtime'. @@ -1046,24 +1002,15 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; - u64 delta_exec; - u64 now; + s64 delta_exec; if (curr->sched_class != &rt_sched_class) return; - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + delta_exec = update_curr_common(rq); + if (unlikely(delta_exec <= 0)) return; - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - trace_sched_stat_runtime(curr, delta_exec, 0); - - update_current_exec_runtime(curr, now, delta_exec); - if (!rt_bandwidth_enabled()) return; @@ -1281,7 +1228,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); inc_rt_prio(rt_rq, prio); - inc_rt_migration(rt_se, rt_rq); inc_rt_group(rt_se, rt_rq); } @@ -1294,7 +1240,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); - dec_rt_migration(rt_se, rt_rq); dec_rt_group(rt_se, rt_rq); } @@ -1715,7 +1660,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) +static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { resched_curr(rq); @@ -2109,9 +2054,11 @@ retry: */ push_task = get_push_task(rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, push_cpu_stop, push_task, &rq->push_work); + preempt_enable(); raw_spin_rq_lock(rq); } @@ -2448,9 +2395,11 @@ skip: double_unlock_balance(this_rq, src_rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(this_rq); stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, push_task, &src_rq->push_work); + preempt_enable(); raw_spin_rq_lock(this_rq); } } @@ -2702,7 +2651,7 @@ DEFINE_SCHED_CLASS(rt) = { .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, - .check_preempt_curr = check_preempt_curr_rt, + .wakeup_preempt = wakeup_preempt_rt, .pick_next_task = pick_next_task_rt, .put_prev_task = put_prev_task_rt, @@ -2985,9 +2934,6 @@ static int sched_rt_global_constraints(void) #ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) { - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - if ((sysctl_sched_rt_runtime != RUNTIME_INF) && ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || ((u64)sysctl_sched_rt_runtime * @@ -3018,7 +2964,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_validate(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 04846272409c..001fe047bd5d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -74,15 +74,6 @@ #include "../workqueue_internal.h" -#ifdef CONFIG_CGROUP_SCHED -#include <linux/cgroup.h> -#include <linux/psi.h> -#endif - -#ifdef CONFIG_SCHED_DEBUG -# include <linux/static_key.h> -#endif - #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> # include <asm/paravirt_api_clock.h> @@ -109,14 +100,12 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; -extern unsigned int sysctl_sched_child_runs_first; - extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); extern void call_trace_sched_update_nr_running(struct rq *rq, int count); -extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; extern int sched_rr_timeslice; @@ -284,8 +273,6 @@ struct rt_bandwidth { unsigned int rt_period_active; }; -void __dl_clear_params(struct task_struct *p); - static inline int dl_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -326,6 +313,33 @@ extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *att extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int dl_bw_check_overflow(int cpu); +/* + * SCHED_DEADLINE supports servers (nested scheduling) with the following + * interface: + * + * dl_se::rq -- runqueue we belong to. + * + * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the + * server when it runs out of tasks to run. + * + * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this + * returns NULL. + * + * dl_server_update() -- called from update_curr_common(), propagates runtime + * to the server. + * + * dl_server_start() + * dl_server_stop() -- start/stop the server when it has (no) tasks. + * + * dl_server_init() -- initializes the server. + */ +extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); +extern void dl_server_start(struct sched_dl_entity *dl_se); +extern void dl_server_stop(struct sched_dl_entity *dl_se); +extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick); + #ifdef CONFIG_CGROUP_SCHED struct cfs_rq; @@ -447,10 +461,21 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) extern int tg_nop(struct task_group *tg, void *data); +#ifdef CONFIG_FAIR_GROUP_SCHED extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); +#else +static inline void free_fair_sched_group(struct task_group *tg) { } +static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} +static inline void online_fair_sched_group(struct task_group *tg) { } +static inline void unregister_fair_sched_group(struct task_group *tg) { } +#endif + extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); @@ -594,6 +619,7 @@ struct cfs_rq { } removed; #ifdef CONFIG_FAIR_GROUP_SCHED + u64 last_update_tg_load_avg; unsigned long tg_load_avg_contrib; long propagate; long prop_runnable_sum; @@ -644,9 +670,7 @@ struct cfs_rq { int throttled; int throttle_count; struct list_head throttled_list; -#ifdef CONFIG_SMP struct list_head throttled_csd_list; -#endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -675,8 +699,6 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - unsigned int rt_nr_migratory; - unsigned int rt_nr_total; int overloaded; struct plist_head pushable_tasks; @@ -721,7 +743,6 @@ struct dl_rq { u64 next; } earliest_dl; - unsigned int dl_nr_migratory; int overloaded; /* @@ -963,10 +984,6 @@ struct rq { /* runqueue lock: */ raw_spinlock_t __lock; - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1048,7 +1065,6 @@ struct rq { struct sched_domain __rcu *sd; unsigned long cpu_capacity; - unsigned long cpu_capacity_orig; struct balance_callback *balance_callback; @@ -1079,9 +1095,6 @@ struct rq { u64 idle_stamp; u64 avg_idle; - unsigned long wake_stamp; - u64 wake_avg_idle; - /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; @@ -1658,6 +1671,11 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } +DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, + _T->rq = task_rq_lock(_T->lock, &_T->rf), + task_rq_unlock(_T->rq, _T->lock, &_T->rf), + struct rq *rq; struct rq_flags rf) + static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) @@ -1868,11 +1886,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(int, sd_share_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +extern struct static_key_false sched_cluster_active; static __always_inline bool sched_asym_cpucap_active(void) { @@ -2195,6 +2215,10 @@ extern const u32 sched_prio_to_wmult[40]; * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location * in the runqueue. * + * NOCLOCK - skip the update_rq_clock() (avoids double updates) + * + * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) + * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup @@ -2205,6 +2229,7 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2219,6 +2244,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_MIGRATED 0x00 #endif #define ENQUEUE_INITIAL 0x80 +#define ENQUEUE_MIGRATING 0x100 #define RETRY_TASK ((void *)-1UL) @@ -2228,6 +2254,8 @@ struct affinity_context { unsigned int flags; }; +extern s64 update_curr_common(struct rq *rq); + struct sched_class { #ifdef CONFIG_UCLAMP_TASK @@ -2239,7 +2267,7 @@ struct sched_class { void (*yield_task) (struct rq *rq); bool (*yield_to_task)(struct rq *rq, struct task_struct *p); - void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); struct task_struct *(*pick_next_task)(struct rq *rq); @@ -2441,8 +2469,7 @@ extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -extern void init_dl_task_timer(struct sched_dl_entity *dl_se); -extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_entity(struct sched_dl_entity *dl_se); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) @@ -2513,7 +2540,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); -extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT #define SCHED_NR_MIGRATE_BREAK 8 @@ -2838,6 +2865,7 @@ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), double_rq_unlock(_T->lock, _T->lock2)) +extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); @@ -2977,29 +3005,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_SMP -static inline unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long *min, + unsigned long *max); -/** - * enum cpu_util_type - CPU utilization type - * @FREQUENCY_UTIL: Utilization used to select frequency - * @ENERGY_UTIL: Utilization used during energy calculation - * - * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time - * need to be aggregated differently depending on the usage made of them. This - * enum is used within effective_cpu_util() to differentiate the types of - * utilization expected by the callers, and adjust the aggregation accordingly. - */ -enum cpu_util_type { - FREQUENCY_UTIL, - ENERGY_UTIL, -}; +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long max); -unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - enum cpu_util_type type, - struct task_struct *p); /* * Verify the fitness of task @p to run on @cpu taking into account the @@ -3056,59 +3069,6 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) return rq->uclamp_flags & UCLAMP_FLAG_IDLE; } -/** - * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values. - * @rq: The rq to clamp against. Must not be NULL. - * @util: The util value to clamp. - * @p: The task to clamp against. Can be NULL if you want to clamp - * against @rq only. - * - * Clamps the passed @util to the max(@rq, @p) effective uclamp values. - * - * If sched_uclamp_used static key is disabled, then just return the util - * without any clamping since uclamp aggregation at the rq level in the fast - * path is disabled, rendering this operation a NOP. - * - * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It - * will return the correct effective uclamp value of the task even if the - * static key is disabled. - */ -static __always_inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - unsigned long min_util = 0; - unsigned long max_util = 0; - - if (!static_branch_likely(&sched_uclamp_used)) - return util; - - if (p) { - min_util = uclamp_eff_value(p, UCLAMP_MIN); - max_util = uclamp_eff_value(p, UCLAMP_MAX); - - /* - * Ignore last runnable task's max clamp, as this task will - * reset it. Similarly, no need to read the rq's min clamp. - */ - if (uclamp_rq_is_idle(rq)) - goto out; - } - - min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN)); - max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX)); -out: - /* - * Since CPU's {min,max}_util clamps are MAX aggregated considering - * RUNNABLE tasks with _different_ clamps, we can end up with an - * inversion. Fix it now when the clamps are applied. - */ - if (unlikely(min_util >= max_util)) - return min_util; - - return clamp(util, min_util, max_util); -} - /* Is the rq being capped/throttled by uclamp_max? */ static inline bool uclamp_rq_is_capped(struct rq *rq) { @@ -3146,13 +3106,6 @@ static inline unsigned long uclamp_eff_value(struct task_struct *p, return SCHED_CAPACITY_SCALE; } -static inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) -{ - return util; -} - static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } static inline bool uclamp_is_used(void) @@ -3219,6 +3172,8 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } +extern struct cpufreq_governor schedutil_gov; + #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #define perf_domain_span(pd) NULL @@ -3280,16 +3235,6 @@ extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif -static inline void update_current_exec_runtime(struct task_struct *curr, - u64 now, u64 delta_exec) -{ - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); -} - #ifdef CONFIG_SCHED_MM_CID #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 85590599b4d6..b1b8fe61c532 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #endif /* CONFIG_SMP */ static void -check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) { /* we're never preempted */ } @@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { - struct task_struct *curr = rq->curr; - u64 now, delta_exec; - - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; - - schedstat_set(curr->stats.exec_max, - max(curr->stats.exec_max, delta_exec)); - - update_current_exec_runtime(curr, now, delta_exec); + update_curr_common(rq); } /* @@ -120,7 +109,7 @@ DEFINE_SCHED_CLASS(stop) = { .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, - .check_preempt_curr = check_preempt_curr_stop, + .wakeup_preempt = wakeup_preempt_stop, .pick_next_task = pick_next_task_stop, .put_prev_task = put_prev_task_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 05a5bc678c08..10d1391e7416 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -212,6 +212,69 @@ static unsigned int sysctl_sched_energy_aware = 1; static DEFINE_MUTEX(sched_energy_mutex); static bool sched_energy_update; +static bool sched_is_eas_possible(const struct cpumask *cpu_mask) +{ + bool any_asym_capacity = false; + struct cpufreq_policy *policy; + struct cpufreq_governor *gov; + int i; + + /* EAS is enabled for asymmetric CPU capacity topologies. */ + for_each_cpu(i, cpu_mask) { + if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) { + any_asym_capacity = true; + break; + } + } + if (!any_asym_capacity) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + /* EAS definitely does *not* handle SMT */ + if (sched_smt_active()) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, SMT is not supported\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + if (!arch_scale_freq_invariant()) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + /* Do not attempt EAS if schedutil is not being used. */ + for_each_cpu(i, cpu_mask) { + policy = cpufreq_cpu_get(i); + if (!policy) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d", + cpumask_pr_args(cpu_mask), i); + } + return false; + } + gov = policy->governor; + cpufreq_cpu_put(policy); + if (gov != &schedutil_gov) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + } + + return true; +} + void rebuild_sched_domains_energy(void) { mutex_lock(&sched_energy_mutex); @@ -230,6 +293,15 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write, if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (!sched_is_eas_possible(cpu_active_mask)) { + if (write) { + return -EOPNOTSUPP; + } else { + *lenp = 0; + return 0; + } + } + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); @@ -348,103 +420,33 @@ static void sched_energy_set(bool has_eas) * 1. an Energy Model (EM) is available; * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. * 3. no SMT is detected. - * 4. the EM complexity is low enough to keep scheduling overheads low; - * 5. schedutil is driving the frequency of all CPUs of the rd; - * 6. frequency invariance support is present; - * - * The complexity of the Energy Model is defined as: - * - * C = nr_pd * (nr_cpus + nr_ps) - * - * with parameters defined as: - * - nr_pd: the number of performance domains - * - nr_cpus: the number of CPUs - * - nr_ps: the sum of the number of performance states of all performance - * domains (for example, on a system with 2 performance domains, - * with 10 performance states each, nr_ps = 2 * 10 = 20). - * - * It is generally not a good idea to use such a model in the wake-up path on - * very complex platforms because of the associated scheduling overheads. The - * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs - * with per-CPU DVFS and less than 8 performance states each, for example. + * 4. schedutil is driving the frequency of all CPUs of the rd; + * 5. frequency invariance support is present; */ -#define EM_MAX_COMPLEXITY 2048 - -extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { - int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); + int i; struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; - struct cpufreq_policy *policy; - struct cpufreq_governor *gov; if (!sysctl_sched_energy_aware) goto free; - /* EAS is enabled for asymmetric CPU capacity topologies. */ - if (!per_cpu(sd_asym_cpucapacity, cpu)) { - if (sched_debug()) { - pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", - cpumask_pr_args(cpu_map)); - } + if (!sched_is_eas_possible(cpu_map)) goto free; - } - - /* EAS definitely does *not* handle SMT */ - if (sched_smt_active()) { - pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", - cpumask_pr_args(cpu_map)); - goto free; - } - - if (!arch_scale_freq_invariant()) { - if (sched_debug()) { - pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", - cpumask_pr_args(cpu_map)); - } - goto free; - } for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ if (find_pd(pd, i)) continue; - /* Do not attempt EAS if schedutil is not being used. */ - policy = cpufreq_cpu_get(i); - if (!policy) - goto free; - gov = policy->governor; - cpufreq_cpu_put(policy); - if (gov != &schedutil_gov) { - if (rd->pd) - pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", - cpumask_pr_args(cpu_map)); - goto free; - } - /* Create the new pd and add it to the local list. */ tmp = pd_init(i); if (!tmp) goto free; tmp->next = pd; pd = tmp; - - /* - * Count performance domains and performance states for the - * complexity check. - */ - nr_pd++; - nr_ps += em_pd_nr_perf_states(pd->em_pd); - } - - /* Bail out if the Energy Model complexity is too high. */ - if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { - WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", - cpumask_pr_args(cpu_map)); - goto free; } perf_domain_debug(cpu_map, pd); @@ -666,11 +668,14 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); static void update_top_cache_domain(int cpu) { @@ -691,6 +696,17 @@ static void update_top_cache_domain(int cpu) per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + + /* + * This assignment should be placed after the sd_llc_id as + * we want this id equals to cluster id on cluster machines + * but equals to LLC id on non-Cluster machines. + */ + per_cpu(sd_share_id, cpu) = id; + sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -1117,7 +1133,7 @@ fail: * * - Simultaneous multithreading (SMT) * - Multi-Core Cache (MC) - * - Package (DIE) + * - Package (PKG) * * Where the last one more or less denotes everything up to a NUMA node. * @@ -1139,13 +1155,13 @@ fail: * * CPU 0 1 2 3 4 5 6 7 * - * DIE [ ] + * PKG [ ] * MC [ ] [ ] * SMT [ ] [ ] [ ] [ ] * * - or - * - * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 + * PKG 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 * @@ -1548,6 +1564,7 @@ static struct cpumask ***sched_domains_numa_masks; */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ + SD_CLUSTER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING) @@ -1679,7 +1696,7 @@ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, { NULL, }, }; @@ -2112,22 +2129,31 @@ static int hop_cmp(const void *a, const void *b) return -1; } -/* - * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu - * closest to @cpu from @cpumask. - * cpumask: cpumask to find a cpu from - * cpu: Nth cpu to find - * - * returns: cpu, or nr_cpu_ids when nothing found. +/** + * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU + * from @cpus to @cpu, taking into account distance + * from a given @node. + * @cpus: cpumask to find a cpu from + * @cpu: CPU to start searching + * @node: NUMA node to order CPUs by distance + * + * Return: cpu, or nr_cpu_ids when nothing found. */ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { - struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu }; + struct __cmp_key k = { .cpus = cpus, .cpu = cpu }; struct cpumask ***hop_masks; int hop, ret = nr_cpu_ids; + if (node == NUMA_NO_NODE) + return cpumask_nth_and(cpu, cpus, cpu_online_mask); + rcu_read_lock(); + /* CPU-less node entries are uninitialized in sched_domains_numa_masks */ + node = numa_nearest_node(node, N_CPU); + k.node = node; + k.masks = rcu_dereference(sched_domains_numa_masks); if (!k.masks) goto unlock; @@ -2362,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct rq *rq = NULL; int i, ret = -ENOMEM; bool has_asym = false; + bool has_cluster = false; if (WARN_ON(cpumask_empty(cpu_map))) goto error; @@ -2479,20 +2506,29 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + unsigned long capacity; + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); + capacity = arch_scale_cpu_capacity(i); /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); + if (capacity > READ_ONCE(d.rd->max_cpu_capacity)) + WRITE_ONCE(d.rd->max_cpu_capacity, capacity); cpu_attach_domain(sd, d.rd, i); + + if (lowest_flag_domain(i, SD_CLUSTER)) + has_cluster = true; } rcu_read_unlock(); if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + if (rq && sched_debug_verbose) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); @@ -2592,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + if (static_branch_unlikely(&sched_cluster_active)) + static_branch_dec_cpuslocked(&sched_cluster_active); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 802d98cf2de3..51e38f5f4701 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -58,13 +58,6 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry EXPORT_SYMBOL(remove_wait_queue); /* - * Scan threshold to break wait queue walk. - * This allows a waker to take a break from holding the - * wait queue lock during the wait queue walk. - */ -#define WAITQUEUE_WALK_BREAK_CNT 64 - -/* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve * number) then we wake that number of exclusive tasks, and potentially all @@ -78,21 +71,13 @@ EXPORT_SYMBOL(remove_wait_queue); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, int wake_flags, void *key, - wait_queue_entry_t *bookmark) + int nr_exclusive, int wake_flags, void *key) { wait_queue_entry_t *curr, *next; - int cnt = 0; lockdep_assert_held(&wq_head->lock); - if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { - curr = list_next_entry(bookmark, entry); - - list_del(&bookmark->entry); - bookmark->flags = 0; - } else - curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); + curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); if (&curr->entry == &wq_head->head) return nr_exclusive; @@ -101,21 +86,11 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, unsigned flags = curr->flags; int ret; - if (flags & WQ_FLAG_BOOKMARK) - continue; - ret = curr->func(curr, mode, wake_flags, key); if (ret < 0) break; if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; - - if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) && - (&next->entry != &wq_head->head)) { - bookmark->flags = WQ_FLAG_BOOKMARK; - list_add_tail(&bookmark->entry, &next->entry); - break; - } } return nr_exclusive; @@ -125,20 +100,12 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m int nr_exclusive, int wake_flags, void *key) { unsigned long flags; - wait_queue_entry_t bookmark; - int remaining = nr_exclusive; + int remaining; - bookmark.flags = 0; - bookmark.private = NULL; - bookmark.func = NULL; - INIT_LIST_HEAD(&bookmark.entry); - - do { - spin_lock_irqsave(&wq_head->lock, flags); - remaining = __wake_up_common(wq_head, mode, remaining, - wake_flags, key, &bookmark); - spin_unlock_irqrestore(&wq_head->lock, flags); - } while (bookmark.flags & WQ_FLAG_BOOKMARK); + spin_lock_irqsave(&wq_head->lock, flags); + remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, + key); + spin_unlock_irqrestore(&wq_head->lock, flags); return nr_exclusive - remaining; } @@ -171,23 +138,16 @@ void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode */ void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(wq_head, mode, nr, 0, NULL, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, 0, key, NULL); + __wake_up_common(wq_head, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); -void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, - unsigned int mode, void *key, wait_queue_entry_t *bookmark) -{ - __wake_up_common(wq_head, mode, 1, 0, key, bookmark); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); - /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue @@ -233,7 +193,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL); + __wake_up_common(wq_head, mode, 1, WF_SYNC, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 255999ba9190..aca7b437882e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1072,7 +1072,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn */ list_del_init(&addfd->list); if (!addfd->setfd) - fd = receive_fd(addfd->file, addfd->flags); + fd = receive_fd(addfd->file, NULL, addfd->flags); else fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); addfd->ret = fd; diff --git a/kernel/signal.c b/kernel/signal.c index 09019017d669..c9c57d053ce4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -171,16 +171,6 @@ static bool recalc_sigpending_tsk(struct task_struct *t) return false; } -/* - * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up. - * This is superfluous when called on current, the wakeup is a harmless no-op. - */ -void recalc_sigpending_and_wake(struct task_struct *t) -{ - if (recalc_sigpending_tsk(t)) - signal_wake_up(t, 0); -} - void recalc_sigpending(void) { if (!recalc_sigpending_tsk(current) && !freezing(current)) @@ -415,7 +405,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, int override_rlimit, const unsigned int sigqueue_flags) { struct sigqueue *q = NULL; - struct ucounts *ucounts = NULL; + struct ucounts *ucounts; long sigpending; /* @@ -1058,12 +1048,11 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = sig; signal->group_stop_count = 0; - t = p; - do { + __for_each_thread(signal, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); - } while_each_thread(p, t); + } return; } } @@ -1349,10 +1338,8 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, action->sa.sa_handler = SIG_DFL; if (handler == HANDLER_EXIT) action->sa.sa_flags |= SA_IMMUTABLE; - if (blocked) { + if (blocked) sigdelset(&t->blocked, sig); - recalc_sigpending_and_wake(t); - } } /* * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect @@ -1362,6 +1349,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal_locked(sig, info, t, PIDTYPE_PID); + /* This can happen if the signal was already pending and blocked */ + if (!task_sigpending(t)) + signal_wake_up(t, 0); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; @@ -1377,12 +1367,12 @@ int force_sig_info(struct kernel_siginfo *info) */ int zap_other_threads(struct task_struct *p) { - struct task_struct *t = p; + struct task_struct *t; int count = 0; p->signal->group_stop_count = 0; - while_each_thread(p, t) { + for_other_threads(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); /* Don't require de_thread to wait for the vhost_worker */ if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER) @@ -1471,16 +1461,21 @@ int group_send_sig_info(int sig, struct kernel_siginfo *info, int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; - int retval, success; + int ret = -ESRCH; - success = 0; - retval = -ESRCH; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID); - success |= !err; - retval = err; + /* + * If group_send_sig_info() succeeds at least once ret + * becomes 0 and after that the code below has no effect. + * Otherwise we return the last err or -ESRCH if this + * process group is empty. + */ + if (ret) + ret = err; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return success ? 0 : retval; + + return ret; } int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) @@ -1718,9 +1713,8 @@ void force_sigsegv(int sig) force_sig(SIGSEGV); } -int force_sig_fault_to_task(int sig, int code, void __user *addr - ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) - , struct task_struct *t) +int force_sig_fault_to_task(int sig, int code, void __user *addr, + struct task_struct *t) { struct kernel_siginfo info; @@ -1729,24 +1723,15 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr info.si_errno = 0; info.si_code = code; info.si_addr = addr; -#ifdef __ia64__ - info.si_imm = imm; - info.si_flags = flags; - info.si_isr = isr; -#endif return force_sig_info_to_task(&info, t, HANDLER_CURRENT); } -int force_sig_fault(int sig, int code, void __user *addr - ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)) +int force_sig_fault(int sig, int code, void __user *addr) { - return force_sig_fault_to_task(sig, code, addr - ___ARCH_SI_IA64(imm, flags, isr), current); + return force_sig_fault_to_task(sig, code, addr, current); } -int send_sig_fault(int sig, int code, void __user *addr - ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) - , struct task_struct *t) +int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t) { struct kernel_siginfo info; @@ -1755,11 +1740,6 @@ int send_sig_fault(int sig, int code, void __user *addr info.si_errno = 0; info.si_code = code; info.si_addr = addr; -#ifdef __ia64__ - info.si_imm = imm; - info.si_flags = flags; - info.si_isr = isr; -#endif return send_sig_info(info.si_signo, &info, t); } @@ -2329,15 +2309,38 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, do_notify_parent_cldstop(current, false, why); /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. + * The previous do_notify_parent_cldstop() invocation woke ptracer. + * One a PREEMPTION kernel this can result in preemption requirement + * which will be fulfilled after read_unlock() and the ptracer will be + * put on the CPU. + * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for + * this task wait in schedule(). If this task gets preempted then it + * remains enqueued on the runqueue. The ptracer will observe this and + * then sleep for a delay of one HZ tick. In the meantime this task + * gets scheduled, enters schedule() and will wait for the ptracer. * - * XXX: implement read_unlock_no_resched(). + * This preemption point is not bad from a correctness point of + * view but extends the runtime by one HZ tick time due to the + * ptracer's sleep. The preempt-disable section ensures that there + * will be no preemption between unlock and schedule() and so + * improving the performance since the ptracer will observe that + * the tracee is scheduled out once it gets on the CPU. + * + * On PREEMPT_RT locking tasklist_lock does not disable preemption. + * Therefore the task can be preempted after do_notify_parent_cldstop() + * before unlocking tasklist_lock so there is no benefit in doing this. + * + * In fact disabling preemption is harmful on PREEMPT_RT because + * the spinlock_t in cgroup_enter_frozen() must not be acquired + * with preemption disabled due to the 'sleeping' spinlock + * substitution of RT. */ - preempt_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); read_unlock(&tasklist_lock); cgroup_enter_frozen(); - preempt_enable_no_resched(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable_no_resched(); schedule(); cgroup_leave_frozen(true); @@ -2453,12 +2456,10 @@ static bool do_signal_stop(int signr) sig->group_exit_code = signr; sig->group_stop_count = 0; - if (task_set_jobctl_pending(current, signr | gstop)) sig->group_stop_count++; - t = current; - while_each_thread(current, t) { + for_other_threads(current, t) { /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, @@ -2954,8 +2955,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) if (sigisemptyset(&retarget)) return; - t = tsk; - while_each_thread(tsk, t) { + for_other_threads(tsk, t) { if (t->flags & PF_EXITING) continue; diff --git a/kernel/smp.c b/kernel/smp.c index 8455a53465af..f085ebcdf9e7 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -127,7 +127,7 @@ send_call_function_ipi_mask(struct cpumask *mask) } static __always_inline void -csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd) +csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd) { trace_csd_function_entry(func, csd); func(info); @@ -170,11 +170,13 @@ static DEFINE_PER_CPU(void *, cur_csd_info); static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ module_param(csd_lock_timeout, ulong, 0444); +static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */ +module_param(panic_on_ipistall, int, 0444); static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ -static void __csd_lock_record(struct __call_single_data *csd) +static void __csd_lock_record(call_single_data_t *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ @@ -189,13 +191,13 @@ static void __csd_lock_record(struct __call_single_data *csd) /* Or before unlock, as the case may be. */ } -static __always_inline void csd_lock_record(struct __call_single_data *csd) +static __always_inline void csd_lock_record(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } -static int csd_lock_wait_getcpu(struct __call_single_data *csd) +static int csd_lock_wait_getcpu(call_single_data_t *csd) { unsigned int csd_type; @@ -210,7 +212,7 @@ static int csd_lock_wait_getcpu(struct __call_single_data *csd) * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ -static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id) +static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) { int cpu = -1; int cpux; @@ -230,6 +232,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * } ts2 = sched_clock(); + /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) return false; @@ -243,9 +246,17 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * else cpux = cpu; cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ + /* How long since this CSD lock was stuck. */ + ts_delta = ts2 - ts0; pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", - firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0, + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta, cpu, csd->func, csd->info); + /* + * If the CSD lock is still stuck after 5 minutes, it is unlikely + * to become unstuck. Use a signed comparison to avoid triggering + * on underflows when the TSC is out of sync between sockets. + */ + BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC)); if (cpu_cur_csd && csd != cpu_cur_csd) { pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), @@ -276,7 +287,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void __csd_lock_wait(struct __call_single_data *csd) +static void __csd_lock_wait(call_single_data_t *csd) { int bug_id = 0; u64 ts0, ts1; @@ -290,7 +301,7 @@ static void __csd_lock_wait(struct __call_single_data *csd) smp_acquire__after_ctrl_dep(); } -static __always_inline void csd_lock_wait(struct __call_single_data *csd) +static __always_inline void csd_lock_wait(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); @@ -300,17 +311,17 @@ static __always_inline void csd_lock_wait(struct __call_single_data *csd) smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #else -static void csd_lock_record(struct __call_single_data *csd) +static void csd_lock_record(call_single_data_t *csd) { } -static __always_inline void csd_lock_wait(struct __call_single_data *csd) +static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif -static __always_inline void csd_lock(struct __call_single_data *csd) +static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; @@ -323,7 +334,7 @@ static __always_inline void csd_lock(struct __call_single_data *csd) smp_wmb(); } -static __always_inline void csd_unlock(struct __call_single_data *csd) +static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); @@ -376,7 +387,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, struct __call_single_data *csd) +static int generic_exec_single(int cpu, call_single_data_t *csd) { if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; @@ -667,7 +678,7 @@ EXPORT_SYMBOL(smp_call_function_single); * * Return: %0 on success or negative errno value on error */ -int smp_call_function_single_async(int cpu, struct __call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index f47d8f375946..1992b62e980b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -272,8 +272,7 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); if (tsk) { - kthread_stop(tsk); - put_task_struct(tsk); + kthread_stop_put(tsk); *per_cpu_ptr(ht->store, cpu) = NULL; } } diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9ed5ce989415..afb3c116da91 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(stack_trace_save); /** * stack_trace_save_tsk - Save a task stack trace into a storage array - * @task: The task to examine + * @tsk: The task to examine * @store: Pointer to storage array * @size: Size of the storage array * @skipnr: Number of entries to skip at the start of the stack trace @@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, put_task_stack(tsk); return c.len; } +EXPORT_SYMBOL_GPL(stack_trace_save_tsk); /** * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array @@ -301,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, save_stack_trace_tsk(task, &trace); return trace.nr_entries; } +EXPORT_SYMBOL_GPL(stack_trace_save_tsk); /** * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array diff --git a/kernel/sys.c b/kernel/sys.c index 2410e3999ebe..f8e543f1e38a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1785,74 +1785,87 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; u64 tgutime, tgstime, utime, stime; - unsigned long maxrss = 0; + unsigned long maxrss; + struct mm_struct *mm; + struct signal_struct *sig = p->signal; + unsigned int seq = 0; - memset((char *)r, 0, sizeof (*r)); +retry: + memset(r, 0, sizeof(*r)); utime = stime = 0; + maxrss = 0; if (who == RUSAGE_THREAD) { task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); - maxrss = p->signal->maxrss; - goto out; + maxrss = sig->maxrss; + goto out_thread; } - if (!lock_task_sighand(p, &flags)) - return; + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - r->ru_inblock = p->signal->cinblock; - r->ru_oublock = p->signal->coublock; - maxrss = p->signal->cmaxrss; + utime = sig->cutime; + stime = sig->cstime; + r->ru_nvcsw = sig->cnvcsw; + r->ru_nivcsw = sig->cnivcsw; + r->ru_minflt = sig->cmin_flt; + r->ru_majflt = sig->cmaj_flt; + r->ru_inblock = sig->cinblock; + r->ru_oublock = sig->coublock; + maxrss = sig->cmaxrss; if (who == RUSAGE_CHILDREN) break; fallthrough; case RUSAGE_SELF: - thread_group_cputime_adjusted(p, &tgutime, &tgstime); - utime += tgutime; - stime += tgstime; - r->ru_nvcsw += p->signal->nvcsw; - r->ru_nivcsw += p->signal->nivcsw; - r->ru_minflt += p->signal->min_flt; - r->ru_majflt += p->signal->maj_flt; - r->ru_inblock += p->signal->inblock; - r->ru_oublock += p->signal->oublock; - if (maxrss < p->signal->maxrss) - maxrss = p->signal->maxrss; - t = p; - do { + r->ru_nvcsw += sig->nvcsw; + r->ru_nivcsw += sig->nivcsw; + r->ru_minflt += sig->min_flt; + r->ru_majflt += sig->maj_flt; + r->ru_inblock += sig->inblock; + r->ru_oublock += sig->oublock; + if (maxrss < sig->maxrss) + maxrss = sig->maxrss; + + rcu_read_lock(); + __for_each_thread(sig, t) accumulate_thread_rusage(t, r); - } while_each_thread(p, t); + rcu_read_unlock(); + break; default: BUG(); } - unlock_task_sighand(p, &flags); -out: - r->ru_utime = ns_to_kernel_old_timeval(utime); - r->ru_stime = ns_to_kernel_old_timeval(stime); + if (need_seqretry(&sig->stats_lock, seq)) { + seq = 1; + goto retry; + } + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); - if (who != RUSAGE_CHILDREN) { - struct mm_struct *mm = get_task_mm(p); + if (who == RUSAGE_CHILDREN) + goto out_children; - if (mm) { - setmax_mm_hiwater_rss(&maxrss, mm); - mmput(mm); - } + thread_group_cputime_adjusted(p, &tgutime, &tgstime); + utime += tgutime; + stime += tgstime; + +out_thread: + mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); } + +out_children: r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ + r->ru_utime = ns_to_kernel_old_timeval(utime); + r->ru_stime = ns_to_kernel_old_timeval(stime); } SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) @@ -2368,19 +2381,45 @@ static int prctl_set_vma(unsigned long opt, unsigned long start, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline unsigned long get_current_mdwe(void) +{ + unsigned long ret = 0; + + if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + ret |= PR_MDWE_REFUSE_EXEC_GAIN; + if (test_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags)) + ret |= PR_MDWE_NO_INHERIT; + + return ret; +} + static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, unsigned long arg4, unsigned long arg5) { + unsigned long current_bits; + if (arg3 || arg4 || arg5) return -EINVAL; - if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) + if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT)) return -EINVAL; + /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */ + if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) + return -EINVAL; + + /* PARISC cannot allow mdwe as it needs writable stacks */ + if (IS_ENABLED(CONFIG_PARISC)) + return -EINVAL; + + current_bits = get_current_mdwe(); + if (current_bits && current_bits != bits) + return -EPERM; /* Cannot unset the flags */ + + if (bits & PR_MDWE_NO_INHERIT) + set_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags); if (bits & PR_MDWE_REFUSE_EXEC_GAIN) set_bit(MMF_HAS_MDWE, ¤t->mm->flags); - else if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) - return -EPERM; /* Cannot unset the flag */ return 0; } @@ -2390,9 +2429,7 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, { if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - - return test_bit(MMF_HAS_MDWE, ¤t->mm->flags) ? - PR_MDWE_REFUSE_EXEC_GAIN : 0; + return get_current_mdwe(); } static int prctl_get_auxv(void __user *addr, unsigned long len) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e137c1385c56..faad00cce269 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -51,8 +51,6 @@ COND_SYSCALL_COMPAT(io_pgetevents); COND_SYSCALL(io_uring_setup); COND_SYSCALL(io_uring_enter); COND_SYSCALL(io_uring_register); -COND_SYSCALL(lookup_dcookie); -COND_SYSCALL_COMPAT(lookup_dcookie); COND_SYSCALL(eventfd2); COND_SYSCALL(epoll_create1); COND_SYSCALL(epoll_ctl); @@ -87,6 +85,9 @@ COND_SYSCALL_COMPAT(set_robust_list); COND_SYSCALL(get_robust_list); COND_SYSCALL_COMPAT(get_robust_list); COND_SYSCALL(futex_waitv); +COND_SYSCALL(futex_wake); +COND_SYSCALL(futex_wait); +COND_SYSCALL(futex_requeue); COND_SYSCALL(kexec_load); COND_SYSCALL_COMPAT(kexec_load); COND_SYSCALL(init_module); @@ -170,6 +171,9 @@ COND_SYSCALL(landlock_add_rule); COND_SYSCALL(landlock_restrict_self); COND_SYSCALL(fadvise64_64); COND_SYSCALL_COMPAT(fadvise64_64); +COND_SYSCALL(lsm_get_self_attr); +COND_SYSCALL(lsm_set_self_attr); +COND_SYSCALL(lsm_list_modules); /* CONFIG_MMU only */ COND_SYSCALL(swapon); @@ -200,6 +204,20 @@ COND_SYSCALL(recvmmsg_time32); COND_SYSCALL_COMPAT(recvmmsg_time32); COND_SYSCALL_COMPAT(recvmmsg_time64); +/* Posix timer syscalls may be configured out */ +COND_SYSCALL(timer_create); +COND_SYSCALL(timer_gettime); +COND_SYSCALL(timer_getoverrun); +COND_SYSCALL(timer_settime); +COND_SYSCALL(timer_delete); +COND_SYSCALL(clock_adjtime); +COND_SYSCALL(getitimer); +COND_SYSCALL(setitimer); +COND_SYSCALL(alarm); +COND_SYSCALL_COMPAT(timer_create); +COND_SYSCALL_COMPAT(getitimer); +COND_SYSCALL_COMPAT(setitimer); + /* * Architecture specific syscalls: see further below */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 354a2d294f52..157f7ce2942d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1939,15 +1939,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_IA64 - { - .procname = "unaligned-dump-stack", - .data = &unaligned_dump_stack, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_RT_MUTEXES { .procname = "max_lock_depth", @@ -1983,7 +1974,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, - .proc_handler = perf_proc_update_handler, + .proc_handler = perf_event_max_sample_rate_handler, .extra1 = SYSCTL_ONE, }, { diff --git a/kernel/task_work.c b/kernel/task_work.c index 065e1ef8fc8d..95a7e1b7f1da 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -78,6 +78,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, * task_work_cancel_match - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @match: match function to call + * @data: data to be passed in to match function * * RETURNS: * The found work or NULL if not found. diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8ce3fa0c19e2..4354ea231fab 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -233,9 +233,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) else memset(stats, 0, sizeof(*stats)); - tsk = first; start_time = ktime_get_ns(); - do { + for_each_thread(first, tsk) { if (tsk->exit_state) continue; /* @@ -258,7 +257,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; - } while_each_thread(first, tsk); + } unlock_task_sighand(first, &flags); rc = 0; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8d9f13d847f0..4657cb8e8b1f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -290,6 +290,17 @@ static int alarmtimer_suspend(struct device *dev) rtc_timer_cancel(rtc, &rtctimer); rtc_read_time(rtc, &tm); now = rtc_tm_to_ktime(tm); + + /* + * If the RTC alarm timer only supports a limited time offset, set the + * alarm time to the maximum supported value. + * The system may wake up earlier (possibly much earlier) than expected + * when the alarmtimer runs. This is the best the kernel can do if + * the alarmtimer exceeds the time that the rtc device can be programmed + * for. + */ + min = rtc_bound_alarmtime(rtc, min); + now = ktime_add(now, min); /* Set alarm, if in the past reject suspend briefly to handle */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c108ed8a9804..3052b1f1168e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -99,6 +99,7 @@ static u64 suspend_start; * Interval: 0.5sec. */ #define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ)) /* * Threshold: 0.0312s, when doubled: 0.0625s. @@ -134,6 +135,7 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; +static int64_t watchdog_max_interval; static inline void clocksource_watchdog_lock(unsigned long *flags) { @@ -399,8 +401,8 @@ static inline void clocksource_reset_watchdog(void) static void clocksource_watchdog(struct timer_list *unused) { u64 csnow, wdnow, cslast, wdlast, delta; + int64_t wd_nsec, cs_nsec, interval; int next_cpu, reset_pending; - int64_t wd_nsec, cs_nsec; struct clocksource *cs; enum wd_read_status read_ret; unsigned long extra_wait = 0; @@ -470,6 +472,27 @@ static void clocksource_watchdog(struct timer_list *unused) if (atomic_read(&watchdog_reset_pending)) continue; + /* + * The processing of timer softirqs can get delayed (usually + * on account of ksoftirqd not getting to run in a timely + * manner), which causes the watchdog interval to stretch. + * Skew detection may fail for longer watchdog intervals + * on account of fixed margins being used. + * Some clocksources, e.g. acpi_pm, cannot tolerate + * watchdog intervals longer than a few seconds. + */ + interval = max(cs_nsec, wd_nsec); + if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) { + if (system_state > SYSTEM_SCHEDULING && + interval > 2 * watchdog_max_interval) { + watchdog_max_interval = interval; + pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n", + cs_nsec, wd_nsec); + } + watchdog_timer.expires = jiffies; + continue; + } + /* Check the deviation from the watchdog clocksource. */ md = cs->uncertainty_margin + watchdog->uncertainty_margin; if (abs(cs_nsec - wd_nsec) > md) { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 238262e4aba7..edb0f821dcea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, enum hrtimer_mode mode) { debug_activate(timer, mode); + WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; @@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->online = 1; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } @@ -2219,29 +2221,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } -int hrtimers_dead_cpu(unsigned int scpu) +int hrtimers_cpu_dying(unsigned int dying_cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int i, ncpu = cpumask_first(cpu_active_mask); - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); + tick_cancel_sched_timer(dying_cpu); + + old_base = this_cpu_ptr(&hrtimer_bases); + new_base = &per_cpu(hrtimer_bases, ncpu); - /* - * this BH disable ensures that raise_softirq_irqoff() does - * not wakeup ksoftirqd (and acquire the pi-lock) while - * holding the cpu_base lock - */ - local_bh_disable(); - local_irq_disable(); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = this_cpu_ptr(&hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - raw_spin_lock(&new_base->lock); - raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&old_base->lock); + raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], @@ -2252,15 +2247,14 @@ int hrtimers_dead_cpu(unsigned int scpu) * The migration might have changed the first expiring softirq * timer on this CPU. Update it. */ - hrtimer_update_softirq_timer(new_base, false); + __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); + /* Tell the other CPU to retrigger the next event */ + smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); - raw_spin_unlock(&old_base->lock); raw_spin_unlock(&new_base->lock); + old_base->online = 0; + raw_spin_unlock(&old_base->lock); - /* Check, if we got expired work to do */ - __hrtimer_peek_ahead_timers(); - local_irq_enable(); - local_bh_enable(); return 0; } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 77c0c2370b6d..9de66bbbb3d1 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -19,7 +19,8 @@ */ static struct posix_clock *get_posix_clock(struct file *fp) { - struct posix_clock *clk = fp->private_data; + struct posix_clock_context *pccontext = fp->private_data; + struct posix_clock *clk = pccontext->clk; down_read(&clk->rwsem); @@ -39,6 +40,7 @@ static void put_posix_clock(struct posix_clock *clk) static ssize_t posix_clock_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) { + struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); int err = -EINVAL; @@ -46,7 +48,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, return -ENODEV; if (clk->ops.read) - err = clk->ops.read(clk, fp->f_flags, buf, count); + err = clk->ops.read(pccontext, fp->f_flags, buf, count); put_posix_clock(clk); @@ -55,6 +57,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) { + struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); __poll_t result = 0; @@ -62,7 +65,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) return EPOLLERR; if (clk->ops.poll) - result = clk->ops.poll(clk, fp, wait); + result = clk->ops.poll(pccontext, fp, wait); put_posix_clock(clk); @@ -72,6 +75,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) static long posix_clock_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { + struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); int err = -ENOTTY; @@ -79,7 +83,7 @@ static long posix_clock_ioctl(struct file *fp, return -ENODEV; if (clk->ops.ioctl) - err = clk->ops.ioctl(clk, cmd, arg); + err = clk->ops.ioctl(pccontext, cmd, arg); put_posix_clock(clk); @@ -90,6 +94,7 @@ static long posix_clock_ioctl(struct file *fp, static long posix_clock_compat_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { + struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); int err = -ENOTTY; @@ -97,7 +102,7 @@ static long posix_clock_compat_ioctl(struct file *fp, return -ENODEV; if (clk->ops.ioctl) - err = clk->ops.ioctl(clk, cmd, arg); + err = clk->ops.ioctl(pccontext, cmd, arg); put_posix_clock(clk); @@ -110,6 +115,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) int err; struct posix_clock *clk = container_of(inode->i_cdev, struct posix_clock, cdev); + struct posix_clock_context *pccontext; down_read(&clk->rwsem); @@ -117,14 +123,20 @@ static int posix_clock_open(struct inode *inode, struct file *fp) err = -ENODEV; goto out; } + pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL); + if (!pccontext) { + err = -ENOMEM; + goto out; + } + pccontext->clk = clk; + fp->private_data = pccontext; if (clk->ops.open) - err = clk->ops.open(clk, fp->f_mode); + err = clk->ops.open(pccontext, fp->f_mode); else err = 0; if (!err) { get_device(clk->dev); - fp->private_data = clk; } out: up_read(&clk->rwsem); @@ -133,14 +145,20 @@ out: static int posix_clock_release(struct inode *inode, struct file *fp) { - struct posix_clock *clk = fp->private_data; + struct posix_clock_context *pccontext = fp->private_data; + struct posix_clock *clk; int err = 0; + if (!pccontext) + return -ENODEV; + clk = pccontext->clk; + if (clk->ops.release) - err = clk->ops.release(clk); + err = clk->ops.release(pccontext); put_device(clk->dev); + kfree(pccontext); fp->private_data = NULL; return err; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 828aeecbd1e8..9b6fcb8d85e7 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -17,40 +17,6 @@ #include <linux/time_namespace.h> #include <linux/compat.h> -#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER -/* Architectures may override SYS_NI and COMPAT_SYS_NI */ -#include <asm/syscall_wrapper.h> -#endif - -asmlinkage long sys_ni_posix_timers(void) -{ - pr_err_once("process %d (%s) attempted a POSIX timer syscall " - "while CONFIG_POSIX_TIMERS is not set\n", - current->pid, current->comm); - return -ENOSYS; -} - -#ifndef SYS_NI -#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) -#endif - -#ifndef COMPAT_SYS_NI -#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) -#endif - -SYS_NI(timer_create); -SYS_NI(timer_gettime); -SYS_NI(timer_getoverrun); -SYS_NI(timer_settime); -SYS_NI(timer_delete); -SYS_NI(clock_adjtime); -SYS_NI(getitimer); -SYS_NI(setitimer); -SYS_NI(clock_adjtime32); -#ifdef __ARCH_WANT_SYS_ALARM -SYS_NI(alarm); -#endif - /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC * as it is easy to remain compatible with little code. CLOCK_BOOTTIME @@ -158,18 +124,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, which_clock); } -#ifdef CONFIG_COMPAT -COMPAT_SYS_NI(timer_create); -#endif - -#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) -COMPAT_SYS_NI(getitimer); -COMPAT_SYS_NI(setitimer); -#endif - #ifdef CONFIG_COMPAT_32BIT_TIME -SYS_NI(timer_settime32); -SYS_NI(timer_gettime32); SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock, struct old_timespec32 __user *, tp) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 649f2b48e8f0..481b7ab65e2c 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -56,7 +56,6 @@ extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force); extern void clockevents_handle_noop(struct clock_event_device *dev); extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); -extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); /* Broadcasting support */ # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST @@ -197,3 +196,5 @@ void hrtimers_resume_local(void); #else #define JIFFIES_SHIFT 8 #endif + +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 87015e9deacc..01fb50c1b17e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -4,7 +4,7 @@ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * - * No idle tick implementation for low and high resolution timers + * NOHZ implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ @@ -45,7 +45,7 @@ struct tick_sched *tick_get_tick_sched(int cpu) #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) /* - * The time, when the last jiffy update happened. Write access must hold + * The time when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ @@ -60,13 +60,13 @@ static void tick_do_update_jiffies64(ktime_t now) ktime_t delta, nextp; /* - * 64bit can do a quick check without holding jiffies lock and + * 64-bit can do a quick check without holding the jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * - * 32bit cannot do that because the store of tick_next_period - * consists of two 32bit stores and the first store could move it - * to a random point in the future. + * 32-bit cannot do that because the store of 'tick_next_period' + * consists of two 32-bit stores, and the first store could be + * moved by the CPU to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) @@ -75,7 +75,7 @@ static void tick_do_update_jiffies64(ktime_t now) unsigned int seq; /* - * Avoid contention on jiffies_lock and protect the quick + * Avoid contention on 'jiffies_lock' and protect the quick * check with the sequence count. */ do { @@ -90,7 +90,7 @@ static void tick_do_update_jiffies64(ktime_t now) /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* - * Reevaluate with the lock held. Another CPU might have done the + * Re-evaluate with the lock held. Another CPU might have done the * update already. */ if (ktime_before(now, tick_next_period)) { @@ -114,25 +114,23 @@ static void tick_do_update_jiffies64(ktime_t now) TICK_NSEC); } - /* Advance jiffies to complete the jiffies_seq protected job */ + /* Advance jiffies to complete the 'jiffies_seq' protected job */ jiffies_64 += ticks; - /* - * Keep the tick_next_period variable up to date. - */ + /* Keep the tick_next_period variable up to date */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick - * check above and ensures that the update to jiffies_64 is - * not reordered vs. the store to tick_next_period, neither + * check above, and ensures that the update to 'jiffies_64' is + * not reordered vs. the store to 'tick_next_period', neither * by the compiler nor by the CPU. */ smp_store_release(&tick_next_period, nextp); } else { /* - * A plain store is good enough on 32bit as the quick check + * A plain store is good enough on 32-bit, as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; @@ -140,7 +138,7 @@ static void tick_do_update_jiffies64(ktime_t now) /* * Release the sequence count. calc_global_load() below is not - * protected by it, but jiffies_lock needs to be held to prevent + * protected by it, but 'jiffies_lock' needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); @@ -160,7 +158,8 @@ static ktime_t tick_init_jiffy_update(void) raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); - /* Did we start the jiffies update yet ? */ + + /* Have we started the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; @@ -175,8 +174,10 @@ static ktime_t tick_init_jiffy_update(void) last_jiffies_update = tick_next_period; } period = last_jiffies_update; + write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); + return period; } @@ -192,10 +193,10 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by - * jiffies_lock. + * 'jiffies_lock'. * * If nohz_full is enabled, this should not happen because the - * tick_do_timer_cpu never relinquishes. + * 'tick_do_timer_cpu' CPU never relinquishes. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL @@ -205,12 +206,12 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) } #endif - /* Check, if the jiffies need an update */ + /* Check if jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); /* - * If jiffies update stalled for too long (timekeeper in stop_machine() + * If the jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { @@ -234,10 +235,10 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long - * time. This happens on complete idle SMP systems while + * time. This happens on completely idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do - * when we go busy again does not account too much ticks. + * when we go busy again does not account too many ticks. */ if (ts->tick_stopped) { touch_softlockup_watchdog_sched(); @@ -362,7 +363,7 @@ static void tick_nohz_kick_task(struct task_struct *tsk) /* * If the task is not running, run_posix_cpu_timers() - * has nothing to elapse, IPI can then be spared. + * has nothing to elapse, and an IPI can then be optimized out. * * activate_task() STORE p->tick_dep_mask * STORE p->on_rq @@ -425,7 +426,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep, /* * Set a global tick dependency. Used by perf events that rely on freq and - * by unstable clock. + * unstable clocks. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { @@ -439,7 +440,7 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit) /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to - * manage events throttling. + * manage event-throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { @@ -455,7 +456,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { - /* Remote irq work not NMI-safe */ + /* Remote IRQ work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } @@ -473,7 +474,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* - * Set a per-task tick dependency. RCU need this. Also posix CPU timers + * Set a per-task tick dependency. RCU needs this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) @@ -546,7 +547,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* - * The tick_do_timer_cpu CPU handles housekeeping duty (unbound + * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ @@ -568,12 +569,12 @@ void __init tick_nohz_init(void) return; /* - * Full dynticks uses irq work to drive the tick rescheduling on safe - * locking contexts. But then we need irq work to raise its own - * interrupts to avoid circular dependency on the tick + * Full dynticks uses IRQ work to drive the tick rescheduling on safe + * locking contexts. But then we need IRQ work to raise its own + * interrupts to avoid circular dependency on the tick. */ if (!arch_irq_work_has_interrupt()) { - pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; @@ -643,7 +644,7 @@ bool tick_nohz_tick_stopped_cpu(int cpu) * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the - * CPU, which has the update task assigned is in a long sleep. + * CPU, which has the update task assigned, is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { @@ -726,7 +727,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, * counters if NULL. * * Return the cumulative idle time (since boot) for a given - * CPU, in microseconds. Note this is partially broken due to + * CPU, in microseconds. Note that this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. @@ -787,7 +788,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } /* - * Reset to make sure next tick stop doesn't get fooled by past + * Reset to make sure the next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; @@ -816,11 +817,11 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. - * Aside of that check whether the local timer softirq is - * pending. If so its a bad idea to call get_next_timer_interrupt() + * Aside of that, check whether the local timer softirq is + * pending. If so, its a bad idea to call get_next_timer_interrupt(), * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a - * minimal delta which brings us back to this place + * minimal delta, which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu() || arch_needs_cpu() || @@ -838,6 +839,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) ts->next_timer = next_tick; } + /* Make sure next_tick is never before basemono! */ + if (WARN_ON_ONCE(basemono > next_tick)) + next_tick = basemono; + /* * If the tick is due in the next period, keep it ticking or * force prod the timer. @@ -861,7 +866,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) /* * If this CPU is the one which had the do_timer() duty last, we limit - * the sleep time to the timekeeping max_deferment value. + * the sleep time to the timekeeping 'max_deferment' value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); @@ -886,7 +891,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono = ts->timer_expires_base; u64 expires = ts->timer_expires; - ktime_t tick = expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; @@ -895,8 +899,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we - * don't drop this here the jiffies might be stale and - * do_timer() never invoked. Keep track of the fact that it + * don't drop this here, the jiffies might be stale and + * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ if (cpu == tick_do_timer_cpu) { @@ -906,10 +910,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) ts->do_timer_last = 0; } - /* Skip reprogram of event if its not changed */ + /* Skip reprogram of event if it's not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ - if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) + if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ON_ONCE(1); @@ -919,11 +923,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) } /* - * nohz_stop_sched_tick can be called several times before - * the nohz_restart_sched_tick is called. This happens when - * interrupts arrive which do not cause a reschedule. In the - * first call we save the current tick time, so we can restart - * the scheduler tick in nohz_restart_sched_tick. + * tick_nohz_stop_tick() can be called several times before + * tick_nohz_restart_sched_tick() is called. This happens when + * interrupts arrive which do not cause a reschedule. In the first + * call we save the current tick time, so we can restart the + * scheduler tick in tick_nohz_restart_sched_tick(). */ if (!ts->tick_stopped) { calc_load_nohz_start(); @@ -934,7 +938,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) trace_tick_stop(1, TICK_DEP_MASK_NONE); } - ts->next_tick = tick; + ts->next_tick = expires; /* * If the expiration time == KTIME_MAX, then we simply stop @@ -949,11 +953,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, tick, + hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { - hrtimer_set_expires(&ts->sched_timer, tick); - tick_program_event(tick, 1); + hrtimer_set_expires(&ts->sched_timer, expires); + tick_program_event(expires, 1); } } @@ -985,9 +989,8 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); - /* - * Cancel the scheduled timer and restore the tick - */ + + /* Cancel the scheduled timer and restore the tick: */ ts->tick_stopped = 0; tick_nohz_restart(ts, now); } @@ -1019,11 +1022,11 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) /* * A pending softirq outside an IRQ (or softirq disabled section) context * should be waiting for ksoftirqd to handle it. Therefore we shouldn't - * reach here due to the need_resched() early check in can_stop_idle_tick(). + * reach this code due to the need_resched() early check in can_stop_idle_tick(). * * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, - * triggering the below since wakep_softirqd() is ignored. + * triggering the code below, since wakep_softirqd() is ignored. * */ static bool report_idle_softirq(void) @@ -1044,7 +1047,7 @@ static bool report_idle_softirq(void) if (ratelimit >= 10) return false; - /* On RT, softirqs handling may be waiting on some lock */ + /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; @@ -1061,8 +1064,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) * If this CPU is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by * the CPU which runs the tick timer next. If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. + * this here, the jiffies might be stale and do_timer() never + * gets invoked. */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) @@ -1175,12 +1178,23 @@ void tick_nohz_idle_enter(void) } /** - * tick_nohz_irq_exit - update next tick event from interrupt exit + * tick_nohz_irq_exit - Notify the tick about IRQ exit + * + * A timer may have been added/modified/deleted either by the current IRQ, + * or by another place using this IRQ as a notification. This IRQ may have + * also updated the RCU callback list. These events may require a + * re-evaluation of the next tick. Depending on the context: + * + * 1) If the CPU is idle and no resched is pending, just proceed with idle + * time accounting. The next tick will be re-evaluated on the next idle + * loop iteration. * - * When an interrupt fires while we are idle and it doesn't cause - * a reschedule, it may still add, modify or delete a timer, enqueue - * an RCU callback, etc... - * So we need to re-calculate and reprogram the next tick event. + * 2) If the CPU is nohz_full: + * + * 2.1) If there is any tick dependency, restart the tick if stopped. + * + * 2.2) If there is no tick dependency, (re-)evaluate the next tick and + * stop/update it accordingly. */ void tick_nohz_irq_exit(void) { @@ -1208,7 +1222,7 @@ bool tick_nohz_idle_got_tick(void) /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer - * or the tick, whatever that expires first. Note that, if the tick has been + * or the tick, whichever expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled @@ -1252,7 +1266,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) return *delta_next; /* - * If the next highres timer to expire is earlier than next_event, the + * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ next_event = min_t(u64, next_event, @@ -1296,9 +1310,9 @@ static void tick_nohz_account_idle_time(struct tick_sched *ts, if (vtime_accounting_enabled_this_cpu()) return; /* - * We stopped the tick in idle. Update process times would miss the - * time we slept as update_process_times does only a 1 tick - * accounting. Enforce that this is accounted to idle ! + * We stopped the tick in idle. update_process_times() would miss the + * time we slept, as it does only a 1 tick accounting. + * Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* @@ -1330,11 +1344,20 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) } /** - * tick_nohz_idle_exit - restart the idle tick from the idle task + * tick_nohz_idle_exit - Update the tick upon idle task exit + * + * When the idle task exits, update the tick depending on the + * following situations: + * + * 1) If the CPU is not in nohz_full mode (most cases), then + * restart the tick. + * + * 2) If the CPU is in nohz_full mode (corner case): + * 2.1) If the tick can be kept stopped (no tick dependencies) + * then re-evaluate the next tick and try to keep it stopped + * as long as possible. + * 2.2) If the tick has dependencies, restart the tick. * - * Restart the idle tick when the CPU is woken up from idle - * This also exit the RCU extended quiescent state. The CPU - * can use RCU again after this function is called. */ void tick_nohz_idle_exit(void) { @@ -1364,9 +1387,15 @@ void tick_nohz_idle_exit(void) } /* - * The nohz low res interrupt handler + * In low-resolution mode, the tick handler must be implemented directly + * at the clockevent level. hrtimer can't be used instead, because its + * infrastructure actually relies on the tick itself as a backend in + * low-resolution mode (see hrtimer_run_queues()). + * + * This low-resolution handler still makes use of some hrtimer APIs meanwhile + * for convenience with expiration calculation and forwarding. */ -static void tick_nohz_handler(struct clock_event_device *dev) +static void tick_nohz_lowres_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); @@ -1377,18 +1406,16 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); - if (unlikely(ts->tick_stopped)) { - /* - * The clockevent device is not reprogrammed, so change the - * clock event device to ONESHOT_STOPPED to avoid spurious - * interrupts on devices which might not be truly one shot. - */ - tick_program_event(KTIME_MAX, 1); - return; + /* + * In dynticks mode, tick reprogram is deferred: + * - to the idle task if in dynticks-idle + * - to IRQ exit if in full-dynticks. + */ + if (likely(!ts->tick_stopped)) { + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } - hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); - tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) @@ -1402,7 +1429,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode) } /** - * tick_nohz_switch_to_nohz - switch to nohz mode + * tick_nohz_switch_to_nohz - switch to NOHZ mode */ static void tick_nohz_switch_to_nohz(void) { @@ -1412,12 +1439,12 @@ static void tick_nohz_switch_to_nohz(void) if (!tick_nohz_enabled) return; - if (tick_switch_to_oneshot(tick_nohz_handler)) + if (tick_switch_to_oneshot(tick_nohz_lowres_handler)) return; /* - * Recycle the hrtimer in ts, so we can share the - * hrtimer_forward with the highres code. + * Recycle the hrtimer in 'ts', so we can share the + * hrtimer_forward_now() function with the highres code. */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); /* Get the next period */ @@ -1440,7 +1467,7 @@ static inline void tick_nohz_irq_enter(void) if (ts->idle_active) tick_nohz_stop_idle(ts, now); /* - * If all CPUs are idle. We may need to update a stale jiffies value. + * If all CPUs are idle we may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a @@ -1459,7 +1486,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * Called from irq_enter to notify about the possible interruption of idle() + * Called from irq_enter() to notify about the possible interruption of idle() */ void tick_irq_enter(void) { @@ -1475,7 +1502,7 @@ void tick_irq_enter(void) * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ -static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) +static enum hrtimer_restart tick_nohz_highres_handler(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); @@ -1485,15 +1512,19 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) tick_sched_do_timer(ts, now); /* - * Do not call, when we are not in irq context and have - * no valid regs pointer + * Do not call when we are not in IRQ context and have + * no valid 'regs' pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; - /* No need to reprogram if we are in idle or full dynticks mode */ + /* + * In dynticks mode, tick reprogram is deferred: + * - to the idle task if in dynticks-idle + * - to IRQ exit if in full-dynticks. + */ if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; @@ -1520,16 +1551,14 @@ void tick_setup_sched_timer(void) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now = ktime_get(); - /* - * Emulate tick processing via per-CPU hrtimers: - */ + /* Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - ts->sched_timer.function = tick_sched_timer; + ts->sched_timer.function = tick_nohz_highres_handler; /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); - /* Offset the tick to avert jiffies_lock contention. */ + /* Offset the tick to avert 'jiffies_lock' contention. */ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); @@ -1547,13 +1576,23 @@ void tick_setup_sched_timer(void) void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t idle_sleeptime, iowait_sleeptime; + unsigned long idle_calls, idle_sleeps; # ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); # endif + idle_sleeptime = ts->idle_sleeptime; + iowait_sleeptime = ts->iowait_sleeptime; + idle_calls = ts->idle_calls; + idle_sleeps = ts->idle_sleeps; memset(ts, 0, sizeof(*ts)); + ts->idle_sleeptime = idle_sleeptime; + ts->iowait_sleeptime = iowait_sleeptime; + ts->idle_calls = idle_calls; + ts->idle_sleeps = idle_sleeps; } #endif @@ -1579,10 +1618,10 @@ void tick_oneshot_notify(void) } /* - * Check, if a change happened, which makes oneshot possible. + * Check if a change happened, which makes oneshot possible. * - * Called cyclic from the hrtimer softirq (driven by the timer - * softirq) allow_nohz signals, that we can switch into low-res nohz + * Called cyclically from the hrtimer softirq (driven by the timer + * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 63a8ce7177dd..352b161113cd 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -571,18 +571,15 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk, static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { - if (!is_timers_nohz_active()) - return; - /* - * TODO: This wants some optimizing similar to the code below, but we - * will do that when we switch from push to pull for deferrable timers. + * Deferrable timers do not prevent the CPU from entering dynticks and + * are not taken into account on the idle/nohz_full path. An IPI when a + * new deferrable timer is enqueued will wake up the remote CPU but + * nothing will be done with the deferrable timer base. Therefore skip + * the remote IPI for deferrable timers completely. */ - if (timer->flags & TIMER_DEFERRABLE) { - if (tick_nohz_full_cpu(base->cpu)) - wake_up_nohz_cpu(base->cpu); + if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE) return; - } /* * We might have to IPI the remote CPU if the base is idle and the @@ -606,7 +603,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); - trace_timer_start(timer, timer->expires, timer->flags); + trace_timer_start(timer, bucket_expiry); /* * Check whether this is the new first expiring timer. The @@ -942,31 +939,34 @@ get_target_base(struct timer_base *base, unsigned tflags) return get_timer_this_cpu_base(tflags); } -static inline void forward_timer_base(struct timer_base *base) +static inline void __forward_timer_base(struct timer_base *base, + unsigned long basej) { - unsigned long jnow = READ_ONCE(jiffies); - /* - * No need to forward if we are close enough below jiffies. - * Also while executing timers, base->clk is 1 offset ahead - * of jiffies to avoid endless requeuing to current jiffies. + * Check whether we can forward the base. We can only do that when + * @basej is past base->clk otherwise we might rewind base->clk. */ - if ((long)(jnow - base->clk) < 1) + if (time_before_eq(basej, base->clk)) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jnow)) { - base->clk = jnow; + if (time_after(base->next_expiry, basej)) { + base->clk = basej; } else { if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) return; base->clk = base->next_expiry; } + } +static inline void forward_timer_base(struct timer_base *base) +{ + __forward_timer_base(base, READ_ONCE(jiffies)); +} /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means @@ -1803,8 +1803,10 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset, /* * Search the first expiring timer in the various clock levels. Caller must * hold base->lock. + * + * Store next expiry time in base->next_expiry. */ -static unsigned long __next_timer_interrupt(struct timer_base *base) +static void next_expiry_recalc(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; @@ -1870,10 +1872,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) clk += adj; } + base->next_expiry = next; base->next_expiry_recalc = false; base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); - - return next; } #ifdef CONFIG_NO_HZ_COMMON @@ -1921,8 +1922,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA; u64 expires = KTIME_MAX; - unsigned long nextevt; + bool was_idle; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1933,37 +1935,44 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) raw_spin_lock(&base->lock); if (base->next_expiry_recalc) - base->next_expiry = __next_timer_interrupt(base); - nextevt = base->next_expiry; + next_expiry_recalc(base); /* * We have a fresh next event. Check whether we can forward the - * base. We can only do that when @basej is past base->clk - * otherwise we might rewind base->clk. + * base. */ - if (time_after(basej, base->clk)) { - if (time_after(nextevt, basej)) - base->clk = basej; - else if (time_after(nextevt, base->clk)) - base->clk = nextevt; - } + __forward_timer_base(base, basej); - if (time_before_eq(nextevt, basej)) { - expires = basem; - base->is_idle = false; + if (base->timers_pending) { + nextevt = base->next_expiry; + + /* If we missed a tick already, force 0 delta */ + if (time_before(nextevt, basej)) + nextevt = basej; + expires = basem + (u64)(nextevt - basej) * TICK_NSEC; } else { - if (base->timers_pending) - expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* - * If we expect to sleep more than a tick, mark the base idle. - * Also the tick is stopped so any added timer must forward - * the base clk itself to keep granularity small. This idle - * logic is only maintained for the BASE_STD base, deferrable - * timers may still see large granularity skew (by design). + * Move next_expiry for the empty base into the future to + * prevent a unnecessary raise of the timer softirq when the + * next_expiry value will be reached even if there is no timer + * pending. */ - if ((expires - basem) > TICK_NSEC) - base->is_idle = true; + base->next_expiry = nextevt; } + + /* + * Base is idle if the next event is more than a tick away. + * + * If the base is marked idle then any timer add operation must forward + * the base clk itself to keep granularity small. This idle logic is + * only maintained for the BASE_STD base, deferrable timers may still + * see large granularity skew (by design). + */ + was_idle = base->is_idle; + base->is_idle = time_after(nextevt, basej + 1); + if (was_idle != base->is_idle) + trace_timer_base_idle(base->is_idle, base->cpu); + raw_spin_unlock(&base->lock); return cmp_next_hrtimer_event(basem, expires); @@ -1984,7 +1993,10 @@ void timer_clear_idle(void) * sending the IPI a few instructions smaller for the cost of taking * the lock in the exit from idle path. */ - base->is_idle = false; + if (base->is_idle) { + base->is_idle = false; + trace_timer_base_idle(false, smp_processor_id()); + } } #endif @@ -2015,8 +2027,12 @@ static inline void __run_timers(struct timer_base *base) */ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); + /* + * While executing timers, base->clk is set 1 offset ahead of + * jiffies to avoid endless requeuing to current jiffies. + */ base->clk++; - base->next_expiry = __next_timer_interrupt(base); + next_expiry_recalc(base); while (levels--) expire_timers(base, heads + levels); diff --git a/kernel/torture.c b/kernel/torture.c index b28b05bbef02..c72ab2d251f4 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -87,14 +87,15 @@ EXPORT_SYMBOL_GPL(verbose_torout_sleep); * nanosecond random fuzz. This function and its friends desynchronize * testing from the timer wheel. */ -int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp) +int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode, + struct torture_random_state *trsp) { ktime_t hto = baset_ns; if (trsp) hto += torture_random(trsp) % fuzzt_ns; set_current_state(TASK_IDLE); - return schedule_hrtimeout(&hto, HRTIMER_MODE_REL); + return schedule_hrtimeout(&hto, mode); } EXPORT_SYMBOL_GPL(torture_hrtimeout_ns); @@ -106,7 +107,7 @@ int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state { ktime_t baset_ns = baset_us * NSEC_PER_USEC; - return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_us); @@ -123,7 +124,7 @@ int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state fuzzt_ns = (u32)~0U; else fuzzt_ns = fuzzt_us * NSEC_PER_USEC; - return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_ms); @@ -136,7 +137,7 @@ int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp) { ktime_t baset_ns = jiffies_to_nsecs(baset_j); - return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp); + return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies); @@ -153,7 +154,7 @@ int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state * fuzzt_ns = (u32)~0U; else fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC; - return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_s); @@ -520,9 +521,8 @@ static void torture_shuffle_task_unregister_all(void) * A special case is when shuffle_idle_cpu = -1, in which case we allow * the tasks to run on all CPUs. */ -static void torture_shuffle_tasks(void) +static void torture_shuffle_tasks(struct torture_random_state *trp) { - DEFINE_TORTURE_RANDOM(rand); struct shuffle_task *stp; cpumask_setall(shuffle_tmp_mask); @@ -543,7 +543,7 @@ static void torture_shuffle_tasks(void) mutex_lock(&shuffle_task_mutex); list_for_each_entry(stp, &shuffle_task_list, st_l) { - if (!random_shuffle || torture_random(&rand) & 0x1) + if (!random_shuffle || torture_random(trp) & 0x1) set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); } mutex_unlock(&shuffle_task_mutex); @@ -562,7 +562,7 @@ static int torture_shuffle(void *arg) VERBOSE_TOROUT_STRING("torture_shuffle task started"); do { torture_hrtimeout_jiffies(shuffle_interval, &rand); - torture_shuffle_tasks(); + torture_shuffle_tasks(&rand); torture_shutdown_absorb("torture_shuffle"); } while (!torture_must_stop()); torture_kthread_stopping("torture_shuffle"); @@ -673,7 +673,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) if (ssecs > 0) { shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0)); return torture_create_kthread(torture_shutdown, NULL, - shutdown_task); + shutdown_task); } return 0; } @@ -720,7 +720,7 @@ static void torture_shutdown_cleanup(void) * suddenly applied to or removed from the system. */ static struct task_struct *stutter_task; -static int stutter_pause_test; +static ktime_t stutter_till_abs_time; static int stutter; static int stutter_gap; @@ -730,30 +730,16 @@ static int stutter_gap; */ bool stutter_wait(const char *title) { - unsigned int i = 0; bool ret = false; - int spt; + ktime_t till_ns; cond_resched_tasks_rcu_qs(); - spt = READ_ONCE(stutter_pause_test); - for (; spt; spt = READ_ONCE(stutter_pause_test)) { - if (!ret && !rt_task(current)) { - sched_set_normal(current, MAX_NICE); - ret = true; - } - if (spt == 1) { - torture_hrtimeout_jiffies(1, NULL); - } else if (spt == 2) { - while (READ_ONCE(stutter_pause_test)) { - if (!(i++ & 0xffff)) - torture_hrtimeout_us(10, 0, NULL); - cond_resched(); - } - } else { - torture_hrtimeout_jiffies(round_jiffies_relative(HZ), NULL); - } - torture_shutdown_absorb(title); + till_ns = READ_ONCE(stutter_till_abs_time); + if (till_ns && ktime_before(ktime_get(), till_ns)) { + torture_hrtimeout_ns(till_ns, 0, HRTIMER_MODE_ABS, NULL); + ret = true; } + torture_shutdown_absorb(title); return ret; } EXPORT_SYMBOL_GPL(stutter_wait); @@ -764,23 +750,16 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { - DEFINE_TORTURE_RANDOM(rand); - int wtime; + ktime_t till_ns; VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop() && stutter > 1) { - wtime = stutter; - if (stutter > 2) { - WRITE_ONCE(stutter_pause_test, 1); - wtime = stutter - 3; - torture_hrtimeout_jiffies(wtime, &rand); - wtime = 2; - } - WRITE_ONCE(stutter_pause_test, 2); - torture_hrtimeout_jiffies(wtime, NULL); + till_ns = ktime_add_ns(ktime_get(), + jiffies_to_nsecs(stutter)); + WRITE_ONCE(stutter_till_abs_time, till_ns); + torture_hrtimeout_jiffies(stutter - 1, NULL); } - WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) torture_hrtimeout_jiffies(stutter_gap, NULL); torture_shutdown_absorb("torture_stutter"); @@ -812,6 +791,13 @@ static void torture_stutter_cleanup(void) stutter_task = NULL; } +static void +torture_print_module_parms(void) +{ + pr_alert("torture module --- %s: disable_onoff_at_boot=%d ftrace_dump_at_shutdown=%d verbose_sleep_frequency=%d verbose_sleep_duration=%d random_shuffle=%d\n", + torture_type, disable_onoff_at_boot, ftrace_dump_at_shutdown, verbose_sleep_frequency, verbose_sleep_duration, random_shuffle); +} + /* * Initialize torture module. Please note that this is -not- invoked via * the usual module_init() mechanism, but rather by an explicit call from @@ -834,6 +820,7 @@ bool torture_init_begin(char *ttype, int v) torture_type = ttype; verbose = v; fullstop = FULLSTOP_DONTSTOP; + torture_print_module_parms(); return true; } EXPORT_SYMBOL_GPL(torture_init_begin); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a7264b2c17ad..7ac6c52b25eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -24,6 +24,7 @@ #include <linux/key.h> #include <linux/verification.h> #include <linux/namei.h> +#include <linux/fileattr.h> #include <net/bpf_sk_storage.h> @@ -41,6 +42,9 @@ #define bpf_event_rcu_dereference(p) \ rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) +#define MAX_UPROBE_MULTI_CNT (1U << 20) +#define MAX_KPROBE_MULTI_CNT (1U << 20) + #ifdef CONFIG_MODULES struct bpf_trace_module { struct module *module; @@ -117,6 +121,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * and don't send kprobe event into ring-buffer, * so return zero here */ + rcu_read_lock(); + bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array)); + rcu_read_unlock(); ret = 0; goto out; } @@ -1249,9 +1256,7 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { }; #ifdef CONFIG_KEYS -__diag_push(); -__diag_ignore_all("-Wmissing-prototypes", - "kfuncs which will be used in BPF programs"); +__bpf_kfunc_start_defs(); /** * bpf_lookup_user_key - lookup a key by its serial @@ -1375,6 +1380,8 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, struct bpf_dynptr_kern *sig_ptr, struct bpf_key *trusted_keyring) { + const void *data, *sig; + u32 data_len, sig_len; int ret; if (trusted_keyring->has_ref) { @@ -1391,17 +1398,19 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, return ret; } - return verify_pkcs7_signature(data_ptr->data, - __bpf_dynptr_size(data_ptr), - sig_ptr->data, - __bpf_dynptr_size(sig_ptr), + data_len = __bpf_dynptr_size(data_ptr); + data = __bpf_dynptr_data(data_ptr, data_len); + sig_len = __bpf_dynptr_size(sig_ptr); + sig = __bpf_dynptr_data(sig_ptr, sig_len); + + return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); } #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ -__diag_pop(); +__bpf_kfunc_end_defs(); BTF_SET8_START(key_sig_kfunc_set) BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) @@ -1426,6 +1435,72 @@ static int __init bpf_key_sig_kfuncs_init(void) late_initcall(bpf_key_sig_kfuncs_init); #endif /* CONFIG_KEYS */ +/* filesystem kfuncs */ +__bpf_kfunc_start_defs(); + +/** + * bpf_get_file_xattr - get xattr of a file + * @file: file to get xattr from + * @name__str: name of the xattr + * @value_ptr: output buffer of the xattr value + * + * Get xattr *name__str* of *file* and store the output in *value_ptr*. + * + * For security reasons, only *name__str* with prefix "user." is allowed. + * + * Return: 0 on success, a negative value on error. + */ +__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, + struct bpf_dynptr_kern *value_ptr) +{ + struct dentry *dentry; + u32 value_len; + void *value; + int ret; + + if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EPERM; + + value_len = __bpf_dynptr_size(value_ptr); + value = __bpf_dynptr_data_rw(value_ptr, value_len); + if (!value) + return -EINVAL; + + dentry = file_dentry(file); + ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ); + if (ret) + return ret; + return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len); +} + +__bpf_kfunc_end_defs(); + +BTF_SET8_START(fs_kfunc_set_ids) +BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_SET8_END(fs_kfunc_set_ids) + +static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id)) + return 0; + + /* Only allow to attach from LSM hooks, to avoid recursion */ + return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0; +} + +static const struct btf_kfunc_id_set bpf_fs_kfunc_set = { + .owner = THIS_MODULE, + .set = &fs_kfunc_set_ids, + .filter = bpf_get_file_xattr_filter, +}; + +static int __init bpf_fs_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set); +} + +late_initcall(bpf_fs_kfuncs_init); + static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -2384,7 +2459,8 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, - u64 *probe_offset, u64 *probe_addr) + u64 *probe_offset, u64 *probe_addr, + unsigned long *missed) { bool is_tracepoint, is_syscall_tp; struct bpf_prog *prog; @@ -2419,7 +2495,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, #ifdef CONFIG_KPROBE_EVENTS if (flags & TRACE_EVENT_FL_KPROBE) err = bpf_get_kprobe_info(event, fd_type, buf, - probe_offset, probe_addr, + probe_offset, probe_addr, missed, event->attr.type == PERF_TYPE_TRACEPOINT); #endif #ifdef CONFIG_UPROBE_EVENTS @@ -2614,6 +2690,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; info->kprobe_multi.flags = kmulti_link->flags; + info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) return 0; @@ -2710,6 +2787,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, int err; if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + bpf_prog_inc_misses_counter(link->link.prog); err = 0; goto out; } @@ -2853,6 +2931,17 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3 return arr.mods_cnt; } +static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) { + if (!within_error_injection_list(addrs[i])) + return -EINVAL; + } + return 0; +} + int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; @@ -2884,6 +2973,8 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr cnt = attr->link_create.kprobe_multi.cnt; if (!cnt) return -EINVAL; + if (cnt > MAX_KPROBE_MULTI_CNT) + return -E2BIG; size = cnt * sizeof(*addrs); addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); @@ -2930,6 +3021,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error; } + if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) { + err = -EINVAL; + goto error; + } + link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { err = -ENOMEM; @@ -3009,6 +3105,7 @@ struct bpf_uprobe_multi_link; struct bpf_uprobe { struct bpf_uprobe_multi_link *link; loff_t offset; + unsigned long ref_ctr_offset; u64 cookie; struct uprobe_consumer consumer; }; @@ -3017,6 +3114,7 @@ struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; + u32 flags; struct bpf_uprobe *uprobes; struct task_struct *task; }; @@ -3058,9 +3156,79 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) kfree(umulti_link); } +static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets); + u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies); + u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets); + u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path); + u32 upath_size = info->uprobe_multi.path_size; + struct bpf_uprobe_multi_link *umulti_link; + u32 ucount = info->uprobe_multi.count; + int err = 0, i; + long left; + + if (!upath ^ !upath_size) + return -EINVAL; + + if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount) + return -EINVAL; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + info->uprobe_multi.count = umulti_link->cnt; + info->uprobe_multi.flags = umulti_link->flags; + info->uprobe_multi.pid = umulti_link->task ? + task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; + + if (upath) { + char *p, *buf; + + upath_size = min_t(u32, upath_size, PATH_MAX); + + buf = kmalloc(upath_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = d_path(&umulti_link->path, buf, upath_size); + if (IS_ERR(p)) { + kfree(buf); + return PTR_ERR(p); + } + upath_size = buf + upath_size - p; + left = copy_to_user(upath, p, upath_size); + kfree(buf); + if (left) + return -EFAULT; + info->uprobe_multi.path_size = upath_size; + } + + if (!uoffsets && !ucookies && !uref_ctr_offsets) + return 0; + + if (ucount < umulti_link->cnt) + err = -ENOSPC; + else + ucount = umulti_link->cnt; + + for (i = 0; i < ucount; i++) { + if (uoffsets && + put_user(umulti_link->uprobes[i].offset, uoffsets + i)) + return -EFAULT; + if (uref_ctr_offsets && + put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) + return -EFAULT; + if (ucookies && + put_user(umulti_link->uprobes[i].cookie, ucookies + i)) + return -EFAULT; + } + + return err; +} + static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc = bpf_uprobe_multi_link_dealloc, + .fill_link_info = bpf_uprobe_multi_link_fill_link_info, }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, @@ -3148,7 +3316,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr { struct bpf_uprobe_multi_link *link = NULL; unsigned long __user *uref_ctr_offsets; - unsigned long *ref_ctr_offsets = NULL; struct bpf_link_primer link_primer; struct bpf_uprobe *uprobes = NULL; struct task_struct *task = NULL; @@ -3182,6 +3349,8 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!upath || !uoffsets || !cnt) return -EINVAL; + if (cnt > MAX_UPROBE_MULTI_CNT) + return -E2BIG; uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); @@ -3207,8 +3376,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_PID); rcu_read_unlock(); - if (!task) + if (!task) { + err = -ESRCH; goto error_path_put; + } } err = -ENOMEM; @@ -3219,22 +3390,20 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!uprobes || !link) goto error_free; - if (uref_ctr_offsets) { - ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL); - if (!ref_ctr_offsets) - goto error_free; - } - for (i = 0; i < cnt; i++) { - if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { + if (__get_user(uprobes[i].offset, uoffsets + i)) { err = -EFAULT; goto error_free; } - if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) { + if (uprobes[i].offset < 0) { + err = -EINVAL; + goto error_free; + } + if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) { err = -EFAULT; goto error_free; } - if (__get_user(uprobes[i].offset, uoffsets + i)) { + if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { err = -EFAULT; goto error_free; } @@ -3254,6 +3423,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->uprobes = uprobes; link->path = path; link->task = task; + link->flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, &bpf_uprobe_multi_link_lops, prog); @@ -3261,7 +3431,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr for (i = 0; i < cnt; i++) { err = uprobe_register_refctr(d_real_inode(link->path.dentry), uprobes[i].offset, - ref_ctr_offsets ? ref_ctr_offsets[i] : 0, + uprobes[i].ref_ctr_offset, &uprobes[i].consumer); if (err) { bpf_uprobe_unregister(&path, uprobes, i); @@ -3273,11 +3443,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (err) goto error_free; - kvfree(ref_ctr_offsets); return bpf_link_settle(&link_primer); error_free: - kvfree(ref_ctr_offsets); kvfree(uprobes); kfree(link); if (task) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 3b21f4063258..6cd2a4e3afb8 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -187,9 +187,9 @@ static void fprobe_init(struct fprobe *fp) static int fprobe_init_rethook(struct fprobe *fp, int num) { - int i, size; + int size; - if (num < 0) + if (num <= 0) return -EINVAL; if (!fp->exit_handler) { @@ -202,29 +202,21 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) size = fp->nr_maxactive; else size = num * num_possible_cpus() * 2; - if (size < 0) - return -E2BIG; - - fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); - if (!fp->rethook) - return -ENOMEM; - for (i = 0; i < size; i++) { - struct fprobe_rethook_node *node; - - node = kzalloc(sizeof(*node) + fp->entry_data_size, GFP_KERNEL); - if (!node) { - rethook_free(fp->rethook); - fp->rethook = NULL; - return -ENOMEM; - } - rethook_add_node(fp->rethook, &node->node); - } + if (size <= 0) + return -EINVAL; + + /* Initialize rethook */ + fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, + sizeof(struct fprobe_rethook_node), size); + if (IS_ERR(fp->rethook)) + return PTR_ERR(fp->rethook); + return 0; } static void fprobe_fail_cleanup(struct fprobe *fp) { - if (fp->rethook) { + if (!IS_ERR_OR_NULL(fp->rethook)) { /* Don't need to cleanup rethook->handler because this is not used. */ rethook_free(fp->rethook); fp->rethook = NULL; @@ -379,14 +371,14 @@ int unregister_fprobe(struct fprobe *fp) if (!fprobe_is_registered(fp)) return -EINVAL; - if (fp->rethook) + if (!IS_ERR_OR_NULL(fp->rethook)) rethook_stop(fp->rethook); ret = unregister_ftrace_function(&fp->ops); if (ret < 0) return ret; - if (fp->rethook) + if (!IS_ERR_OR_NULL(fp->rethook)) rethook_free(fp->rethook); ftrace_free_filter(&fp->ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8de8bec5f366..c060d5b47910 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1183,18 +1183,19 @@ static void __add_hash_entry(struct ftrace_hash *hash, hash->count++; } -static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +static struct ftrace_func_entry * +add_hash_entry(struct ftrace_hash *hash, unsigned long ip) { struct ftrace_func_entry *entry; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) - return -ENOMEM; + return NULL; entry->ip = ip; __add_hash_entry(hash, entry); - return 0; + return entry; } static void @@ -1349,7 +1350,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) struct ftrace_func_entry *entry; struct ftrace_hash *new_hash; int size; - int ret; int i; new_hash = alloc_ftrace_hash(size_bits); @@ -1366,8 +1366,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - ret = add_hash_entry(new_hash, entry->ip); - if (ret < 0) + if (add_hash_entry(new_hash, entry->ip) == NULL) goto free_hash; } } @@ -2536,7 +2535,7 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS /* Protected by rcu_tasks for reading, and direct_mutex for writing */ -static struct ftrace_hash *direct_functions = EMPTY_HASH; +static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); int ftrace_direct_func_count; @@ -2555,39 +2554,6 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) return entry->direct; } -static struct ftrace_func_entry* -ftrace_add_rec_direct(unsigned long ip, unsigned long addr, - struct ftrace_hash **free_hash) -{ - struct ftrace_func_entry *entry; - - if (ftrace_hash_empty(direct_functions) || - direct_functions->count > 2 * (1 << direct_functions->size_bits)) { - struct ftrace_hash *new_hash; - int size = ftrace_hash_empty(direct_functions) ? 0 : - direct_functions->count + 1; - - if (size < 32) - size = 32; - - new_hash = dup_hash(direct_functions, size); - if (!new_hash) - return NULL; - - *free_hash = direct_functions; - direct_functions = new_hash; - } - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return NULL; - - entry->ip = ip; - entry->direct = addr; - __add_hash_entry(direct_functions, entry); - return entry; -} - static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { @@ -4223,8 +4189,8 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) /* Do nothing if it exists */ if (entry) return 0; - - ret = add_hash_entry(hash, rec->ip); + if (add_hash_entry(hash, rec->ip) == NULL) + ret = -ENOMEM; } return ret; } @@ -5266,7 +5232,8 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return 0; } - return add_hash_entry(hash, ip); + entry = add_hash_entry(hash, ip); + return entry ? 0 : -ENOMEM; } static int @@ -5358,7 +5325,17 @@ static LIST_HEAD(ftrace_direct_funcs); static int register_ftrace_function_nolock(struct ftrace_ops *ops); +/* + * If there are multiple ftrace_ops, use SAVE_REGS by default, so that direct + * call will be jumped from ftrace_regs_caller. Only if the architecture does + * not support ftrace_regs_caller but direct_call, use SAVE_ARGS so that it + * jumps from ftrace_caller for multiple ftrace_ops. + */ +#ifndef HAVE_DYNAMIC_FTRACE_WITH_REGS #define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS) +#else +#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS) +#endif static int check_direct_multi(struct ftrace_ops *ops) { @@ -5410,7 +5387,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long */ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash, *free_hash = NULL; + struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL; struct ftrace_func_entry *entry, *new; int err = -EBUSY, size, i; @@ -5436,17 +5413,44 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) } } - /* ... and insert them to direct_functions hash. */ err = -ENOMEM; + + /* Make a copy hash to place the new and the old entries in */ + size = hash->count + direct_functions->count; + if (size > 32) + size = 32; + new_hash = alloc_ftrace_hash(fls(size)); + if (!new_hash) + goto out_unlock; + + /* Now copy over the existing direct entries */ + size = 1 << direct_functions->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) { + new = add_hash_entry(new_hash, entry->ip); + if (!new) + goto out_unlock; + new->direct = entry->direct; + } + } + + /* ... and add the new entries */ + size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - new = ftrace_add_rec_direct(entry->ip, addr, &free_hash); + new = add_hash_entry(new_hash, entry->ip); if (!new) - goto out_remove; + goto out_unlock; + /* Update both the copy and the hash entry */ + new->direct = addr; entry->direct = addr; } } + free_hash = direct_functions; + rcu_assign_pointer(direct_functions, new_hash); + new_hash = NULL; + ops->func = call_direct_funcs; ops->flags = MULTI_FLAGS; ops->trampoline = FTRACE_REGS_ADDR; @@ -5454,17 +5458,17 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) err = register_ftrace_function_nolock(ops); - out_remove: - if (err) - remove_direct_functions_hash(hash, addr); - out_unlock: mutex_unlock(&direct_mutex); - if (free_hash) { + if (free_hash && free_hash != EMPTY_HASH) { synchronize_rcu_tasks(); free_ftrace_hash(free_hash); } + + if (new_hash) + free_ftrace_hash(new_hash); + return err; } EXPORT_SYMBOL_GPL(register_ftrace_direct); @@ -6309,7 +6313,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) if (entry) continue; - if (add_hash_entry(hash, rec->ip) < 0) + if (add_hash_entry(hash, rec->ip) == NULL) goto out; } else { if (entry) { diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 5eb9b598f4e9..fa03094e9e69 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -8,7 +8,6 @@ #include <linux/preempt.h> #include <linux/rethook.h> #include <linux/slab.h> -#include <linux/sort.h> /* Return hook list (shadow stack by list) */ @@ -36,21 +35,7 @@ void rethook_flush_task(struct task_struct *tk) static void rethook_free_rcu(struct rcu_head *head) { struct rethook *rh = container_of(head, struct rethook, rcu); - struct rethook_node *rhn; - struct freelist_node *node; - int count = 1; - - node = rh->pool.head; - while (node) { - rhn = container_of(node, struct rethook_node, freelist); - node = node->next; - kfree(rhn); - count++; - } - - /* The rh->ref is the number of pooled node + 1 */ - if (refcount_sub_and_test(count, &rh->ref)) - kfree(rh); + objpool_fini(&rh->pool); } /** @@ -63,7 +48,7 @@ static void rethook_free_rcu(struct rcu_head *head) */ void rethook_stop(struct rethook *rh) { - WRITE_ONCE(rh->handler, NULL); + rcu_assign_pointer(rh->handler, NULL); } /** @@ -78,59 +63,73 @@ void rethook_stop(struct rethook *rh) */ void rethook_free(struct rethook *rh) { - WRITE_ONCE(rh->handler, NULL); + rethook_stop(rh); call_rcu(&rh->rcu, rethook_free_rcu); } +static int rethook_init_node(void *nod, void *context) +{ + struct rethook_node *node = nod; + + node->rethook = context; + return 0; +} + +static int rethook_fini_pool(struct objpool_head *head, void *context) +{ + kfree(context); + return 0; +} + +static inline rethook_handler_t rethook_get_handler(struct rethook *rh) +{ + return (rethook_handler_t)rcu_dereference_check(rh->handler, + rcu_read_lock_any_held()); +} + /** * rethook_alloc() - Allocate struct rethook. * @data: a data to pass the @handler when hooking the return. - * @handler: the return hook callback function. + * @handler: the return hook callback function, must NOT be NULL + * @size: node size: rethook node and additional data + * @num: number of rethook nodes to be preallocated * * Allocate and initialize a new rethook with @data and @handler. - * Return NULL if memory allocation fails or @handler is NULL. + * Return pointer of new rethook, or error codes for failures. + * * Note that @handler == NULL means this rethook is going to be freed. */ -struct rethook *rethook_alloc(void *data, rethook_handler_t handler) +struct rethook *rethook_alloc(void *data, rethook_handler_t handler, + int size, int num) { - struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); + struct rethook *rh; - if (!rh || !handler) { - kfree(rh); - return NULL; - } + if (!handler || num <= 0 || size < sizeof(struct rethook_node)) + return ERR_PTR(-EINVAL); + + rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); + if (!rh) + return ERR_PTR(-ENOMEM); rh->data = data; - rh->handler = handler; - rh->pool.head = NULL; - refcount_set(&rh->ref, 1); + rcu_assign_pointer(rh->handler, handler); + /* initialize the objpool for rethook nodes */ + if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh, + rethook_init_node, rethook_fini_pool)) { + kfree(rh); + return ERR_PTR(-ENOMEM); + } return rh; } -/** - * rethook_add_node() - Add a new node to the rethook. - * @rh: the struct rethook. - * @node: the struct rethook_node to be added. - * - * Add @node to @rh. User must allocate @node (as a part of user's - * data structure.) The @node fields are initialized in this function. - */ -void rethook_add_node(struct rethook *rh, struct rethook_node *node) -{ - node->rethook = rh; - freelist_add(&node->freelist, &rh->pool); - refcount_inc(&rh->ref); -} - static void free_rethook_node_rcu(struct rcu_head *head) { struct rethook_node *node = container_of(head, struct rethook_node, rcu); + struct rethook *rh = node->rethook; - if (refcount_dec_and_test(&node->rethook->ref)) - kfree(node->rethook); - kfree(node); + objpool_drop(node, &rh->pool); } /** @@ -142,10 +141,11 @@ static void free_rethook_node_rcu(struct rcu_head *head) */ void rethook_recycle(struct rethook_node *node) { - lockdep_assert_preemption_disabled(); + rethook_handler_t handler; - if (likely(READ_ONCE(node->rethook->handler))) - freelist_add(&node->freelist, &node->rethook->pool); + handler = rethook_get_handler(node->rethook); + if (likely(handler)) + objpool_push(node, &node->rethook->pool); else call_rcu(&node->rcu, free_rethook_node_rcu); } @@ -160,10 +160,7 @@ NOKPROBE_SYMBOL(rethook_recycle); */ struct rethook_node *rethook_try_get(struct rethook *rh) { - rethook_handler_t handler = READ_ONCE(rh->handler); - struct freelist_node *fn; - - lockdep_assert_preemption_disabled(); + rethook_handler_t handler = rethook_get_handler(rh); /* Check whether @rh is going to be freed. */ if (unlikely(!handler)) @@ -178,11 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!rcu_is_watching())) return NULL; - fn = freelist_try_get(&rh->pool); - if (!fn) - return NULL; - - return container_of(fn, struct rethook_node, freelist); + return (struct rethook_node *)objpool_pop(&rh->pool); } NOKPROBE_SYMBOL(rethook_try_get); @@ -312,7 +305,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs, rhn = container_of(first, struct rethook_node, llist); if (WARN_ON_ONCE(rhn->frame != frame)) break; - handler = READ_ONCE(rhn->rethook->handler); + handler = rethook_get_handler(rhn->rethook); if (handler) handler(rhn, rhn->rethook->data, correct_ret_addr, regs); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 78502d4c7214..fd4bfe3ecf01 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -27,6 +27,7 @@ #include <linux/cpu.h> #include <linux/oom.h> +#include <asm/local64.h> #include <asm/local.h> /* @@ -317,6 +318,11 @@ struct buffer_data_page { unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ }; +struct buffer_data_read_page { + unsigned order; /* order of the page */ + struct buffer_data_page *data; /* actual data, stored in this page */ +}; + /* * Note, the buffer_page list must be first. The buffer pages * are allocated in cache lines, which means that each buffer @@ -331,6 +337,7 @@ struct buffer_page { unsigned read; /* index for next read */ local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ + unsigned order; /* order of the page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -354,9 +361,14 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) +{ + return local_read(&bpage->page->commit); +} + static void free_buffer_page(struct buffer_page *bpage) { - free_page((unsigned long)bpage->page); + free_pages((unsigned long)bpage->page, bpage->order); kfree(bpage); } @@ -368,41 +380,6 @@ static inline bool test_time_stamp(u64 delta) return !!(delta & TS_DELTA_TEST); } -#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) - -/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ -#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) - -int ring_buffer_print_page_header(struct trace_seq *s) -{ - struct buffer_data_page field; - - trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\tsigned:%u;\n", - (unsigned int)sizeof(field.time_stamp), - (unsigned int)is_signed_type(u64)); - - trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit), - (unsigned int)is_signed_type(long)); - - trace_seq_printf(s, "\tfield: int overwrite;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - 1, - (unsigned int)is_signed_type(long)); - - trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE, - (unsigned int)is_signed_type(char)); - - return !trace_seq_has_overflowed(s); -} - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; @@ -458,27 +435,9 @@ enum { RB_CTX_MAX }; -#if BITS_PER_LONG == 32 -#define RB_TIME_32 -#endif - -/* To test on 64 bit machines */ -//#define RB_TIME_32 - -#ifdef RB_TIME_32 - -struct rb_time_struct { - local_t cnt; - local_t top; - local_t bottom; - local_t msb; -}; -#else -#include <asm/local64.h> struct rb_time_struct { local64_t time; }; -#endif typedef struct rb_time_struct rb_time_t; #define MAX_NEST 5 @@ -552,6 +511,10 @@ struct trace_buffer { struct rb_irq_work irq_work; bool time_stamp_abs; + + unsigned int subbuf_size; + unsigned int subbuf_order; + unsigned int max_data_size; }; struct ring_buffer_iter { @@ -565,194 +528,49 @@ struct ring_buffer_iter { u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; + size_t event_size; int missed_events; }; -#ifdef RB_TIME_32 - -/* - * On 32 bit machines, local64_t is very expensive. As the ring - * buffer doesn't need all the features of a true 64 bit atomic, - * on 32 bit, it uses these functions (64 still uses local64_t). - * - * For the ring buffer, 64 bit required operations for the time is - * the following: - * - * - Reads may fail if it interrupted a modification of the time stamp. - * It will succeed if it did not interrupt another write even if - * the read itself is interrupted by a write. - * It returns whether it was successful or not. - * - * - Writes always succeed and will overwrite other writes and writes - * that were done by events interrupting the current write. - * - * - A write followed by a read of the same time stamp will always succeed, - * but may not contain the same value. - * - * - A cmpxchg will fail if it interrupted another write or cmpxchg. - * Other than that, it acts like a normal cmpxchg. - * - * The 60 bit time stamp is broken up by 30 bits in a top and bottom half - * (bottom being the least significant 30 bits of the 60 bit time stamp). - * - * The two most significant bits of each half holds a 2 bit counter (0-3). - * Each update will increment this counter by one. - * When reading the top and bottom, if the two counter bits match then the - * top and bottom together make a valid 60 bit number. - */ -#define RB_TIME_SHIFT 30 -#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) -#define RB_TIME_MSB_SHIFT 60 - -static inline int rb_time_cnt(unsigned long val) -{ - return (val >> RB_TIME_SHIFT) & 3; -} - -static inline u64 rb_time_val(unsigned long top, unsigned long bottom) -{ - u64 val; - - val = top & RB_TIME_VAL_MASK; - val <<= RB_TIME_SHIFT; - val |= bottom & RB_TIME_VAL_MASK; - - return val; -} - -static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) -{ - unsigned long top, bottom, msb; - unsigned long c; - - /* - * If the read is interrupted by a write, then the cnt will - * be different. Loop until both top and bottom have been read - * without interruption. - */ - do { - c = local_read(&t->cnt); - top = local_read(&t->top); - bottom = local_read(&t->bottom); - msb = local_read(&t->msb); - } while (c != local_read(&t->cnt)); - - *cnt = rb_time_cnt(top); - - /* If top and bottom counts don't match, this interrupted a write */ - if (*cnt != rb_time_cnt(bottom)) - return false; - - /* The shift to msb will lose its cnt bits */ - *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); - return true; -} - -static bool rb_time_read(rb_time_t *t, u64 *ret) -{ - unsigned long cnt; - - return __rb_time_read(t, ret, &cnt); -} - -static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt) -{ - return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); -} - -static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom, - unsigned long *msb) -{ - *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); - *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); - *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT); -} - -static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) -{ - val = rb_time_val_cnt(val, cnt); - local_set(t, val); -} - -static void rb_time_set(rb_time_t *t, u64 val) +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s) { - unsigned long cnt, top, bottom, msb; - - rb_time_split(val, &top, &bottom, &msb); - - /* Writes always succeed with a valid number even if it gets interrupted. */ - do { - cnt = local_inc_return(&t->cnt); - rb_time_val_set(&t->top, top, cnt); - rb_time_val_set(&t->bottom, bottom, cnt); - rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); - } while (cnt != local_read(&t->cnt)); -} - -static inline bool -rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) -{ - return local_try_cmpxchg(l, &expect, set); -} - -static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) -{ - unsigned long cnt, top, bottom, msb; - unsigned long cnt2, top2, bottom2, msb2; - u64 val; - - /* The cmpxchg always fails if it interrupted an update */ - if (!__rb_time_read(t, &val, &cnt2)) - return false; - - if (val != expect) - return false; + struct buffer_data_page field; - cnt = local_read(&t->cnt); - if ((cnt & 3) != cnt2) - return false; + trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); - cnt2 = cnt + 1; + trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); - rb_time_split(val, &top, &bottom, &msb); - top = rb_time_val_cnt(top, cnt); - bottom = rb_time_val_cnt(bottom, cnt); + trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); - rb_time_split(set, &top2, &bottom2, &msb2); - top2 = rb_time_val_cnt(top2, cnt2); - bottom2 = rb_time_val_cnt(bottom2, cnt2); + trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)buffer->subbuf_size, + (unsigned int)is_signed_type(char)); - if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) - return false; - if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) - return false; - if (!rb_time_read_cmpxchg(&t->top, top, top2)) - return false; - if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) - return false; - return true; + return !trace_seq_has_overflowed(s); } -#else /* 64 bits */ - -/* local64_t always succeeds */ - -static inline bool rb_time_read(rb_time_t *t, u64 *ret) +static inline void rb_time_read(rb_time_t *t, u64 *ret) { *ret = local64_read(&t->time); - return true; } static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } -static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) -{ - return local64_try_cmpxchg(&t->time, &expect, set); -} -#endif - /* * Enable this to make sure that the event passed to * ring_buffer_event_time_stamp() is not committed and also @@ -858,10 +676,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, WARN_ONCE(1, "nest (%d) greater than max", nest); fail: - /* Can only fail on 32 bit */ - if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) - /* Screw it, just read the current time */ - ts = rb_time_stamp(cpu_buffer->buffer); + rb_time_read(&cpu_buffer->write_stamp, &ts); return ts; } @@ -919,9 +734,14 @@ static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int f if (!nr_pages || !full) return true; - dirty = ring_buffer_nr_dirty_pages(buffer, cpu); + /* + * Add one as dirty will never equal nr_pages, as the sub-buffer + * that the writer is on is not counted as dirty. + * This is needed if "buffer_percent" is set to 100. + */ + dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1; - return (dirty * 100) > (full * nr_pages); + return (dirty * 100) >= (full * nr_pages); } /* @@ -982,7 +802,8 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) /* make sure the waiters see the new index */ smp_wmb(); - rb_wake_up_waiters(&rbwork->work); + /* This can be called in any context */ + irq_work_queue(&rbwork->work); } /** @@ -1123,7 +944,7 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return -EINVAL; + return EPOLLERR; cpu_buffer = buffer->buffers[cpu]; work = &cpu_buffer->irq_work; @@ -1132,6 +953,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, if (full) { poll_wait(filp, &work->full_waiters, poll_table); work->full_waiters_pending = true; + if (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full) + cpu_buffer->shortest_full = full; } else { poll_wait(filp, &work->waiters, poll_table); work->waiters_pending = true; @@ -1648,10 +1472,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0); + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, + cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; bpage->page = page_address(page); + bpage->order = cpu_buffer->buffer->subbuf_order; rb_init_page(bpage->page); if (user_thread && fatal_signal_pending(current)) @@ -1730,7 +1556,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; bpage->page = page_address(page); @@ -1779,6 +1606,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) free_buffer_page(bpage); } + free_page((unsigned long)cpu_buffer->free_page); + kfree(cpu_buffer); } @@ -1811,7 +1640,14 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + /* Default buffer page size - one system page */ + buffer->subbuf_order = 0; + buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE; + + /* Max payload is buffer page size - header (8bytes) */ + buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2); + + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; @@ -2003,7 +1839,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) * Increment overrun to account for the lost events. */ local_add(page_entries, &cpu_buffer->overrun); - local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); } @@ -2048,7 +1884,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) retries = 10; success = false; while (retries--) { - struct list_head *head_page, *prev_page, *r; + struct list_head *head_page, *prev_page; struct list_head *last_page, *first_page; struct list_head *head_page_with_bit; struct buffer_page *hpage = rb_set_head_page(cpu_buffer); @@ -2067,9 +1903,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) last_page->next = head_page_with_bit; first_page->prev = prev_page; - r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); - - if (r == head_page_with_bit) { + /* caution: head_page_with_bit gets updated on cmpxchg failure */ + if (try_cmpxchg(&prev_page->next, + &head_page_with_bit, first_page)) { /* * yay, we replaced the page pointer to our new list, * now, we just have to update to head page's prev @@ -2130,7 +1966,7 @@ static void update_pages_handler(struct work_struct *work) * @size: the new size. * @cpu_id: the cpu buffer to resize * - * Minimum size is 2 * BUF_PAGE_SIZE. + * Minimum size is 2 * buffer->subbuf_size. * * Returns 0 on success and < 0 on failure. */ @@ -2152,7 +1988,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return 0; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); /* we need a minimum of two pages */ if (nr_pages < 2) @@ -2198,6 +2034,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, err = -ENOMEM; goto out_err; } + + cond_resched(); } cpus_read_lock(); @@ -2365,11 +2203,6 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read); } -static __always_inline unsigned rb_page_commit(struct buffer_page *bpage) -{ - return local_read(&bpage->page->commit); -} - static struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { @@ -2388,6 +2221,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) */ commit = rb_page_commit(iter_head_page); smp_rmb(); + + /* An event needs to be at least 8 bytes in size */ + if (iter->head > commit - 8) + goto reset; + event = __rb_page_index(iter_head_page, iter->head); length = rb_event_length(event); @@ -2397,7 +2235,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) */ barrier(); - if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE) + if ((iter->head + length) > commit || length > iter->event_size) /* Writer corrupted the read? */ goto reset; @@ -2437,11 +2275,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) } static __always_inline unsigned -rb_event_index(struct ring_buffer_event *event) +rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; - return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; + addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1; + + return addr - BUF_PAGE_HDR_SIZE; } static void rb_inc_iter(struct ring_buffer_iter *iter) @@ -2510,7 +2350,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the counters. */ local_add(entries, &cpu_buffer->overrun); - local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); /* @@ -2630,6 +2470,7 @@ static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { + unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; unsigned long length = info->length; @@ -2638,13 +2479,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * Only the event that crossed the page boundary * must fill the old tail_page with padding. */ - if (tail >= BUF_PAGE_SIZE) { + if (tail >= bsize) { /* * If the page was filled, then we still need * to update the real_end. Reset it to zero * and the reader will ignore it. */ - if (tail == BUF_PAGE_SIZE) + if (tail == bsize) tail_page->real_end = 0; local_sub(length, &tail_page->write); @@ -2653,9 +2494,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); - /* account for padding bytes */ - local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); - /* * Save the original length to the meta data. * This will be used by the reader to add lost event @@ -2669,12 +2507,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * write counter enough to allow another writer to slip * in on this page. * We put in a discarded commit instead, to make sure - * that this space is not used again. + * that this space is not used again, and this space will + * not be accounted into 'entries_bytes'. * * If we are less than the minimum size, we don't need to * worry about it. */ - if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { + if (tail > (bsize - RB_EVNT_MIN_SIZE)) { /* No room for any events */ /* Mark the rest of the page with padding */ @@ -2689,16 +2528,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, } /* Put in a discarded event */ - event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; + event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; + /* account for padding bytes */ + local_add(bsize - tail, &cpu_buffer->entries_bytes); + /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); /* Set write to end of buffer */ - length = (tail + length) - BUF_PAGE_SIZE; + length = (tail + length) - bsize; local_sub(length, &tail_page->write); } @@ -2812,7 +2654,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Slow path */ static struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) +rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) event->type_len = RINGBUF_TYPE_TIME_STAMP; @@ -2820,7 +2663,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) event->type_len = RINGBUF_TYPE_TIME_EXTEND; /* Not the first event on the page, or not delta? */ - if (abs || rb_event_index(event)) { + if (abs || rb_event_index(cpu_buffer, event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { @@ -2850,7 +2693,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, (unsigned long long)info->ts, (unsigned long long)info->before, (unsigned long long)info->after, - (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), + (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" @@ -2894,7 +2737,7 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, if (!abs) info->delta = 0; } - *event = rb_add_time_stamp(*event, info->delta, abs); + *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs); *length -= RB_LEN_TIME_EXTEND; *delta = 0; } @@ -2970,25 +2813,6 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } -static u64 rb_time_delta(struct ring_buffer_event *event) -{ - switch (event->type_len) { - case RINGBUF_TYPE_PADDING: - return 0; - - case RINGBUF_TYPE_TIME_EXTEND: - return rb_event_time_stamp(event); - - case RINGBUF_TYPE_TIME_STAMP: - return 0; - - case RINGBUF_TYPE_DATA: - return event->time_delta; - default: - return 0; - } -} - static inline bool rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) @@ -2996,51 +2820,42 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long new_index, old_index; struct buffer_page *bpage; unsigned long addr; - u64 write_stamp; - u64 delta; - new_index = rb_event_index(event); + new_index = rb_event_index(cpu_buffer, event); old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; - addr &= PAGE_MASK; + addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); bpage = READ_ONCE(cpu_buffer->tail_page); - delta = rb_time_delta(event); - - if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp)) - return false; - - /* Make sure the write stamp is read before testing the location */ - barrier(); - + /* + * Make sure the tail_page is still the same and + * the next write location is the end of this event + */ if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); - /* Something came in, can't discard */ - if (!rb_time_cmpxchg(&cpu_buffer->write_stamp, - write_stamp, write_stamp - delta)) - return false; - /* - * It's possible that the event time delta is zero - * (has the same time stamp as the previous event) - * in which case write_stamp and before_stamp could - * be the same. In such a case, force before_stamp - * to be different than write_stamp. It doesn't - * matter what it is, as long as its different. + * For the before_stamp to be different than the write_stamp + * to make sure that the next event adds an absolute + * value and does not rely on the saved write stamp, which + * is now going to be bogus. + * + * By setting the before_stamp to zero, the next event + * is not going to use the write_stamp and will instead + * create an absolute timestamp. This means there's no + * reason to update the wirte_stamp! */ - if (!delta) - rb_time_set(&cpu_buffer->before_stamp, 0); + rb_time_set(&cpu_buffer->before_stamp, 0); /* * If an event were to come in now, it would see that the * write_stamp and the before_stamp are different, and assume * that this event just added itself before updating * the write stamp. The interrupting event will fix the - * write stamp for us, and use the before stamp as its delta. + * write stamp for us, and use an absolute timestamp. */ /* @@ -3396,6 +3211,76 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); #define CHECK_FULL_PAGE 1L #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS + +static const char *show_irq_str(int bits) +{ + const char *type[] = { + ".", // 0 + "s", // 1 + "h", // 2 + "Hs", // 3 + "n", // 4 + "Ns", // 5 + "Nh", // 6 + "NHs", // 7 + }; + + return type[bits]; +} + +/* Assume this is an trace event */ +static const char *show_flags(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + int bits = 0; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return "X"; + + entry = ring_buffer_event_data(event); + + if (entry->flags & TRACE_FLAG_SOFTIRQ) + bits |= 1; + + if (entry->flags & TRACE_FLAG_HARDIRQ) + bits |= 2; + + if (entry->flags & TRACE_FLAG_NMI) + bits |= 4; + + return show_irq_str(bits); +} + +static const char *show_irq(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return ""; + + entry = ring_buffer_event_data(event); + if (entry->flags & TRACE_FLAG_IRQS_OFF) + return "d"; + return ""; +} + +static const char *show_interrupt_level(void) +{ + unsigned long pc = preempt_count(); + unsigned char level = 0; + + if (pc & SOFTIRQ_OFFSET) + level |= 1; + + if (pc & HARDIRQ_MASK) + level |= 2; + + if (pc & NMI_MASK) + level |= 4; + + return show_irq_str(level); +} + static void dump_buffer_page(struct buffer_data_page *bpage, struct rb_event_info *info, unsigned long tail) @@ -3416,34 +3301,57 @@ static void dump_buffer_page(struct buffer_data_page *bpage, case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; - pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); + pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n", + e, ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); ts = rb_fix_abs_ts(delta, ts); - pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); + pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n", + e, ts, delta); break; case RINGBUF_TYPE_PADDING: ts += event->time_delta; - pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d PADDING\n", + e, ts, event->time_delta); break; case RINGBUF_TYPE_DATA: ts += event->time_delta; - pr_warn(" [%lld] delta:%d\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d %s%s\n", + e, ts, event->time_delta, + show_flags(event), show_irq(event)); break; default: break; } } + pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e); } static DEFINE_PER_CPU(atomic_t, checking); static atomic_t ts_dump; +#define buffer_warn_return(fmt, ...) \ + do { \ + /* If another report is happening, ignore this one */ \ + if (atomic_inc_return(&ts_dump) != 1) { \ + atomic_dec(&ts_dump); \ + goto out; \ + } \ + atomic_inc(&cpu_buffer->record_disabled); \ + pr_warn(fmt, ##__VA_ARGS__); \ + dump_buffer_page(bpage, info, tail); \ + atomic_dec(&ts_dump); \ + /* There's some cases in boot up that this can happen */ \ + if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \ + /* Do not re-enable checking */ \ + return; \ + } while (0) + /* * Check if the current event time stamp matches the deltas on * the buffer page. @@ -3477,7 +3385,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, return; /* - * If this interrupted another event, + * If this interrupted another event, */ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) goto out; @@ -3497,7 +3405,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = rb_fix_abs_ts(delta, ts); + delta = rb_fix_abs_ts(delta, ts); + if (delta < ts) { + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", + cpu_buffer->cpu, ts, delta); + } + ts = delta; break; case RINGBUF_TYPE_PADDING: @@ -3514,23 +3427,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { - /* If another report is happening, ignore this one */ - if (atomic_inc_return(&ts_dump) != 1) { - atomic_dec(&ts_dump); - goto out; - } - atomic_inc(&cpu_buffer->record_disabled); - /* There's some cases in boot up that this can happen */ - WARN_ON_ONCE(system_state != SYSTEM_BOOTING); - pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n", - cpu_buffer->cpu, - ts + info->delta, info->ts, info->delta, - info->before, info->after, - full ? " (full)" : ""); - dump_buffer_page(bpage, info, tail); - atomic_dec(&ts_dump); - /* Do not re-enable checking */ - return; + buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", + cpu_buffer->cpu, + ts + info->delta, info->ts, info->delta, + info->before, info->after, + full ? " (full)" : "", show_interrupt_level()); } out: atomic_dec(this_cpu_ptr(&checking)); @@ -3550,16 +3451,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; - bool a_ok; - bool b_ok; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + rb_time_read(&cpu_buffer->before_stamp, &info->before); + rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); @@ -3571,7 +3470,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * absolute timestamp. * Don't bother if this is the start of a new page (w == 0). */ - if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) { + if (!w) { + /* Use the sub-buffer timestamp */ + info->delta = 0; + } else if (unlikely(info->before != info->after)) { info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } else { @@ -3593,27 +3495,20 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - info->length; /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) { - /* before and after may now different, fix it up*/ - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - if (a_ok && b_ok && info->before != info->after) - (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, - info->before, info->after); - if (a_ok && b_ok) - check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); + if (unlikely(write > cpu_buffer->buffer->subbuf_size)) { + check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } if (likely(tail == w)) { - u64 save_before; - bool s_ok; - /* Nothing interrupted us between A and C */ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); - barrier(); - /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before); - RB_WARN_ON(cpu_buffer, !s_ok); + /* + * If something came in between C and D, the write stamp + * may now not be in sync. But that's fine as the before_stamp + * will be different and then next event will just be forced + * to use an absolute timestamp. + */ if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ @@ -3621,41 +3516,37 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, else /* Just use full timestamp for interrupting event */ info->delta = info->ts; - barrier(); check_buffer(cpu_buffer, info, tail); - if (unlikely(info->ts != save_before)) { - /* SLOW PATH - Interrupted between C and E */ - - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - RB_WARN_ON(cpu_buffer, !a_ok); - - /* Write stamp must only go forward */ - if (save_before > info->after) { - /* - * We do not care about the result, only that - * it gets updated atomically. - */ - (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, - info->after, save_before); - } - } } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - /* Was interrupted before here, write_stamp must be valid */ - RB_WARN_ON(cpu_buffer, !a_ok); + + /* Save the old before_stamp */ + rb_time_read(&cpu_buffer->before_stamp, &info->before); + + /* + * Read a new timestamp and update the before_stamp to make + * the next event after this one force using an absolute + * timestamp. This is in case an interrupt were to come in + * between E and F. + */ ts = rb_time_stamp(cpu_buffer->buffer); + rb_time_set(&cpu_buffer->before_stamp, ts); + barrier(); - /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && - info->after < ts && - rb_time_cmpxchg(&cpu_buffer->write_stamp, - info->after, ts)) { - /* Nothing came after this event between C and E */ + /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after); + barrier(); + /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && + info->after == info->before && info->after < ts) { + /* + * Nothing came after this event between C and F, it is + * safe to use info->after for the delta as it + * matched info->before and is still valid. + */ info->delta = ts - info->after; } else { /* - * Interrupted between C and E: + * Interrupted between C and F: * Lost the previous events time stamp. Just set the * delta to zero, and this will be the same time as * the event this event interrupted. And the events that @@ -3706,6 +3597,12 @@ rb_reserve_next_event(struct trace_buffer *buffer, int nr_loops = 0; int add_ts_default; + /* ring buffer does cmpxchg, make sure it is safe in NMI context */ + if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && + (unlikely(in_nmi()))) { + return NULL; + } + rb_start_commit(cpu_buffer); /* The commit page can not change after this */ @@ -3729,6 +3626,8 @@ rb_reserve_next_event(struct trace_buffer *buffer, if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; info.length += RB_LEN_TIME_EXTEND; + if (info.length > cpu_buffer->buffer->max_data_size) + goto out_fail; } else { add_ts_default = RB_ADD_STAMP_NONE; } @@ -3802,7 +3701,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) if (unlikely(atomic_read(&cpu_buffer->record_disabled))) goto out; - if (unlikely(length > BUF_MAX_DATA_SIZE)) + if (unlikely(length > buffer->max_data_size)) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) @@ -3836,7 +3735,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage = cpu_buffer->commit_page; struct buffer_page *start; - addr &= PAGE_MASK; + addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); /* Do the likely case first */ if (likely(bpage->page == (void *)addr)) { @@ -3952,7 +3851,7 @@ int ring_buffer_write(struct trace_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) goto out; - if (length > BUF_MAX_DATA_SIZE) + if (length > buffer->max_data_size) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) @@ -4208,7 +4107,7 @@ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); /** - * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ @@ -4532,6 +4431,7 @@ static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; + unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); unsigned long overwrite; unsigned long flags; int nr_loops = 0; @@ -4667,7 +4567,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) #define USECS_WAIT 1000000 for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { /* If the write is past the end of page, a writer is still updating it */ - if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE)) + if (likely(!reader || rb_page_write(reader) <= bsize)) break; udelay(1); @@ -4716,6 +4616,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) length = rb_event_length(event); cpu_buffer->reader_page->read += length; + cpu_buffer->read_bytes += length; } static void rb_advance_iter(struct ring_buffer_iter *iter) @@ -5109,7 +5010,9 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) if (!iter) return NULL; - iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags); + /* Holds the entire event: data and meta data */ + iter->event_size = buffer->subbuf_size; + iter->event = kmalloc(iter->event_size, flags); if (!iter->event) { kfree(iter); return NULL; @@ -5225,19 +5128,28 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_advance); */ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { - /* - * Earlier, this method returned - * BUF_PAGE_SIZE * buffer->nr_pages - * Since the nr_pages field is now removed, we have converted this to - * return the per cpu buffer value. - */ if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; - return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; + return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages; } EXPORT_SYMBOL_GPL(ring_buffer_size); +/** + * ring_buffer_max_event_size - return the max data size of an event + * @buffer: The ring buffer. + * + * Returns the maximum size an event can be. + */ +unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer) +{ + /* If abs timestamp is requested, events have a timestamp too */ + if (ring_buffer_time_stamp_abs(buffer)) + return buffer->max_data_size - RB_LEN_TIME_EXTEND; + return buffer->max_data_size; +} +EXPORT_SYMBOL_GPL(ring_buffer_max_event_size); + static void rb_clear_buffer_page(struct buffer_page *page) { local_set(&page->write, 0); @@ -5508,6 +5420,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; + if (buffer_a->subbuf_order != buffer_b->subbuf_order) + goto out; + ret = -EAGAIN; if (atomic_read(&buffer_a->record_disabled)) @@ -5579,40 +5494,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * Returns: * The page allocated, or ERR_PTR */ -void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) +struct buffer_data_read_page * +ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_data_page *bpage = NULL; + struct buffer_data_read_page *bpage = NULL; unsigned long flags; struct page *page; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); + bpage = kzalloc(sizeof(*bpage), GFP_KERNEL); + if (!bpage) + return ERR_PTR(-ENOMEM); + + bpage->order = buffer->subbuf_order; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (cpu_buffer->free_page) { - bpage = cpu_buffer->free_page; + bpage->data = cpu_buffer->free_page; cpu_buffer->free_page = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); - if (bpage) + if (bpage->data) goto out; - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, + cpu_buffer->buffer->subbuf_order); + if (!page) { + kfree(bpage); return ERR_PTR(-ENOMEM); + } - bpage = page_address(page); + bpage->data = page_address(page); out: - rb_init_page(bpage); + rb_init_page(bpage->data); return bpage; } @@ -5622,14 +5545,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for * @cpu: the cpu buffer the page came from - * @data: the page to free + * @data_page: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. */ -void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data) +void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, + struct buffer_data_read_page *data_page) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_data_page *bpage = data; + struct buffer_data_page *bpage = data_page->data; struct page *page = virt_to_page(bpage); unsigned long flags; @@ -5638,8 +5562,12 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data cpu_buffer = buffer->buffers[cpu]; - /* If the page is still in use someplace else, we can't reuse it */ - if (page_ref_count(page) > 1) + /* + * If the page is still in use someplace else, or order of the page + * is different from the subbuffer order of the buffer - + * we can't reuse it + */ + if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order) goto out; local_irq_save(flags); @@ -5654,7 +5582,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data local_irq_restore(flags); out: - free_page((unsigned long)bpage); + free_pages((unsigned long)bpage, data_page->order); + kfree(data_page); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); @@ -5675,9 +5604,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (IS_ERR(rpage)) * return PTR_ERR(rpage); - * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); + * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0); * if (ret >= 0) - * process_page(rpage, ret); + * process_page(ring_buffer_read_page_data(rpage), ret); + * ring_buffer_free_read_page(buffer, cpu, rpage); * * When @full is set, the function will not return true unless * the writer is off the reader page. @@ -5692,7 +5622,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * <0 if no data has been transferred. */ int ring_buffer_read_page(struct trace_buffer *buffer, - void **data_page, size_t len, int cpu, int full) + struct buffer_data_read_page *data_page, + size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; @@ -5717,10 +5648,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer, len -= BUF_PAGE_HDR_SIZE; - if (!data_page) + if (!data_page || !data_page->data) + goto out; + if (data_page->order != buffer->subbuf_order) goto out; - bpage = *data_page; + bpage = data_page->data; if (!bpage) goto out; @@ -5809,16 +5742,16 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); - cpu_buffer->read_bytes += BUF_PAGE_SIZE; + cpu_buffer->read_bytes += rb_page_commit(reader); /* swap the pages */ rb_init_page(bpage); bpage = reader->page; - reader->page = *data_page; + reader->page = data_page->data; local_set(&reader->write, 0); local_set(&reader->entries, 0); reader->read = 0; - *data_page = bpage; + data_page->data = bpage; /* * Use the real_end for the data size, @@ -5840,7 +5773,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* If there is room at the end of the page to save the * missed events, then record it there. */ - if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + if (buffer->subbuf_size - commit >= sizeof(missed_events)) { memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); @@ -5852,8 +5785,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* * This page may be off to user land. Zero it out here. */ - if (commit < BUF_PAGE_SIZE) - memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); + if (commit < buffer->subbuf_size) + memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); out_unlock: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); @@ -5863,6 +5796,209 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +/** + * ring_buffer_read_page_data - get pointer to the data in the page. + * @page: the page to get the data from + * + * Returns pointer to the actual data in this page. + */ +void *ring_buffer_read_page_data(struct buffer_data_read_page *page) +{ + return page->data; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_page_data); + +/** + * ring_buffer_subbuf_size_get - get size of the sub buffer. + * @buffer: the buffer to get the sub buffer size from + * + * Returns size of the sub buffer, in bytes. + */ +int ring_buffer_subbuf_size_get(struct trace_buffer *buffer) +{ + return buffer->subbuf_size + BUF_PAGE_HDR_SIZE; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get); + +/** + * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page. + * @buffer: The ring_buffer to get the system sub page order from + * + * By default, one ring buffer sub page equals to one system page. This parameter + * is configurable, per ring buffer. The size of the ring buffer sub page can be + * extended, but must be an order of system page size. + * + * Returns the order of buffer sub page size, in system pages: + * 0 means the sub buffer size is 1 system page and so forth. + * In case of an error < 0 is returned. + */ +int ring_buffer_subbuf_order_get(struct trace_buffer *buffer) +{ + if (!buffer) + return -EINVAL; + + return buffer->subbuf_order; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get); + +/** + * ring_buffer_subbuf_order_set - set the size of ring buffer sub page. + * @buffer: The ring_buffer to set the new page size. + * @order: Order of the system pages in one sub buffer page + * + * By default, one ring buffer pages equals to one system page. This API can be + * used to set new size of the ring buffer page. The size must be order of + * system page size, that's why the input parameter @order is the order of + * system pages that are allocated for one ring buffer page: + * 0 - 1 system page + * 1 - 2 system pages + * 3 - 4 system pages + * ... + * + * Returns 0 on success or < 0 in case of an error. + */ +int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage, *tmp; + int old_order, old_size; + int nr_pages; + int psize; + int err; + int cpu; + + if (!buffer || order < 0) + return -EINVAL; + + if (buffer->subbuf_order == order) + return 0; + + psize = (1 << order) * PAGE_SIZE; + if (psize <= BUF_PAGE_HDR_SIZE) + return -EINVAL; + + old_order = buffer->subbuf_order; + old_size = buffer->subbuf_size; + + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + atomic_inc(&buffer->record_disabled); + + /* Make sure all commits have finished */ + synchronize_rcu(); + + buffer->subbuf_order = order; + buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE; + + /* Make sure all new buffers are allocated, before deleting the old ones */ + for_each_buffer_cpu(buffer, cpu) { + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + continue; + + cpu_buffer = buffer->buffers[cpu]; + + /* Update the number of pages to match the new size */ + nr_pages = old_size * buffer->buffers[cpu]->nr_pages; + nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); + + /* we need a minimum of two pages */ + if (nr_pages < 2) + nr_pages = 2; + + cpu_buffer->nr_pages_to_update = nr_pages; + + /* Include the reader page */ + nr_pages++; + + /* Allocate the new size buffer */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (__rb_allocate_pages(cpu_buffer, nr_pages, + &cpu_buffer->new_pages)) { + /* not enough memory for new pages */ + err = -ENOMEM; + goto error; + } + } + + for_each_buffer_cpu(buffer, cpu) { + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + continue; + + cpu_buffer = buffer->buffers[cpu]; + + /* Clear the head bit to make the link list normal to read */ + rb_head_page_deactivate(cpu_buffer); + + /* Now walk the list and free all the old sub buffers */ + list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + /* The above loop stopped an the last page needing to be freed */ + bpage = list_entry(cpu_buffer->pages, struct buffer_page, list); + free_buffer_page(bpage); + + /* Free the current reader page */ + free_buffer_page(cpu_buffer->reader_page); + + /* One page was allocated for the reader page */ + cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, + struct buffer_page, list); + list_del_init(&cpu_buffer->reader_page->list); + + /* The cpu_buffer pages are a link list with no head */ + cpu_buffer->pages = cpu_buffer->new_pages.next; + cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev; + cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next; + + /* Clear the new_pages list */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; + cpu_buffer->nr_pages_to_update = 0; + + free_pages((unsigned long)cpu_buffer->free_page, old_order); + cpu_buffer->free_page = NULL; + + rb_head_page_activate(cpu_buffer); + + rb_check_pages(cpu_buffer); + } + + atomic_dec(&buffer->record_disabled); + mutex_unlock(&buffer->mutex); + + return 0; + +error: + buffer->subbuf_order = old_order; + buffer->subbuf_size = old_size; + + atomic_dec(&buffer->record_disabled); + mutex_unlock(&buffer->mutex); + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + if (!cpu_buffer->nr_pages_to_update) + continue; + + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } + + return err; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index aef34673d79d..008187ebd7fe 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -104,10 +104,11 @@ static enum event_status read_event(int cpu) static enum event_status read_page(int cpu) { + struct buffer_data_read_page *bpage; struct ring_buffer_event *event; struct rb_page *rpage; unsigned long commit; - void *bpage; + int page_size; int *entry; int ret; int inc; @@ -117,14 +118,15 @@ static enum event_status read_page(int cpu) if (IS_ERR(bpage)) return EVENT_DROPPED; - ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + page_size = ring_buffer_subbuf_size_get(buffer); + ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1); if (ret >= 0) { - rpage = bpage; + rpage = ring_buffer_read_page_data(bpage); /* The commit may have missed event flags set, clear them */ commit = local_read(&rpage->commit) & 0xfffff; for (i = 0; i < commit && !test_error ; i += inc) { - if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + if (i >= (page_size - offsetof(struct rb_page, data))) { TEST_ERROR(); break; } diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 8dfe85499d4a..354c2117be43 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -477,6 +477,17 @@ static int __init synth_event_gen_test_init(void) ret = test_trace_synth_event(); WARN_ON(ret); + + /* Disable when done */ + trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", + "gen_synth_test", false); + trace_array_set_clr_event(empty_synth_test->tr, + "synthetic", + "empty_synth_test", false); + trace_array_set_clr_event(create_synth_test->tr, + "synthetic", + "create_synth_test", false); out: return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2b4ded753367..9ff8a439d674 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -54,12 +54,6 @@ #include "trace.h" #include "trace_output.h" -/* - * On boot up, the ring buffer is set to the minimum size, so that - * we do not waste memory on systems that are not using tracing. - */ -bool ring_buffer_expanded; - #ifdef CONFIG_FTRACE_STARTUP_TEST /* * We need to change this state when a selftest is running. @@ -202,7 +196,7 @@ static int __init set_cmdline_ftrace(char *str) strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); return 1; } __setup("ftrace=", set_cmdline_ftrace); @@ -247,7 +241,7 @@ static int __init boot_alloc_snapshot(char *str) } else { allocate_snapshot = true; /* We also need the main ring buffer expanded */ - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); } return 1; } @@ -490,6 +484,13 @@ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, }; +void trace_set_ring_buffer_expanded(struct trace_array *tr) +{ + if (!tr) + tr = &global_trace; + tr->ring_buffer_expanded = true; +} + LIST_HEAD(ftrace_trace_arrays); int trace_array_get(struct trace_array *this_tr) @@ -1262,10 +1263,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val); int tracing_alloc_snapshot_instance(struct trace_array *tr) { + int order; int ret; if (!tr->allocated_snapshot) { + /* Make the snapshot buffer have the same order as main buffer */ + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); + if (ret < 0) + return ret; + /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, RING_BUFFER_ALL_CPUS); @@ -1285,6 +1293,7 @@ static void free_snapshot(struct trace_array *tr) * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. */ + ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0); ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&tr->max_buffer, 1); tracing_reset_online_cpus(&tr->max_buffer); @@ -1730,15 +1739,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; - if (trace_seq_used(s) <= s->seq.readpos) + if (trace_seq_used(s) <= s->readpos) return -EBUSY; - len = trace_seq_used(s) - s->seq.readpos; + len = trace_seq_used(s) - s->readpos; if (cnt > len) cnt = len; - memcpy(buf, s->buffer + s->seq.readpos, cnt); + memcpy(buf, s->buffer + s->readpos, cnt); - s->seq.readpos += cnt; + s->readpos += cnt; return cnt; } @@ -1772,7 +1781,7 @@ static void trace_create_maxlat_file(struct trace_array *tr, init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); tr->d_max_latency = trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, - d_tracer, &tr->max_latency, + d_tracer, tr, &tracing_max_lat_fops); } @@ -1805,7 +1814,7 @@ void latency_fsnotify(struct trace_array *tr) #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ - d_tracer, &tr->max_latency, &tracing_max_lat_fops) + d_tracer, tr, &tracing_max_lat_fops) #endif @@ -1893,6 +1902,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); + + /* Any waiters on the old snapshot buffer need to wake up */ + ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); } /** @@ -1944,12 +1956,23 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) static int wait_on_pipe(struct trace_iterator *iter, int full) { + int ret; + /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, - full); + ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full); + +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * Make sure this is still the snapshot buffer, as if a snapshot were + * to happen, this would now be the main buffer. + */ + if (iter->snapshot) + iter->array_buffer = &iter->tr->max_buffer; +#endif + return ret; } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -2012,7 +2035,7 @@ static int run_tracer_selftest(struct tracer *type) #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); tr->allocated_snapshot = true; @@ -2038,7 +2061,7 @@ static int run_tracer_selftest(struct tracer *type) tr->allocated_snapshot = false; /* Shrink the max buffer again */ - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } @@ -2297,7 +2320,7 @@ struct saved_cmdlines_buffer { unsigned *map_cmdline_to_pid; unsigned cmdline_num; int cmdline_idx; - char *saved_cmdlines; + char saved_cmdlines[]; }; static struct saved_cmdlines_buffer *savedcmd; @@ -2311,47 +2334,58 @@ static inline void set_cmdline(int idx, const char *cmdline) strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } -static int allocate_cmdlines_buffer(unsigned int val, - struct saved_cmdlines_buffer *s) +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { + int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); + + kfree(s->map_cmdline_to_pid); + free_pages((unsigned long)s, order); +} + +static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) +{ + struct saved_cmdlines_buffer *s; + struct page *page; + int orig_size, size; + int order; + + /* Figure out how much is needed to hold the given number of cmdlines */ + orig_size = sizeof(*s) + val * TASK_COMM_LEN; + order = get_order(orig_size); + size = 1 << (order + PAGE_SHIFT); + page = alloc_pages(GFP_KERNEL, order); + if (!page) + return NULL; + + s = page_address(page); + memset(s, 0, sizeof(*s)); + + /* Round up to actual allocation */ + val = (size - sizeof(*s)) / TASK_COMM_LEN; + s->cmdline_num = val; + s->map_cmdline_to_pid = kmalloc_array(val, sizeof(*s->map_cmdline_to_pid), GFP_KERNEL); - if (!s->map_cmdline_to_pid) - return -ENOMEM; - - s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL); - if (!s->saved_cmdlines) { - kfree(s->map_cmdline_to_pid); - return -ENOMEM; + if (!s->map_cmdline_to_pid) { + free_saved_cmdlines_buffer(s); + return NULL; } s->cmdline_idx = 0; - s->cmdline_num = val; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); - return 0; + return s; } static int trace_create_savedcmd(void) { - int ret; - - savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); - if (!savedcmd) - return -ENOMEM; + savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); - ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); - if (ret < 0) { - kfree(savedcmd); - savedcmd = NULL; - return -ENOMEM; - } - - return 0; + return savedcmd ? 0 : -ENOMEM; } int is_tracing_stopped(void) @@ -2359,13 +2393,7 @@ int is_tracing_stopped(void) return global_trace.stop_count; } -/** - * tracing_start - quick start of the tracer - * - * If tracing is enabled but was stopped by tracing_stop, - * this will start the tracer back up. - */ -void tracing_start(void) +static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; @@ -2373,119 +2401,83 @@ void tracing_start(void) if (tracing_disabled) return; - raw_spin_lock_irqsave(&global_trace.start_lock, flags); - if (--global_trace.stop_count) { - if (global_trace.stop_count < 0) { + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (--tr->stop_count) { + if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - global_trace.stop_count = 0; + tr->stop_count = 0; } goto out; } /* Prevent the buffers from switching */ - arch_spin_lock(&global_trace.max_lock); + arch_spin_lock(&tr->max_lock); - buffer = global_trace.array_buffer.buffer; + buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE - buffer = global_trace.max_buffer.buffer; + buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #endif - arch_spin_unlock(&global_trace.max_lock); - - out: - raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); -} - -static void tracing_start_tr(struct trace_array *tr) -{ - struct trace_buffer *buffer; - unsigned long flags; - - if (tracing_disabled) - return; - - /* If global, we need to also start the max tracer */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - return tracing_start(); - - raw_spin_lock_irqsave(&tr->start_lock, flags); - - if (--tr->stop_count) { - if (tr->stop_count < 0) { - /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - tr->stop_count = 0; - } - goto out; - } - - buffer = tr->array_buffer.buffer; - if (buffer) - ring_buffer_record_enable(buffer); + arch_spin_unlock(&tr->max_lock); out: raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** - * tracing_stop - quick stop of the tracer + * tracing_start - quick start of the tracer * - * Light weight way to stop tracing. Use in conjunction with - * tracing_start. + * If tracing is enabled but was stopped by tracing_stop, + * this will start the tracer back up. */ -void tracing_stop(void) +void tracing_start(void) + +{ + return tracing_start_tr(&global_trace); +} + +static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; - raw_spin_lock_irqsave(&global_trace.start_lock, flags); - if (global_trace.stop_count++) + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (tr->stop_count++) goto out; /* Prevent the buffers from switching */ - arch_spin_lock(&global_trace.max_lock); + arch_spin_lock(&tr->max_lock); - buffer = global_trace.array_buffer.buffer; + buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE - buffer = global_trace.max_buffer.buffer; + buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #endif - arch_spin_unlock(&global_trace.max_lock); + arch_spin_unlock(&tr->max_lock); out: - raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); + raw_spin_unlock_irqrestore(&tr->start_lock, flags); } -static void tracing_stop_tr(struct trace_array *tr) +/** + * tracing_stop - quick stop of the tracer + * + * Light weight way to stop tracing. Use in conjunction with + * tracing_start. + */ +void tracing_stop(void) { - struct trace_buffer *buffer; - unsigned long flags; - - /* If global, we need to also stop the max tracer */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - return tracing_stop(); - - raw_spin_lock_irqsave(&tr->start_lock, flags); - if (tr->stop_count++) - goto out; - - buffer = tr->array_buffer.buffer; - if (buffer) - ring_buffer_record_disable(buffer); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); + return tracing_stop_tr(&global_trace); } static int trace_save_cmdline(struct task_struct *tsk) @@ -2769,8 +2761,11 @@ void trace_buffered_event_enable(void) for_each_tracing_cpu(cpu) { page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) - goto failed; + /* This is just an optimization and can handle failures */ + if (!page) { + pr_err("Failed to allocate event buffer\n"); + break; + } event = page_address(page); memset(event, 0, sizeof(*event)); @@ -2784,10 +2779,6 @@ void trace_buffered_event_enable(void) WARN_ON_ONCE(1); preempt_enable(); } - - return; - failed: - trace_buffered_event_disable(); } static void enable_trace_buffered_event(void *data) @@ -2822,11 +2813,9 @@ void trace_buffered_event_disable(void) if (--trace_buffered_event_ref) return; - preempt_disable(); /* For each CPU, set the buffer as used. */ - smp_call_function_many(tracing_buffer_mask, - disable_trace_buffered_event, NULL, 1); - preempt_enable(); + on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event, + NULL, true); /* Wait for all current users to finish */ synchronize_rcu(); @@ -2835,17 +2824,19 @@ void trace_buffered_event_disable(void) free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); per_cpu(trace_buffered_event, cpu) = NULL; } + /* - * Make sure trace_buffered_event is NULL before clearing - * trace_buffered_event_cnt. + * Wait for all CPUs that potentially started checking if they can use + * their event buffer only after the previous synchronize_rcu() call and + * they still read a valid pointer from trace_buffered_event. It must be + * ensured they don't see cleared trace_buffered_event_cnt else they + * could wrongly decide to use the pointed-to buffer which is now freed. */ - smp_wmb(); + synchronize_rcu(); - preempt_disable(); - /* Do the work on each cpu */ - smp_call_function_many(tracing_buffer_mask, - enable_trace_buffered_event, NULL, 1); - preempt_enable(); + /* For each CPU, relinquish the buffer */ + on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, + true); } static struct trace_buffer *temp_buffer; @@ -3403,7 +3394,7 @@ void trace_printk_init_buffers(void) pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ - tracing_update_buffers(); + tracing_update_buffers(&global_trace); buffers_allocated = 1; @@ -3795,7 +3786,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, /* OK if part of the temp seq buffer */ if ((addr >= (unsigned long)iter->tmp_seq.buffer) && - (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE)) + (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE)) return true; /* Core rodata can not be freed */ @@ -3827,15 +3818,6 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, return false; } -static const char *show_buffer(struct trace_seq *s) -{ - struct seq_buf *seq = &s->seq; - - seq_buf_terminate(seq); - - return seq->buffer; -} - static DEFINE_STATIC_KEY_FALSE(trace_no_verify); static int test_can_verify_check(const char *fmt, ...) @@ -3975,7 +3957,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, */ if (WARN_ONCE(!trace_safe_str(iter, str, star, len), "fmt: '%s' current_buffer: '%s'", - fmt, show_buffer(&iter->seq))) { + fmt, seq_buf_str(&iter->seq.seq))) { int ret; /* Try to safely read the string */ @@ -4773,7 +4755,11 @@ static int s_show(struct seq_file *m, void *v) iter->leftover = ret; } else { - print_trace_line(iter); + ret = print_trace_line(iter); + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + } ret = trace_print_seq(m, &iter->seq); /* * If we overflow the seq_file buffer, then it will @@ -4973,6 +4959,54 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) return 0; } +/* + * The private pointer of the inode is the trace_event_file. + * Update the tr ref count associated to it. + */ +int tracing_open_file_tr(struct inode *inode, struct file *filp) +{ + struct trace_event_file *file = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(file->tr); + if (ret) + return ret; + + mutex_lock(&event_mutex); + + /* Fail if the file is marked for removal */ + if (file->flags & EVENT_FILE_FL_FREED) { + trace_array_put(file->tr); + ret = -ENODEV; + } else { + event_file_get(file); + } + + mutex_unlock(&event_mutex); + if (ret) + return ret; + + filp->private_data = inode->i_private; + + return 0; +} + +int tracing_release_file_tr(struct inode *inode, struct file *filp) +{ + struct trace_event_file *file = inode->i_private; + + trace_array_put(file->tr); + event_file_put(file); + + return 0; +} + +int tracing_single_release_file_tr(struct inode *inode, struct file *filp) +{ + tracing_release_file_tr(inode, filp); + return single_release(inode, filp); +} + static int tracing_mark_open(struct inode *inode, struct file *filp) { stream_open(inode, filp); @@ -5017,7 +5051,7 @@ static int tracing_release(struct inode *inode, struct file *file) return 0; } -static int tracing_release_generic_tr(struct inode *inode, struct file *file) +int tracing_release_generic_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -6033,26 +6067,14 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) -{ - kfree(s->saved_cmdlines); - kfree(s->map_cmdline_to_pid); - kfree(s); -} - static int tracing_resize_saved_cmdlines(unsigned int val) { struct saved_cmdlines_buffer *s, *savedcmd_temp; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = allocate_cmdlines_buffer(val); if (!s) return -ENOMEM; - if (allocate_cmdlines_buffer(val, s) < 0) { - kfree(s); - return -ENOMEM; - } - preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; @@ -6347,19 +6369,21 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, * we use the size that was given, and we can forget about * expanding it later. */ - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(tr); /* May be called before buffers are initialized */ if (!tr->array_buffer.buffer) return 0; + /* Do not allow tracing while resizing ring buffer */ + tracing_stop_tr(tr); + ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu); if (ret < 0) - return ret; + goto out_start; #ifdef CONFIG_TRACER_MAX_TRACE - if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || - !tr->current_trace->use_max_tr) + if (!tr->allocated_snapshot) goto out; ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); @@ -6384,7 +6408,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, WARN_ON(1); tracing_disabled = 1; } - return ret; + goto out_start; } update_buffer_entries(&tr->max_buffer, cpu); @@ -6393,7 +6417,8 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, #endif /* CONFIG_TRACER_MAX_TRACE */ update_buffer_entries(&tr->array_buffer, cpu); - + out_start: + tracing_start_tr(tr); return ret; } @@ -6425,6 +6450,7 @@ out: /** * tracing_update_buffers - used by tracing facility to expand ring buffers + * @tr: The tracing instance * * To save on memory when the tracing is never used on a system with it * configured in. The ring buffers are set to a minimum size. But once @@ -6433,13 +6459,13 @@ out: * * This function is to be called when a tracer is about to be used. */ -int tracing_update_buffers(void) +int tracing_update_buffers(struct trace_array *tr) { int ret = 0; mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) - ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, + if (!tr->ring_buffer_expanded) + ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); mutex_unlock(&trace_types_lock); @@ -6493,7 +6519,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) { + if (!tr->ring_buffer_expanded) { ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) @@ -6691,14 +6717,18 @@ static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos); + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); } static ssize_t tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); } #endif @@ -6923,8 +6953,8 @@ waitagain: goto out; } - if (cnt >= PAGE_SIZE) - cnt = PAGE_SIZE - 1; + if (cnt >= TRACE_SEQ_BUFFER_SIZE) + cnt = TRACE_SEQ_BUFFER_SIZE - 1; /* reset all but tr, trace, and overruns */ trace_iterator_reset(iter); @@ -6975,7 +7005,7 @@ waitagain: /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) + if (iter->seq.readpos >= trace_seq_used(&iter->seq)) trace_seq_init(&iter->seq); /* @@ -7161,7 +7191,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } if (buf_size_same) { - if (!ring_buffer_expanded) + if (!tr->ring_buffer_expanded) r = sprintf(buf, "%lu (expanded: %lu)\n", size >> 10, trace_buf_size >> 10); @@ -7218,10 +7248,10 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10; - if (!ring_buffer_expanded) + if (!tr->ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) r = sprintf(buf, "%lu\n", size); else r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); @@ -7269,8 +7299,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; struct print_entry *entry; + int meta_size; ssize_t written; - int size; + size_t size; int len; /* Used in tracing_mark_raw_write() as well */ @@ -7283,23 +7314,44 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; - if (cnt > TRACE_BUF_SIZE) - cnt = TRACE_BUF_SIZE; - - BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + if ((ssize_t)cnt < 0) + return -EINVAL; - size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ + meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ + again: + size = cnt + meta_size; /* If less than "<faulted>", then make sure we can still add that */ if (cnt < FAULTED_SIZE) size += FAULTED_SIZE - cnt; + if (size > TRACE_SEQ_BUFFER_SIZE) { + cnt -= size - TRACE_SEQ_BUFFER_SIZE; + goto again; + } + buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); - if (unlikely(!event)) + if (unlikely(!event)) { + /* + * If the size was greater than what was allowed, then + * make it smaller and try again. + */ + if (size > ring_buffer_max_event_size(buffer)) { + /* cnt < FAULTED size should never be bigger than max */ + if (WARN_ON_ONCE(cnt < FAULTED_SIZE)) + return -EBADF; + cnt = ring_buffer_max_event_size(buffer) - meta_size; + /* The above should only happen once */ + if (WARN_ON_ONCE(cnt + meta_size == size)) + return -EBADF; + goto again; + } + /* Ring buffer disabled, return as if not open for write */ return -EBADF; + } entry = ring_buffer_event_data(event); entry->ip = _THIS_IP_; @@ -7334,9 +7386,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return written; } -/* Limit it for now to 3K (including tag) */ -#define RAW_DATA_MAX_SIZE (1024*3) - static ssize_t tracing_mark_raw_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -7358,19 +7407,18 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, return -EINVAL; /* The marker must at least have a tag id */ - if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE) + if (cnt < sizeof(unsigned int)) return -EINVAL; - if (cnt > TRACE_BUF_SIZE) - cnt = TRACE_BUF_SIZE; - - BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - size = sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt; buffer = tr->array_buffer.buffer; + + if (size > ring_buffer_max_event_size(buffer)) + return -EINVAL; + event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, tracing_gen_ctx()); if (!event) @@ -7555,6 +7603,7 @@ struct ftrace_buffer_info { struct trace_iterator iter; void *spare; unsigned int spare_cpu; + unsigned int spare_size; unsigned int read; }; @@ -7615,7 +7664,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, unsigned long val; int ret; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(tr); if (ret < 0) return ret; @@ -7752,18 +7801,20 @@ static const struct file_operations tracing_thresh_fops = { #ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_max_lat_read, .write = tracing_max_lat_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; #endif static const struct file_operations set_tracer_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_set_trace_read, .write = tracing_set_trace_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations tracing_pipe_fops = { @@ -8257,6 +8308,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; + void *trace_data; + int page_size; ssize_t ret = 0; ssize_t size; @@ -8268,6 +8321,17 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return -EBUSY; #endif + page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + + /* Make sure the spare matches the current sub buffer size */ + if (info->spare) { + if (page_size != info->spare_size) { + ring_buffer_free_read_page(iter->array_buffer->buffer, + info->spare_cpu, info->spare); + info->spare = NULL; + } + } + if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, iter->cpu_file); @@ -8276,19 +8340,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->spare = NULL; } else { info->spare_cpu = iter->cpu_file; + info->spare_size = page_size; } } if (!info->spare) return ret; /* Do we have previous read data to read? */ - if (info->read < PAGE_SIZE) + if (info->read < page_size) goto read; again: trace_access_lock(iter->cpu_file); ret = ring_buffer_read_page(iter->array_buffer->buffer, - &info->spare, + info->spare, count, iter->cpu_file, 0); trace_access_unlock(iter->cpu_file); @@ -8309,11 +8374,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->read = 0; read: - size = PAGE_SIZE - info->read; + size = page_size - info->read; if (size > count) size = count; - - ret = copy_to_user(ubuf, info->spare + info->read, size); + trace_data = ring_buffer_read_page_data(info->spare); + ret = copy_to_user(ubuf, trace_data + info->read, size); if (ret == size) return -EFAULT; @@ -8424,6 +8489,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; + int page_size; int entries, i; ssize_t ret = 0; @@ -8432,13 +8498,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -EBUSY; #endif - if (*ppos & (PAGE_SIZE - 1)) + page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + if (*ppos & (page_size - 1)) return -EINVAL; - if (len & (PAGE_SIZE - 1)) { - if (len < PAGE_SIZE) + if (len & (page_size - 1)) { + if (len < page_size) return -EINVAL; - len &= PAGE_MASK; + len &= (~(page_size - 1)); } if (splice_grow_spd(pipe, &spd)) @@ -8448,7 +8515,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); - for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) { struct page *page; int r; @@ -8469,7 +8536,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } ref->cpu = iter->cpu_file; - r = ring_buffer_read_page(ref->buffer, &ref->page, + r = ring_buffer_read_page(ref->buffer, ref->page, len, iter->cpu_file, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->cpu, @@ -8478,14 +8545,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - page = virt_to_page(ref->page); + page = virt_to_page(ring_buffer_read_page_data(ref->page)); spd.pages[i] = page; - spd.partial[i].len = PAGE_SIZE; + spd.partial[i].len = page_size; spd.partial[i].offset = 0; spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; - *ppos += PAGE_SIZE; + *ppos += page_size; entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); } @@ -8506,7 +8573,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, wait_index = READ_ONCE(iter->wait_index); - ret = wait_on_pipe(iter, iter->tr->buffer_percent); + ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent); if (ret) goto out; @@ -8956,12 +9023,33 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static int tracing_open_options(struct inode *inode, struct file *filp) +{ + struct trace_option_dentry *topt = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(topt->tr); + if (ret) + return ret; + + filp->private_data = inode->i_private; + return 0; +} + +static int tracing_release_options(struct inode *inode, struct file *file) +{ + struct trace_option_dentry *topt = file->private_data; + + trace_array_put(topt->tr); + return 0; +} static const struct file_operations trace_options_fops = { - .open = tracing_open_generic, + .open = tracing_open_options, .read = trace_options_read, .write = trace_options_write, .llseek = generic_file_llseek, + .release = tracing_release_options, }; /* @@ -9308,6 +9396,103 @@ static const struct file_operations buffer_percent_fops = { .llseek = default_llseek, }; +static ssize_t +buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + size_t size; + char buf[64]; + int order; + int r; + + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + size = (PAGE_SIZE << order) / 1024; + + r = sprintf(buf, "%zd\n", size); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +buffer_subbuf_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + unsigned long val; + int old_order; + int order; + int pages; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + val *= 1024; /* value passed in is in KB */ + + pages = DIV_ROUND_UP(val, PAGE_SIZE); + order = fls(pages - 1); + + /* limit between 1 and 128 system pages */ + if (order < 0 || order > 7) + return -EINVAL; + + /* Do not allow tracing while changing the order of the ring buffer */ + tracing_stop_tr(tr); + + old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + if (old_order == order) + goto out; + + ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order); + if (ret) + goto out; + +#ifdef CONFIG_TRACER_MAX_TRACE + + if (!tr->allocated_snapshot) + goto out_max; + + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); + if (ret) { + /* Put back the old order */ + cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order); + if (WARN_ON_ONCE(cnt)) { + /* + * AARGH! We are left with different orders! + * The max buffer is our "snapshot" buffer. + * When a tracer needs a snapshot (one of the + * latency tracers), it swaps the max buffer + * with the saved snap shot. We succeeded to + * update the order of the main buffer, but failed to + * update the order of the max buffer. But when we tried + * to reset the main buffer to the original size, we + * failed there too. This is very unlikely to + * happen, but if it does, warn and kill all + * tracing. + */ + tracing_disabled = 1; + } + goto out; + } + out_max: +#endif + (*ppos)++; + out: + if (ret) + cnt = ret; + tracing_start_tr(tr); + return cnt; +} + +static const struct file_operations buffer_subbuf_size_fops = { + .open = tracing_open_generic_tr, + .read = buffer_subbuf_size_read, + .write = buffer_subbuf_size_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + static struct dentry *trace_instance_dir; static void @@ -9458,7 +9643,8 @@ static int trace_array_create_dir(struct trace_array *tr) return ret; } -static struct trace_array *trace_array_create(const char *name) +static struct trace_array * +trace_array_create_systems(const char *name, const char *systems) { struct trace_array *tr; int ret; @@ -9478,6 +9664,12 @@ static struct trace_array *trace_array_create(const char *name) if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) goto out_free_tr; + if (systems) { + tr->system_names = kstrdup_const(systems, GFP_KERNEL); + if (!tr->system_names) + goto out_free_tr; + } + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9496,6 +9688,9 @@ static struct trace_array *trace_array_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; + /* The ring buffer is defaultly expanded */ + trace_set_ring_buffer_expanded(tr); + if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; @@ -9521,12 +9716,18 @@ static struct trace_array *trace_array_create(const char *name) free_trace_buffers(tr); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); kfree(tr->name); kfree(tr); return ERR_PTR(ret); } +static struct trace_array *trace_array_create(const char *name) +{ + return trace_array_create_systems(name, NULL); +} + static int instance_mkdir(const char *name) { struct trace_array *tr; @@ -9552,6 +9753,7 @@ out_unlock: /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. * @name: The name of the trace array to be looked up/created. + * @systems: A list of systems to create event directories for (NULL for all) * * Returns pointer to trace array with given name. * NULL, if it cannot be created. @@ -9565,7 +9767,7 @@ out_unlock: * trace_array_put() is called, user space can not delete it. * */ -struct trace_array *trace_array_get_by_name(const char *name) +struct trace_array *trace_array_get_by_name(const char *name, const char *systems) { struct trace_array *tr; @@ -9577,7 +9779,7 @@ struct trace_array *trace_array_get_by_name(const char *name) goto out_unlock; } - tr = trace_array_create(name); + tr = trace_array_create_systems(name, systems); if (IS_ERR(tr)) tr = NULL; @@ -9624,6 +9826,7 @@ static int __remove_instance(struct trace_array *tr) free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); kfree(tr->name); kfree(tr); @@ -9705,7 +9908,6 @@ static __init void create_trace_instances(struct dentry *d_tracer) static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - struct trace_event_file *file; int cpu; trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, @@ -9738,11 +9940,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); - file = __find_event_file(tr, "ftrace", "print"); - if (file && file->dir) - trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, - file, &event_trigger_fops); - tr->trace_marker_file = file; + tr->trace_marker_file = __find_event_file(tr, "ftrace", "print"); trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); @@ -9761,6 +9959,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, tr, &buffer_percent_fops); + trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, + tr, &buffer_subbuf_size_fops); + create_trace_options_dir(tr); #ifdef CONFIG_TRACER_MAX_TRACE @@ -10347,7 +10548,7 @@ __init static void enable_instances(void) if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) do_allocate_snapshot(tok); - tr = trace_array_get_by_name(tok); + tr = trace_array_get_by_name(tok, NULL); if (!tr) { pr_warn("Failed to create instance buffer %s\n", curr_str); continue; @@ -10390,7 +10591,7 @@ __init static int tracer_alloc_buffers(void) trace_printk_init_buffers(); /* To save memory, keep the ring buffer size to its minimum */ - if (ring_buffer_expanded) + if (global_trace.ring_buffer_expanded) ring_buf_size = trace_buf_size; else ring_buf_size = 1; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5669dd1f90d9..00f873910c5d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -377,11 +377,12 @@ struct trace_array { unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; + const char *system_names; struct list_head err_log; struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; - struct dentry *event_dir; + struct eventfs_inode *event_dir; struct trace_options *topts; struct list_head systems; struct list_head events; @@ -410,6 +411,11 @@ struct trace_array { struct cond_snapshot *cond_snapshot; #endif struct trace_func_repeats __percpu *last_func_repeats; + /* + * On boot up, the ring buffer is set to the minimum size, so that + * we do not waste memory on systems that are not using tracing. + */ + bool ring_buffer_expanded; }; enum { @@ -610,6 +616,10 @@ void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); +int tracing_release_generic_tr(struct inode *inode, struct file *file); +int tracing_open_file_tr(struct inode *inode, struct file *filp); +int tracing_release_file_tr(struct inode *inode, struct file *filp); +int tracing_single_release_file_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); @@ -759,7 +769,7 @@ extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); -extern bool ring_buffer_expanded; +extern void trace_set_ring_buffer_expanded(struct trace_array *tr); extern bool tracing_selftest_disabled; #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -1303,7 +1313,7 @@ static inline void trace_branch_disable(void) #endif /* CONFIG_BRANCH_TRACER */ /* set ring buffers to default size if not already done so */ -int tracing_update_buffers(void); +int tracing_update_buffers(struct trace_array *tr); union trace_synth_field { u8 as_u8; @@ -1342,7 +1352,7 @@ struct trace_subsystem_dir { struct list_head list; struct event_subsystem *subsystem; struct trace_array *tr; - struct eventfs_file *ef; + struct eventfs_inode *ei; int ref_count; int nr_events; }; @@ -1662,6 +1672,9 @@ extern void event_trigger_unregister(struct event_command *cmd_ops, char *glob, struct event_trigger_data *trigger_data); +extern void event_file_get(struct trace_event_file *file); +extern void event_file_put(struct trace_event_file *file); + /** * struct event_trigger_ops - callbacks for trace event triggers * diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7ccc7a8e155b..dbe29b4c6a7a 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node) if (!p || *p == '\0') continue; - tr = trace_array_get_by_name(p); + tr = trace_array_get_by_name(p, NULL); if (!tr) { pr_err("Failed to get trace instance %s\n", p); continue; diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 72714cbf475c..03c851f57969 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -788,12 +788,9 @@ find_and_get_event(const char *system, const char *event_name) name = trace_event_name(tp_event); if (!name || strcmp(event_name, name)) continue; - if (!trace_event_try_get_ref(tp_event)) { + if (!trace_event_try_get_ref(tp_event)) return NULL; - break; - } return tp_event; - break; } return NULL; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ed367d713be0..7c364b87352e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -984,32 +984,41 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) return; if (!--dir->nr_events) { - eventfs_remove(dir->ef); + eventfs_remove_dir(dir->ei); list_del(&dir->list); __put_system_dir(dir); } } -static void remove_event_file_dir(struct trace_event_file *file) +void event_file_get(struct trace_event_file *file) { - struct dentry *dir = file->dir; - struct dentry *child; + atomic_inc(&file->ref); +} - if (dir) { - spin_lock(&dir->d_lock); /* probably unneeded */ - list_for_each_entry(child, &dir->d_subdirs, d_child) { - if (d_really_is_positive(child)) /* probably unneeded */ - d_inode(child)->i_private = NULL; - } - spin_unlock(&dir->d_lock); +void event_file_put(struct trace_event_file *file) +{ + if (WARN_ON_ONCE(!atomic_read(&file->ref))) { + if (file->flags & EVENT_FILE_FL_FREED) + kmem_cache_free(file_cachep, file); + return; + } - tracefs_remove(dir); + if (atomic_dec_and_test(&file->ref)) { + /* Count should only go to zero when it is freed */ + if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) + return; + kmem_cache_free(file_cachep, file); } - eventfs_remove(file->ef); +} + +static void remove_event_file_dir(struct trace_event_file *file) +{ + eventfs_remove_dir(file->ei); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); - kmem_cache_free(file_cachep, file); + file->flags |= EVENT_FILE_FL_FREED; + event_file_put(file); } /* @@ -1179,7 +1188,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (!cnt) return 0; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(tr); if (ret < 0) return ret; @@ -1382,7 +1391,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, flags = file->flags; mutex_unlock(&event_mutex); - if (!file) + if (!file || flags & EVENT_FILE_FL_FREED) return -ENODEV; if (flags & EVENT_FILE_FL_ENABLED && @@ -1410,18 +1419,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - ret = tracing_update_buffers(); - if (ret < 0) - return ret; - switch (val) { case 0: case 1: ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_data(filp); - if (likely(file)) + if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) { + ret = tracing_update_buffers(file->tr); + if (ret < 0) { + mutex_unlock(&event_mutex); + return ret; + } ret = ftrace_event_enable_disable(file, val); + } mutex_unlock(&event_mutex); break; @@ -1495,7 +1506,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(dir->tr); if (ret < 0) return ret; @@ -1694,7 +1705,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); file = event_file_data(filp); - if (file) + if (file && !(file->flags & EVENT_FILE_FL_FREED)) print_event_filter(file, s); mutex_unlock(&event_mutex); @@ -1882,9 +1893,33 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } static ssize_t -show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + ring_buffer_print_page_header(tr->array_buffer.buffer, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return r; +} + +static ssize_t +show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int (*func)(struct trace_seq *s) = filp->private_data; struct trace_seq *s; int r; @@ -1897,7 +1932,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) trace_seq_init(s); - func(s); + ring_buffer_print_entry_header(s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); @@ -1969,7 +2004,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, if (!cnt) return 0; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(tr); if (ret < 0) return ret; @@ -2103,9 +2138,10 @@ static const struct file_operations ftrace_set_event_notrace_pid_fops = { }; static const struct file_operations ftrace_enable_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = event_enable_read, .write = event_enable_write, + .release = tracing_release_file_tr, .llseek = default_llseek, }; @@ -2122,9 +2158,10 @@ static const struct file_operations ftrace_event_id_fops = { }; static const struct file_operations ftrace_event_filter_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = event_filter_read, .write = event_filter_write, + .release = tracing_release_file_tr, .llseek = default_llseek, }; @@ -2152,10 +2189,18 @@ static const struct file_operations ftrace_tr_enable_fops = { .release = subsystem_release, }; -static const struct file_operations ftrace_show_header_fops = { - .open = tracing_open_generic, - .read = show_header, +static const struct file_operations ftrace_show_header_page_fops = { + .open = tracing_open_generic_tr, + .read = show_header_page_file, .llseek = default_llseek, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations ftrace_show_header_event_fops = { + .open = tracing_open_generic_tr, + .read = show_header_event_file, + .llseek = default_llseek, + .release = tracing_release_generic_tr, }; static int @@ -2291,13 +2336,40 @@ create_new_subsystem(const char *name) return NULL; } -static struct eventfs_file * +static int system_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (strcmp(name, "filter") == 0) + *fops = &ftrace_subsystem_filter_fops; + + else if (strcmp(name, "enable") == 0) + *fops = &ftrace_system_enable_fops; + + else + return 0; + + *mode = TRACE_MODE_WRITE; + return 1; +} + +static struct eventfs_inode * event_subsystem_dir(struct trace_array *tr, const char *name, - struct trace_event_file *file, struct dentry *parent) + struct trace_event_file *file, struct eventfs_inode *parent) { struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; - int res; + struct eventfs_inode *ei; + int nr_entries; + static struct eventfs_entry system_entries[] = { + { + .name = "filter", + .callback = system_callback, + }, + { + .name = "enable", + .callback = system_callback, + } + }; /* First see if we did not already create this dir */ list_for_each_entry(dir, &tr->systems, list) { @@ -2305,7 +2377,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, if (strcmp(system->name, name) == 0) { dir->nr_events++; file->system = dir; - return dir->ef; + return dir->ei; } } @@ -2329,38 +2401,29 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - dir->ef = eventfs_add_subsystem_dir(name, parent); - if (IS_ERR(dir->ef)) { + /* ftrace only has directories no files */ + if (strcmp(name, "ftrace") == 0) + nr_entries = 0; + else + nr_entries = ARRAY_SIZE(system_entries); + + ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir); + if (IS_ERR(ei)) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; } + dir->ei = ei; dir->tr = tr; dir->ref_count = 1; dir->nr_events = 1; dir->subsystem = system; file->system = dir; - /* the ftrace system is special, do not create enable or filter files */ - if (strcmp(name, "ftrace") != 0) { - - res = eventfs_add_file("filter", TRACE_MODE_WRITE, - dir->ef, dir, - &ftrace_subsystem_filter_fops); - if (res) { - kfree(system->filter); - system->filter = NULL; - pr_warn("Could not create tracefs '%s/filter' entry\n", name); - } - - eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir, - &ftrace_system_enable_fops); - } - list_add(&dir->list, &tr->systems); - return dir->ef; + return dir->ei; out_free: kfree(dir); @@ -2409,14 +2472,134 @@ event_define_fields(struct trace_event_call *call) return ret; } +static int event_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + struct trace_event_file *file = *data; + struct trace_event_call *call = file->event_call; + + if (strcmp(name, "format") == 0) { + *mode = TRACE_MODE_READ; + *fops = &ftrace_event_format_fops; + *data = call; + return 1; + } + + /* + * Only event directories that can be enabled should have + * triggers or filters, with the exception of the "print" + * event that can have a "trigger" file. + */ + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { + if (call->class->reg && strcmp(name, "enable") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &ftrace_enable_fops; + return 1; + } + + if (strcmp(name, "filter") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &ftrace_event_filter_fops; + return 1; + } + } + + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || + strcmp(trace_event_name(call), "print") == 0) { + if (strcmp(name, "trigger") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &event_trigger_fops; + return 1; + } + } + +#ifdef CONFIG_PERF_EVENTS + if (call->event.type && call->class->reg && + strcmp(name, "id") == 0) { + *mode = TRACE_MODE_READ; + *data = (void *)(long)call->event.type; + *fops = &ftrace_event_id_fops; + return 1; + } +#endif + +#ifdef CONFIG_HIST_TRIGGERS + if (strcmp(name, "hist") == 0) { + *mode = TRACE_MODE_READ; + *fops = &event_hist_fops; + return 1; + } +#endif +#ifdef CONFIG_HIST_TRIGGERS_DEBUG + if (strcmp(name, "hist_debug") == 0) { + *mode = TRACE_MODE_READ; + *fops = &event_hist_debug_fops; + return 1; + } +#endif +#ifdef CONFIG_TRACE_EVENT_INJECT + if (call->event.type && call->class->reg && + strcmp(name, "inject") == 0) { + *mode = 0200; + *fops = &event_inject_fops; + return 1; + } +#endif + return 0; +} + static int -event_create_dir(struct dentry *parent, struct trace_event_file *file) +event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; - struct eventfs_file *ef_subsystem = NULL; struct trace_array *tr = file->tr; + struct eventfs_inode *e_events; + struct eventfs_inode *ei; const char *name; + int nr_entries; int ret; + static struct eventfs_entry event_entries[] = { + { + .name = "enable", + .callback = event_callback, + }, + { + .name = "filter", + .callback = event_callback, + }, + { + .name = "trigger", + .callback = event_callback, + }, + { + .name = "format", + .callback = event_callback, + }, +#ifdef CONFIG_PERF_EVENTS + { + .name = "id", + .callback = event_callback, + }, +#endif +#ifdef CONFIG_HIST_TRIGGERS + { + .name = "hist", + .callback = event_callback, + }, +#endif +#ifdef CONFIG_HIST_TRIGGERS_DEBUG + { + .name = "hist_debug", + .callback = event_callback, + }, +#endif +#ifdef CONFIG_TRACE_EVENT_INJECT + { + .name = "inject", + .callback = event_callback, + }, +#endif + }; /* * If the trace point header did not define TRACE_SYSTEM @@ -2426,27 +2609,20 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0)) return -ENODEV; - ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent); - if (!ef_subsystem) + e_events = event_subsystem_dir(tr, call->class->system, file, parent); + if (!e_events) return -ENOMEM; + nr_entries = ARRAY_SIZE(event_entries); + name = trace_event_name(call); - file->ef = eventfs_add_dir(name, ef_subsystem); - if (IS_ERR(file->ef)) { + ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file); + if (IS_ERR(ei)) { pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } - if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file, - &ftrace_enable_fops); - -#ifdef CONFIG_PERF_EVENTS - if (call->event.type && call->class->reg) - eventfs_add_file("id", TRACE_MODE_READ, file->ef, - (void *)(long)call->event.type, - &ftrace_event_id_fops); -#endif + file->ei = ei; ret = event_define_fields(call); if (ret < 0) { @@ -2454,35 +2630,6 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) return ret; } - /* - * Only event directories that can be enabled should have - * triggers or filters. - */ - if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { - eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef, - file, &ftrace_event_filter_fops); - - eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, - file, &event_trigger_fops); - } - -#ifdef CONFIG_HIST_TRIGGERS - eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file, - &event_hist_fops); -#endif -#ifdef CONFIG_HIST_TRIGGERS_DEBUG - eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file, - &event_hist_debug_fops); -#endif - eventfs_add_file("format", TRACE_MODE_READ, file->ef, call, - &ftrace_event_format_fops); - -#ifdef CONFIG_TRACE_EVENT_INJECT - if (call->event.type && call->class->reg) - eventfs_add_file("inject", 0200, file->ef, file, - &event_inject_fops); -#endif - return 0; } @@ -2776,10 +2923,32 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) update_event_fields(call, map[i]); } } + cond_resched(); } up_write(&trace_event_sem); } +static bool event_in_systems(struct trace_event_call *call, + const char *systems) +{ + const char *system; + const char *p; + + if (!systems) + return true; + + system = call->class->system; + p = strstr(systems, system); + if (!p) + return false; + + if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',') + return false; + + p += strlen(system); + return !*p || isspace(*p) || *p == ','; +} + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -2789,9 +2958,12 @@ trace_create_new_event(struct trace_event_call *call, struct trace_event_file *file; unsigned int first; + if (!event_in_systems(call, tr->system_names)) + return NULL; + file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) - return NULL; + return ERR_PTR(-ENOMEM); pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); @@ -2808,6 +2980,7 @@ trace_create_new_event(struct trace_event_call *call, atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); + event_file_get(file); return file; } @@ -2829,7 +3002,7 @@ static __init int setup_trace_triggers(char *str) int i; strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event triggers"); buf = bootup_trigger_buf; @@ -2855,8 +3028,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) struct trace_event_file *file; file = trace_create_new_event(call, tr); + /* + * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed + * allocation, or NULL if the event is not part of the tr->system_names. + * When the event is not part of the tr->system_names, return zero, not + * an error. + */ if (!file) - return -ENOMEM; + return 0; + + if (IS_ERR(file)) + return PTR_ERR(file); if (eventdir_initialized) return event_create_dir(tr->event_dir, file); @@ -2895,8 +3077,17 @@ __trace_early_add_new_event(struct trace_event_call *call, int ret; file = trace_create_new_event(call, tr); + /* + * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed + * allocation, or NULL if the event is not part of the tr->system_names. + * When the event is not part of the tr->system_names, return zero, not + * an error. + */ if (!file) - return -ENOMEM; + return 0; + + if (IS_ERR(file)) + return PTR_ERR(file); ret = event_define_fields(call); if (ret) @@ -3619,37 +3810,71 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event tracing"); return 1; } __setup("trace_event=", setup_trace_event); +static int events_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (strcmp(name, "enable") == 0) { + *mode = TRACE_MODE_WRITE; + *fops = &ftrace_tr_enable_fops; + return 1; + } + + if (strcmp(name, "header_page") == 0) { + *mode = TRACE_MODE_READ; + *fops = &ftrace_show_header_page_fops; + + } else if (strcmp(name, "header_event") == 0) { + *mode = TRACE_MODE_READ; + *fops = &ftrace_show_header_event_fops; + } else + return 0; + + return 1; +} + /* Expects to have event_mutex held when called */ static int create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { - struct dentry *d_events; + struct eventfs_inode *e_events; struct dentry *entry; - int error = 0; + int nr_entries; + static struct eventfs_entry events_entries[] = { + { + .name = "enable", + .callback = events_callback, + }, + { + .name = "header_page", + .callback = events_callback, + }, + { + .name = "header_event", + .callback = events_callback, + }, + }; entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) return -ENOMEM; - d_events = eventfs_create_events_dir("events", parent); - if (IS_ERR(d_events)) { + nr_entries = ARRAY_SIZE(events_entries); + + e_events = eventfs_create_events_dir("events", parent, events_entries, + nr_entries, tr); + if (IS_ERR(e_events)) { pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } - error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events, - tr, &ftrace_tr_enable_fops); - if (error) - return -ENOMEM; - /* There are not as crucial, just warn if they are not created */ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, @@ -3659,16 +3884,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_notrace_pid_fops); - /* ring buffer internal formats */ - eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events, - ring_buffer_print_page_header, - &ftrace_show_header_fops); - - eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); - - tr->event_dir = d_events; + tr->event_dir = e_events; return 0; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 33264e510d16..0c611b281a5b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2349,6 +2349,9 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) struct event_filter *filter = NULL; int err; + if (file->flags & EVENT_FILE_FL_FREED) + return -ENODEV; + if (!strcmp(strstrip(filter_string), "0")) { filter_disable(file); filter = event_filter(file); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index d06938ae0717..6ece1308d36a 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -774,23 +774,16 @@ static void last_cmd_set(struct trace_event_file *file, char *str) { const char *system = NULL, *name = NULL; struct trace_event_call *call; - int len; if (!str) return; - /* sizeof() contains the nul byte */ - len = sizeof(HIST_PREFIX) + strlen(str); kfree(last_cmd); - last_cmd = kzalloc(len, GFP_KERNEL); + + last_cmd = kasprintf(GFP_KERNEL, HIST_PREFIX "%s", str); if (!last_cmd) return; - strcpy(last_cmd, HIST_PREFIX); - /* Again, sizeof() contains the nul byte */ - len -= sizeof(HIST_PREFIX); - strncat(last_cmd, str, len); - if (file) { call = file->event_call; system = call->class->system; @@ -4812,36 +4805,35 @@ static int parse_actions(struct hist_trigger_data *hist_data) int len; for (i = 0; i < hist_data->attrs->n_actions; i++) { + enum handler_id hid = 0; + char *action_str; + str = hist_data->attrs->action_str[i]; - if ((len = str_has_prefix(str, "onmatch("))) { - char *action_str = str + len; + if ((len = str_has_prefix(str, "onmatch("))) + hid = HANDLER_ONMATCH; + else if ((len = str_has_prefix(str, "onmax("))) + hid = HANDLER_ONMAX; + else if ((len = str_has_prefix(str, "onchange("))) + hid = HANDLER_ONCHANGE; - data = onmatch_parse(tr, action_str); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else if ((len = str_has_prefix(str, "onmax("))) { - char *action_str = str + len; + action_str = str + len; - data = track_data_parse(hist_data, action_str, - HANDLER_ONMAX); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else if ((len = str_has_prefix(str, "onchange("))) { - char *action_str = str + len; + switch (hid) { + case HANDLER_ONMATCH: + data = onmatch_parse(tr, action_str); + break; + case HANDLER_ONMAX: + case HANDLER_ONCHANGE: + data = track_data_parse(hist_data, action_str, hid); + break; + default: + data = ERR_PTR(-EINVAL); + break; + } - data = track_data_parse(hist_data, action_str, - HANDLER_ONCHANGE); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else { - ret = -EINVAL; + if (IS_ERR(data)) { + ret = PTR_ERR(data); break; } @@ -5630,10 +5622,12 @@ static int event_hist_open(struct inode *inode, struct file *file) { int ret; - ret = security_locked_down(LOCKDOWN_TRACEFS); + ret = tracing_open_file_tr(inode, file); if (ret) return ret; + /* Clear private_data to avoid warning in single_open() */ + file->private_data = NULL; return single_open(file, hist_show, file); } @@ -5641,7 +5635,7 @@ const struct file_operations event_hist_fops = { .open = event_hist_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_file_tr, }; #ifdef CONFIG_HIST_TRIGGERS_DEBUG @@ -5907,10 +5901,12 @@ static int event_hist_debug_open(struct inode *inode, struct file *file) { int ret; - ret = security_locked_down(LOCKDOWN_TRACEFS); + ret = tracing_open_file_tr(inode, file); if (ret) return ret; + /* Clear private_data to avoid warning in single_open() */ + file->private_data = NULL; return single_open(file, hist_debug_show, file); } @@ -5918,7 +5914,7 @@ const struct file_operations event_hist_debug_fops = { .open = event_hist_debug_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_file_tr, }; #endif diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index abe805d471eb..8650562bdaa9 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -328,7 +328,8 @@ event_inject_read(struct file *file, char __user *buf, size_t size, } const struct file_operations event_inject_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = event_inject_read, .write = event_inject_write, + .release = tracing_release_file_tr, }; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 9897d0bfcab7..e7af286af4f1 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -337,7 +337,7 @@ static void print_synth_event_num_val(struct trace_seq *s, break; default: - trace_seq_printf(s, print_fmt, name, val, space); + trace_seq_printf(s, print_fmt, name, val->as_u64, space); break; } } @@ -452,7 +452,7 @@ static unsigned int trace_string(struct synth_trace_event *entry, #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)str_val < TASK_SIZE) - ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX); + ret = strncpy_from_user_nofault(str_field, (const void __user *)str_val, STR_VAR_LEN_MAX); else #endif ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX); @@ -1137,7 +1137,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_fields); * @cmd: A pointer to the dynevent_cmd struct representing the new event * @name: The name of the synthetic event * @mod: The module creating the event, NULL if not created from a module - * @args: Variable number of arg (pairs), one pair for each field + * @...: Variable number of arg (pairs), one pair for each field * * NOTE: Users normally won't want to call this function directly, but * rather use the synth_event_gen_cmd_start() wrapper, which @@ -1695,7 +1695,7 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state) * synth_event_trace - Trace a synthetic event * @file: The trace_event_file representing the synthetic event * @n_vals: The number of values in vals - * @args: Variable number of args containing the event values + * @...: Variable number of args containing the event values * * Trace a synthetic event using the values passed in the variable * argument list. diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 46439e3bcec4..b33c3861fbbb 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1470,8 +1470,10 @@ register_snapshot_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { - if (tracing_alloc_snapshot_instance(file->tr) != 0) - return 0; + int ret = tracing_alloc_snapshot_instance(file->tr); + + if (ret < 0) + return ret; return register_trigger(glob, data, file); } diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 6f046650e527..e76f5e1efdf2 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -50,18 +50,6 @@ #define EVENT_STATUS_OTHER BIT(7) /* - * User register flags are not allowed yet, keep them here until we are - * ready to expose them out to the user ABI. - */ -enum user_reg_flag { - /* Event will not delete upon last reference closing */ - USER_EVENT_REG_PERSIST = 1U << 0, - - /* This value or above is currently non-ABI */ - USER_EVENT_REG_MAX = 1U << 1, -}; - -/* * Stores the system name, tables, and locks for a group of events. This * allows isolation for events by various means. */ @@ -127,8 +115,13 @@ struct user_event_enabler { /* Bit 7 is for freeing status of enablement */ #define ENABLE_VAL_FREEING_BIT 7 -/* Only duplicate the bit value */ -#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK +/* Bit 8 is for marking 32-bit on 64-bit */ +#define ENABLE_VAL_32_ON_64_BIT 8 + +#define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT) + +/* Only duplicate the bit and compat values */ +#define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK) #define ENABLE_BITOPS(e) (&(e)->values) @@ -174,6 +167,30 @@ struct user_event_validator { int flags; }; +static inline void align_addr_bit(unsigned long *addr, int *bit, + unsigned long *flags) +{ + if (IS_ALIGNED(*addr, sizeof(long))) { +#ifdef __BIG_ENDIAN + /* 32 bit on BE 64 bit requires a 32 bit offset when aligned. */ + if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags)) + *bit += 32; +#endif + return; + } + + *addr = ALIGN_DOWN(*addr, sizeof(long)); + + /* + * We only support 32 and 64 bit values. The only time we need + * to align is a 32 bit value on a 64 bit kernel, which on LE + * is always 32 bits, and on BE requires no change when unaligned. + */ +#ifdef __LITTLE_ENDIAN + *bit += 32; +#endif +} + typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, void *tpdata, bool *faulted); @@ -191,6 +208,17 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static bool user_event_capable(u16 reg_flags) +{ + /* Persistent events require CAP_PERFMON / CAP_SYS_ADMIN */ + if (reg_flags & USER_EVENT_REG_PERSIST) { + if (!perfmon_capable()) + return false; + } + + return true; +} + static struct user_event *user_event_get(struct user_event *user) { refcount_inc(&user->refcnt); @@ -482,6 +510,7 @@ static int user_event_enabler_write(struct user_event_mm *mm, unsigned long *ptr; struct page *page; void *kaddr; + int bit = ENABLE_BIT(enabler); int ret; lockdep_assert_held(&event_mutex); @@ -497,6 +526,8 @@ static int user_event_enabler_write(struct user_event_mm *mm, test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)))) return -EBUSY; + align_addr_bit(&uaddr, &bit, ENABLE_BITOPS(enabler)); + ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, &page, NULL); @@ -515,9 +546,9 @@ static int user_event_enabler_write(struct user_event_mm *mm, /* Update bit atomically, user tracers must be atomic as well */ if (enabler->event && enabler->event->status) - set_bit(ENABLE_BIT(enabler), ptr); + set_bit(bit, ptr); else - clear_bit(ENABLE_BIT(enabler), ptr); + clear_bit(bit, ptr); kunmap_local(kaddr); unpin_user_pages_dirty_lock(&page, 1, true); @@ -849,6 +880,12 @@ static struct user_event_enabler enabler->event = user; enabler->addr = uaddr; enabler->values = reg->enable_bit; + +#if BITS_PER_LONG >= 64 + if (reg->enable_size == 4) + set_bit(ENABLE_VAL_32_ON_64_BIT, ENABLE_BITOPS(enabler)); +#endif + retry: /* Prevents state changes from racing with new enablers */ mutex_lock(&event_mutex); @@ -1773,6 +1810,9 @@ static int user_event_free(struct dyn_event *ev) if (!user_event_last_ref(user)) return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; + return destroy_user_event(user); } @@ -1888,10 +1928,13 @@ static int user_event_parse(struct user_event_group *group, char *name, int argc = 0; char **argv; - /* User register flags are not ready yet */ - if (reg_flags != 0 || flags != NULL) + /* Currently don't support any text based flags */ + if (flags != NULL) return -EINVAL; + if (!user_event_capable(reg_flags)) + return -EPERM; + /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); user = find_user_event(group, name, &key); @@ -2024,6 +2067,9 @@ static int delete_user_event(struct user_event_group *group, char *name) if (!user_event_last_ref(user)) return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; + return destroy_user_event(user); } @@ -2131,14 +2177,12 @@ static int user_events_open(struct inode *node, struct file *file) static ssize_t user_events_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) { - struct iovec iov; struct iov_iter i; if (unlikely(*ppos != 0)) return -EFAULT; - if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf, - count, &iov, &i))) + if (unlikely(import_ubuf(ITER_SOURCE, (char __user *)ubuf, count, &i))) return -EFAULT; return user_events_write_core(file, &i); @@ -2377,7 +2421,8 @@ static long user_unreg_get(struct user_unreg __user *ureg, } static int user_event_mm_clear_bit(struct user_event_mm *user_mm, - unsigned long uaddr, unsigned char bit) + unsigned long uaddr, unsigned char bit, + unsigned long flags) { struct user_event_enabler enabler; int result; @@ -2385,7 +2430,7 @@ static int user_event_mm_clear_bit(struct user_event_mm *user_mm, memset(&enabler, 0, sizeof(enabler)); enabler.addr = uaddr; - enabler.values = bit; + enabler.values = bit | flags; retry: /* Prevents state changes from racing with new enablers */ mutex_lock(&event_mutex); @@ -2415,6 +2460,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) struct user_event_mm *mm = current->user_event_mm; struct user_event_enabler *enabler, *next; struct user_unreg reg; + unsigned long flags; long ret; ret = user_unreg_get(ureg, ®); @@ -2425,6 +2471,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) if (!mm) return -ENOENT; + flags = 0; ret = -ENOENT; /* @@ -2441,6 +2488,9 @@ static long user_events_ioctl_unreg(unsigned long uarg) ENABLE_BIT(enabler) == reg.disable_bit) { set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); + /* We must keep compat flags for the clear */ + flags |= enabler->values & ENABLE_VAL_COMPAT_MASK; + if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) user_event_enabler_destroy(enabler, true); @@ -2454,7 +2504,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) /* Ensure bit is now cleared for user, regardless of event status */ if (!ret) ret = user_event_mm_clear_bit(mm, reg.disable_addr, - reg.disable_bit); + reg.disable_bit, flags); return ret; } diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 8bfe23af9c73..7d2ddbcfa377 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -927,11 +927,12 @@ static int parse_symbol_and_return(int argc, const char *argv[], for (i = 2; i < argc; i++) { tmp = strstr(argv[i], "$retval"); if (tmp && !isalnum(tmp[7]) && tmp[7] != '_') { + if (is_tracepoint) { + trace_probe_log_set_index(i); + trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE); + return -EINVAL; + } *is_return = true; - /* - * NOTE: Don't check is_tracepoint here, because it will - * be checked when the argument is parsed. - */ break; } } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3d7a180a8427..c4c6e0e0068b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -487,8 +487,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) return -EINVAL; if (within_notrace_func(tk)) { - pr_warn("Could not probe notrace function %s\n", - trace_kprobe_symbol(tk)); + pr_warn("Could not probe notrace function %ps\n", + (void *)trace_kprobe_address(tk)); return -EINVAL; } @@ -705,6 +705,41 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; +static int count_symbols(void *data, unsigned long unused) +{ + unsigned int *count = data; + + (*count)++; + + return 0; +} + +struct sym_count_ctx { + unsigned int count; + const char *name; +}; + +static int count_mod_symbols(void *data, const char *name, unsigned long unused) +{ + struct sym_count_ctx *ctx = data; + + if (strcmp(name, ctx->name) == 0) + ctx->count++; + + return 0; +} + +static unsigned int number_of_same_symbols(char *func_name) +{ + struct sym_count_ctx ctx = { .count = 0, .name = func_name }; + + kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + + module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx); + + return ctx.count; +} + static int __trace_kprobe_create(int argc, const char *argv[]) { /* @@ -836,6 +871,31 @@ static int __trace_kprobe_create(int argc, const char *argv[]) } } + if (symbol && !strchr(symbol, ':')) { + unsigned int count; + + count = number_of_same_symbols(symbol); + if (count > 1) { + /* + * Users should use ADDR to remove the ambiguity of + * using KSYM only. + */ + trace_probe_log_err(0, NON_UNIQ_SYMBOL); + ret = -EADDRNOTAVAIL; + + goto error; + } else if (count == 0) { + /* + * We can return ENOENT earlier than when register the + * kprobe. + */ + trace_probe_log_err(0, BAD_PROBE_ADDR); + ret = -ENOENT; + + goto error; + } + } + trace_probe_log_set_index(0); if (event) { ret = traceprobe_parse_event_name(&event, &group, gbuf, @@ -960,10 +1020,10 @@ EXPORT_SYMBOL_GPL(kprobe_event_cmd_init); /** * __kprobe_event_gen_cmd_start - Generate a kprobe event command from arg list * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @kretprobe: Is this a return probe? * @name: The name of the kprobe event * @loc: The location of the kprobe event - * @kretprobe: Is this a return probe? - * @args: Variable number of arg (pairs), one pair for each field + * @...: Variable number of arg (pairs), one pair for each field * * NOTE: Users normally won't want to call this function directly, but * rather use the kprobe_event_gen_cmd_start() wrapper, which automatically @@ -1036,7 +1096,7 @@ EXPORT_SYMBOL_GPL(__kprobe_event_gen_cmd_start); /** * __kprobe_event_add_fields - Add probe fields to a kprobe command from arg list * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @args: Variable number of arg (pairs), one pair for each field + * @...: Variable number of arg (pairs), one pair for each field * * NOTE: Users normally won't want to call this function directly, but * rather use the kprobe_event_add_fields() wrapper, which @@ -1189,6 +1249,12 @@ static const struct file_operations kprobe_events_ops = { .write = probes_write, }; +static unsigned long trace_kprobe_missed(struct trace_kprobe *tk) +{ + return trace_kprobe_is_return(tk) ? + tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed; +} + /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { @@ -1200,8 +1266,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) return 0; tk = to_trace_kprobe(ev); - nmissed = trace_kprobe_is_return(tk) ? - tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed; + nmissed = trace_kprobe_missed(tk); seq_printf(m, " %-44s %15lu %15lu\n", trace_probe_name(&tk->tp), trace_kprobe_nhit(tk), @@ -1547,7 +1612,8 @@ NOKPROBE_SYMBOL(kretprobe_perf_func); int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, const char **symbol, u64 *probe_offset, - u64 *probe_addr, bool perf_type_tracepoint) + u64 *probe_addr, unsigned long *missed, + bool perf_type_tracepoint) { const char *pevent = trace_event_name(event->tp_event); const char *group = event->tp_event->class->system; @@ -1566,6 +1632,8 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, *probe_addr = kallsyms_show_value(current_cred()) ? (unsigned long)tk->rp.kp.addr : 0; *symbol = tk->symbol; + if (missed) + *missed = trace_kprobe_missed(tk); return 0; } #endif /* CONFIG_PERF_EVENTS */ @@ -1695,6 +1763,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) } #ifdef CONFIG_PERF_EVENTS + /* create a trace_kprobe, but don't add it to global lists */ struct trace_event_call * create_local_trace_kprobe(char *func, void *addr, unsigned long offs, @@ -1705,6 +1774,24 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, int ret; char *event; + if (func) { + unsigned int count; + + count = number_of_same_symbols(func); + if (count > 1) + /* + * Users should use addr to remove the ambiguity of + * using func only. + */ + return ERR_PTR(-EADDRNOTAVAIL); + else if (count == 0) + /* + * We can return ENOENT earlier than when register the + * kprobe. + */ + return ERR_PTR(-ENOENT); + } + /* * local trace_kprobes are not added to dyn_event, so they are never * searched in find_trace_kprobe(). Therefore, there is no concern of diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index bd0d01d00fb9..a8e28f9b9271 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2444,6 +2444,9 @@ static int timerlat_fd_open(struct inode *inode, struct file *file) tlat = this_cpu_tmr_var(); tlat->count = 0; + hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); + tlat->timer.function = timerlat_irq; + migrate_enable(); return 0; }; @@ -2526,9 +2529,6 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, tlat->tracing_thread = false; tlat->kthread = current; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; - /* Annotate now to drift new period */ tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index db575094c498..3e7fa44dc2b2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -404,7 +404,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, vmstart = vma->vm_start; } if (file) { - ret = trace_seq_path(s, &file->f_path); + ret = trace_seq_path(s, file_user_path(file)); if (ret) trace_seq_printf(s, "[+0x%lx]", ip - vmstart); @@ -1587,11 +1587,12 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, { struct print_entry *field; struct trace_seq *s = &iter->seq; + int max = iter->ent_size - offsetof(struct print_entry, buf); trace_assign_type(field, iter->ent); seq_print_ip_sym(s, field->ip, flags); - trace_seq_printf(s, ": %s", field->buf); + trace_seq_printf(s, ": %.*s", max, field->buf); return trace_handle_return(s); } @@ -1600,10 +1601,11 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, struct trace_event *event) { struct print_entry *field; + int max = iter->ent_size - offsetof(struct print_entry, buf); trace_assign_type(field, iter->ent); - trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf); + trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf); return trace_handle_return(&iter->seq); } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 4dc74d73fc1d..34289f9c6707 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1159,9 +1159,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, if (!(ctx->flags & TPARG_FL_TEVENT) && (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || strncmp(arg, "\\\"", 2) == 0)) { - /* The type of $comm must be "string", and not an array. */ - if (parg->count || (t && strcmp(t, "string"))) + /* The type of $comm must be "string", and not an array type. */ + if (parg->count || (t && strcmp(t, "string"))) { + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), + NEED_STRING_TYPE); goto out; + } parg->type = find_fetch_type("string", ctx->flags); } else parg->type = find_fetch_type(t, ctx->flags); @@ -1169,18 +1172,6 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE); goto out; } - parg->offset = *size; - *size += parg->type->size * (parg->count ?: 1); - - ret = -ENOMEM; - if (parg->count) { - len = strlen(parg->type->fmttype) + 6; - parg->fmt = kmalloc(len, GFP_KERNEL); - if (!parg->fmt) - goto out; - snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, - parg->count); - } code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) @@ -1204,6 +1195,19 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, goto fail; } } + parg->offset = *size; + *size += parg->type->size * (parg->count ?: 1); + + if (parg->count) { + len = strlen(parg->type->fmttype) + 6; + parg->fmt = kmalloc(len, GFP_KERNEL); + if (!parg->fmt) { + ret = -ENOMEM; + goto out; + } + snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, + parg->count); + } ret = -EINVAL; /* Store operation */ diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 02b432ae7513..c1877d018269 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -450,6 +450,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_MAXACT, "Invalid maxactive number"), \ C(MAXACT_TOO_BIG, "Maxactive is too big"), \ C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ + C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ C(NO_TRACEPOINT, "Tracepoint is not found"), \ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ @@ -514,7 +515,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_HYPHEN, "Failed to parse single hyphen. Forgot '>'?"), \ C(NO_BTF_FIELD, "This field is not found."), \ C(BAD_BTF_TID, "Failed to get BTF type info."),\ - C(BAD_TYPE4STR, "This type does not fit for string."), + C(BAD_TYPE4STR, "This type does not fit for string."),\ + C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index bac06ee3b98b..c158d65a8a88 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -13,9 +13,6 @@ * trace_seq_init() more than once to reset the trace_seq to start * from scratch. * - * The buffer size is currently PAGE_SIZE, although it may become dynamic - * in the future. - * * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into * the buffer but it wont update the pointers). This allows users to @@ -370,8 +367,12 @@ EXPORT_SYMBOL_GPL(trace_seq_path); */ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) { + int ret; __trace_seq_init(s); - return seq_buf_to_user(&s->seq, ubuf, cnt); + ret = seq_buf_to_user(&s->seq, ubuf, s->readpos, cnt); + if (ret > 0) + s->readpos += ret; + return ret; } EXPORT_SYMBOL_GPL(trace_seq_to_user); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index de753403cdaf..9c581d6da843 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -556,7 +556,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re { struct syscall_tp_t { struct trace_entry ent; - unsigned long syscall_nr; + int syscall_nr; unsigned long args[SYSCALL_DEFINE_MAXARGS]; } __aligned(8) param; int i; @@ -661,7 +661,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg { struct syscall_tp_t { struct trace_entry ent; - unsigned long syscall_nr; + int syscall_nr; unsigned long ret; } __aligned(8) param; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 99c051de412a..a84b85d8aac1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -151,7 +151,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base) return -ENOMEM; if (addr == FETCH_TOKEN_COMM) - ret = strlcpy(dst, current->comm, maxlen); + ret = strscpy(dst, current->comm, maxlen); else ret = strncpy_from_user(dst, src, maxlen); if (ret >= 0) { diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index c774e560f2f9..a4dcf0f24352 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) } memcpy(elt->key, key, map->key_size); - entry->val = elt; + /* + * Ensure the initialization is visible and + * publish the elt. + */ + smp_wmb(); + WRITE_ONCE(entry->val, elt); atomic64_inc(&map->hits); return entry->val; diff --git a/kernel/up.c b/kernel/up.c index a38b8b095251..df50828cc2f0 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int smp_call_function_single_async(int cpu, struct __call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd) { unsigned long flags; diff --git a/kernel/user.c b/kernel/user.c index d667debeafd6..03cedc366dc9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -18,8 +18,18 @@ #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> +#include <linux/binfmts.h> #include <linux/proc_ns.h> +#if IS_ENABLED(CONFIG_BINFMT_MISC) +struct binfmt_misc init_binfmt_misc = { + .entries = LIST_HEAD_INIT(init_binfmt_misc.entries), + .enabled = true, + .entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock), +}; +EXPORT_SYMBOL_GPL(init_binfmt_misc); +#endif + /* * userns count is 1 for root user, 1 for init_uts_ns, * and 1 for... ? @@ -67,6 +77,9 @@ struct user_namespace init_user_ns = { .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif +#if IS_ENABLED(CONFIG_BINFMT_MISC) + .binfmt_misc = &init_binfmt_misc, +#endif }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 1d8e47bed3f1..ce4d99df5f0e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,7 +22,7 @@ #include <linux/bsearch.h> #include <linux/sort.h> -static struct kmem_cache *user_ns_cachep __read_mostly; +static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, @@ -213,6 +213,9 @@ static void free_user_ns(struct work_struct *work) kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } +#if IS_ENABLED(CONFIG_BINFMT_MISC) + kfree(ns->binfmt_misc); +#endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); @@ -228,7 +231,7 @@ void __put_user_ns(struct user_namespace *ns) } EXPORT_SYMBOL(__put_user_ns); -/** +/* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ @@ -238,7 +241,7 @@ struct idmap_key { u32 count; /* == 0 unless used with map_id_range_down() */ }; -/** +/* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ @@ -268,7 +271,7 @@ static int cmp_map_id(const void *k, const void *e) return 1; } -/** +/* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ @@ -285,7 +288,7 @@ map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 cou sizeof(struct uid_gid_extent), cmp_map_id); } -/** +/* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -329,12 +332,12 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -static u32 map_id_down(struct uid_gid_map *map, u32 id) +u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } -/** +/* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -355,7 +358,7 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) return NULL; } -/** +/* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ @@ -372,7 +375,7 @@ map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) sizeof(struct uid_gid_extent), cmp_map_id); } -static u32 map_id_up(struct uid_gid_map *map, u32 id) +u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; @@ -767,7 +770,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } -/** +/* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -836,7 +839,7 @@ static int cmp_extents_reverse(const void *a, const void *b) return 0; } -/** +/* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index d0b6b390ee42..03b90d7d2175 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -270,7 +270,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; ret = -ENOMEM; - pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto error; @@ -331,7 +331,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, filter.__reserved != 0) return -EINVAL; - tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf)); + tf = memdup_array_user(_filter->filters, filter.nr_filters, sizeof(*tf)); if (IS_ERR(tf)) return PTR_ERR(tf); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d145305d95fe..81a8862295d6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -91,7 +91,7 @@ static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts); static DEFINE_PER_CPU(int, hrtimer_interrupts_saved); static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched); -static unsigned long watchdog_hardlockup_all_cpu_dumped; +static unsigned long hard_lockup_nmi_warn; notrace void arch_touch_nmi_watchdog(void) { @@ -151,12 +151,32 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) */ if (is_hardlockup(cpu)) { unsigned int this_cpu = smp_processor_id(); + unsigned long flags; /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) return; + /* + * Prevent multiple hard-lockup reports if one cpu is already + * engaged in dumping all cpu back traces. + */ + if (sysctl_hardlockup_all_cpu_backtrace) { + if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) + return; + } + + /* + * NOTE: we call printk_cpu_sync_get_irqsave() after printing + * the lockup message. While it would be nice to serialize + * that printout, we really want to make sure that if some + * other CPU somehow locked up while holding the lock associated + * with printk_cpu_sync_get_irqsave() that we can still at least + * get the message about the lockup out. + */ pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu); + printk_cpu_sync_get_irqsave(flags); + print_modules(); print_irqtrace_events(current); if (cpu == this_cpu) { @@ -164,17 +184,17 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) show_regs(regs); else dump_stack(); + printk_cpu_sync_put_irqrestore(flags); } else { + printk_cpu_sync_put_irqrestore(flags); trigger_single_cpu_backtrace(cpu); } - /* - * Perform multi-CPU dump only once to avoid multiple - * hardlockups generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped)) + if (sysctl_hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); + if (!hardlockup_panic) + clear_bit_unlock(0, &hard_lockup_nmi_warn); + } if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); @@ -283,6 +303,13 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static unsigned long soft_lockup_nmi_warn; +static int __init softlockup_panic_setup(char *str) +{ + softlockup_panic = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("softlockup_panic=", softlockup_panic_setup); + static int __init nowatchdog_setup(char *str) { watchdog_user_enabled = 0; @@ -441,6 +468,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; + unsigned long flags; if (!watchdog_enabled) return HRTIMER_NORESTART; @@ -507,6 +535,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* Start period for the next softlockup warning. */ update_report_ts(); + printk_cpu_sync_get_irqsave(flags); pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -516,10 +545,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) show_regs(regs); else dump_stack(); + printk_cpu_sync_put_irqrestore(flags); if (softlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(smp_processor_id()); - clear_bit_unlock(0, &soft_lockup_nmi_warn); + if (!softlockup_panic) + clear_bit_unlock(0, &soft_lockup_nmi_warn); } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c85825e17df8..76e60faed892 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -381,6 +381,12 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; +/* PL: user requested unbound cpumask via sysfs */ +static cpumask_var_t wq_requested_unbound_cpumask; + +/* PL: isolated cpumask to be excluded from unbound cpumask */ +static cpumask_var_t wq_isolated_cpumask; + /* for further constrain wq_unbound_cpumask by cmdline parameter*/ static struct cpumask wq_cmdline_cpumask __initdata; @@ -418,21 +424,21 @@ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; * process context while holding a pool lock. Bounce to a dedicated kthread * worker to avoid A-A deadlocks. */ -static struct kthread_worker *pwq_release_worker; +static struct kthread_worker *pwq_release_worker __ro_after_init; -struct workqueue_struct *system_wq __read_mostly; +struct workqueue_struct *system_wq __ro_after_init; EXPORT_SYMBOL(system_wq); -struct workqueue_struct *system_highpri_wq __read_mostly; +struct workqueue_struct *system_highpri_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_highpri_wq); -struct workqueue_struct *system_long_wq __read_mostly; +struct workqueue_struct *system_long_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_long_wq); -struct workqueue_struct *system_unbound_wq __read_mostly; +struct workqueue_struct *system_unbound_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_unbound_wq); -struct workqueue_struct *system_freezable_wq __read_mostly; +struct workqueue_struct *system_freezable_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_wq); -struct workqueue_struct *system_power_efficient_wq __read_mostly; +struct workqueue_struct *system_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_power_efficient_wq); -struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; +struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); @@ -1684,9 +1690,6 @@ static int wq_select_unbound_cpu(int cpu) pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); } - if (cpumask_empty(wq_unbound_cpumask)) - return cpu; - new_cpu = __this_cpu_read(wq_rr_cpu_last); new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); if (unlikely(new_cpu >= nr_cpu_ids)) { @@ -2166,7 +2169,7 @@ static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker; int id; - char id_buf[16]; + char id_buf[23]; /* ID is needed to determine kthread name */ id = ida_alloc(&pool->worker_ida, GFP_KERNEL); @@ -4411,19 +4414,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) mutex_unlock(&ctx->wq->mutex); } -static void apply_wqattrs_lock(void) -{ - /* CPUs should stay stable across pwq creations and installations */ - cpus_read_lock(); - mutex_lock(&wq_pool_mutex); -} - -static void apply_wqattrs_unlock(void) -{ - mutex_unlock(&wq_pool_mutex); - cpus_read_unlock(); -} - static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { @@ -4600,12 +4590,22 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) } cpus_read_unlock(); + /* for unbound pwq, flush the pwq_release_worker ensures that the + * pwq_release_workfn() completes before calling kfree(wq). + */ + if (ret) + kthread_flush_worker(pwq_release_worker); + return ret; enomem: if (wq->cpu_pwq) { - for_each_possible_cpu(cpu) - kfree(*per_cpu_ptr(wq->cpu_pwq, cpu)); + for_each_possible_cpu(cpu) { + struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); + + if (pwq) + kmem_cache_free(pwq_cache, pwq); + } free_percpu(wq->cpu_pwq); wq->cpu_pwq = NULL; } @@ -5612,50 +5612,54 @@ static void work_for_cpu_fn(struct work_struct *work) } /** - * work_on_cpu - run a function in thread context on a particular cpu + * work_on_cpu_key - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg + * @key: The lock class key for lock debugging purposes * * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ -long work_on_cpu(int cpu, long (*fn)(void *), void *arg) +long work_on_cpu_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; - INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key); schedule_work_on(cpu, &wfc.work); flush_work(&wfc.work); destroy_work_on_stack(&wfc.work); return wfc.ret; } -EXPORT_SYMBOL_GPL(work_on_cpu); +EXPORT_SYMBOL_GPL(work_on_cpu_key); /** - * work_on_cpu_safe - run a function in thread context on a particular cpu + * work_on_cpu_safe_key - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function argument + * @key: The lock class key for lock debugging purposes * * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold * any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ -long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +long work_on_cpu_safe_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key) { long ret = -ENODEV; cpus_read_lock(); if (cpu_online(cpu)) - ret = work_on_cpu(cpu, fn, arg); + ret = work_on_cpu_key(cpu, fn, arg, key); cpus_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(work_on_cpu_safe); +EXPORT_SYMBOL_GPL(work_on_cpu_safe_key); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER @@ -5782,9 +5786,13 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) list_for_each_entry(wq, &workqueues, list) { if (!(wq->flags & WQ_UNBOUND)) continue; + /* creating multiple pwqs breaks ordering guarantee */ - if (wq->flags & __WQ_ORDERED) - continue; + if (!list_empty(&wq->pwqs)) { + if (wq->flags & __WQ_ORDERED_EXPLICIT) + continue; + wq->flags &= ~__WQ_ORDERED; + } ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); if (IS_ERR(ctx)) { @@ -5810,39 +5818,40 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) } /** - * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask - * @cpumask: the cpumask to set + * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask + * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask * - * The low-level workqueues cpumask is a global cpumask that limits - * the affinity of all unbound workqueues. This function check the @cpumask - * and apply it to all unbound workqueues and updates all pwqs of them. - * - * Return: 0 - Success - * -EINVAL - Invalid @cpumask - * -ENOMEM - Failed to allocate memory for attrs or pwqs. + * This function can be called from cpuset code to provide a set of isolated + * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold + * either cpus_read_lock or cpus_write_lock. */ -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) { - int ret = -EINVAL; + cpumask_var_t cpumask; + int ret = 0; + + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + lockdep_assert_cpus_held(); + mutex_lock(&wq_pool_mutex); + + /* Save the current isolated cpumask & export it via sysfs */ + cpumask_copy(wq_isolated_cpumask, exclude_cpumask); /* - * Not excluding isolated cpus on purpose. - * If the user wishes to include them, we allow that. + * If the operation fails, it will fall back to + * wq_requested_unbound_cpumask which is initially set to + * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten + * by any subsequent write to workqueue/cpumask sysfs file. */ - cpumask_and(cpumask, cpumask, cpu_possible_mask); - if (!cpumask_empty(cpumask)) { - apply_wqattrs_lock(); - if (cpumask_equal(cpumask, wq_unbound_cpumask)) { - ret = 0; - goto out_unlock; - } - + if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask)) + cpumask_copy(cpumask, wq_requested_unbound_cpumask); + if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); -out_unlock: - apply_wqattrs_unlock(); - } - + mutex_unlock(&wq_pool_mutex); + free_cpumask_var(cpumask); return ret; } @@ -5964,6 +5973,19 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); +static void apply_wqattrs_lock(void) +{ + /* CPUs should stay stable across pwq creations and installations */ + cpus_read_lock(); + mutex_lock(&wq_pool_mutex); +} + +static void apply_wqattrs_unlock(void) +{ + mutex_unlock(&wq_pool_mutex); + cpus_read_unlock(); +} + static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -6140,19 +6162,74 @@ static struct bus_type wq_subsys = { .dev_groups = wq_sysfs_groups, }; -static ssize_t wq_unbound_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +/** + * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask + * @cpumask: the cpumask to set + * + * The low-level workqueues cpumask is a global cpumask that limits + * the affinity of all unbound workqueues. This function check the @cpumask + * and apply it to all unbound workqueues and updates all pwqs of them. + * + * Return: 0 - Success + * -EINVAL - Invalid @cpumask + * -ENOMEM - Failed to allocate memory for attrs or pwqs. + */ +static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +{ + int ret = -EINVAL; + + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ + cpumask_and(cpumask, cpumask, cpu_possible_mask); + if (!cpumask_empty(cpumask)) { + apply_wqattrs_lock(); + cpumask_copy(wq_requested_unbound_cpumask, cpumask); + if (cpumask_equal(cpumask, wq_unbound_cpumask)) { + ret = 0; + goto out_unlock; + } + + ret = workqueue_apply_unbound_cpumask(cpumask); + +out_unlock: + apply_wqattrs_unlock(); + } + + return ret; +} + +static ssize_t __wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf, cpumask_var_t mask) { int written; mutex_lock(&wq_pool_mutex); - written = scnprintf(buf, PAGE_SIZE, "%*pb\n", - cpumask_pr_args(wq_unbound_cpumask)); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); mutex_unlock(&wq_pool_mutex); return written; } +static ssize_t wq_unbound_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); +} + +static ssize_t wq_requested_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); +} + +static ssize_t wq_isolated_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); +} + static ssize_t wq_unbound_cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -6170,9 +6247,13 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev, return ret ? ret : count; } -static struct device_attribute wq_sysfs_cpumask_attr = +static struct device_attribute wq_sysfs_cpumask_attrs[] = { __ATTR(cpumask, 0644, wq_unbound_cpumask_show, - wq_unbound_cpumask_store); + wq_unbound_cpumask_store), + __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL), + __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL), + __ATTR_NULL, +}; static int __init wq_sysfs_init(void) { @@ -6185,7 +6266,13 @@ static int __init wq_sysfs_init(void) dev_root = bus_get_dev_root(&wq_subsys); if (dev_root) { - err = device_create_file(dev_root, &wq_sysfs_cpumask_attr); + struct device_attribute *attr; + + for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) { + err = device_create_file(dev_root, attr); + if (err) + break; + } put_device(dev_root); } return err; @@ -6497,6 +6584,17 @@ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ +static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask) +{ + if (!cpumask_intersects(wq_unbound_cpumask, mask)) { + pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n", + cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask)); + return; + } + + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask); +} + /** * workqueue_init_early - early init for workqueue subsystem * @@ -6516,11 +6614,16 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); - cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); + cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); + restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) - cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); + + cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); @@ -6535,9 +6638,6 @@ void __init workqueue_init_early(void) BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); - wq_update_pod_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_pod_attrs_buf); - pt->nr_pods = 1; cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); pt->pod_node[0] = NUMA_NO_NODE; @@ -6605,13 +6705,13 @@ static void __init wq_cpu_intensive_thresh_init(void) unsigned long thresh; unsigned long bogo; + pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + BUG_ON(IS_ERR(pwq_release_worker)); + /* if the user set it to a specific value, keep it */ if (wq_cpu_intensive_thresh_us != ULONG_MAX) return; - pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); - BUG_ON(IS_ERR(pwq_release_worker)); - /* * The default of 10ms is derived from the fact that most modern (as of * 2023) processors can do a lot in 10ms and that it's just below what |
