Diffstat (limited to 'kernel')
192 files changed, 8916 insertions, 5569 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index da326800c1c9..88c594c6d7fc 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -16,11 +16,13 @@ config ARCH_HAS_PREEMPT_LAZY choice prompt "Preemption Model" + default PREEMPT_LAZY if ARCH_HAS_PREEMPT_LAZY default PREEMPT_NONE config PREEMPT_NONE bool "No Forced Preemption (Server)" depends on !PREEMPT_RT + depends on ARCH_NO_PREEMPT select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC help This is the traditional Linux preemption model, geared towards @@ -35,6 +37,7 @@ config PREEMPT_NONE config PREEMPT_VOLUNTARY bool "Voluntary Kernel Preemption (Desktop)" + depends on !ARCH_HAS_PREEMPT_LAZY depends on !ARCH_NO_PREEMPT depends on !PREEMPT_RT select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC diff --git a/kernel/Makefile b/kernel/Makefile index e83669841b8c..6785982013dc 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,6 +43,8 @@ KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n UBSAN_SANITIZE_kcov.o := n KMSAN_SANITIZE_kcov.o := n + +CONTEXT_ANALYSIS_kcov.o := y CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector obj-y += sched/ diff --git a/kernel/acct.c b/kernel/acct.c index 2a2b3c874acd..812808e5b1b8 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -218,7 +218,6 @@ static int acct_on(const char __user *name) /* Difference from BSD - they don't do O_APPEND */ const int open_flags = O_WRONLY|O_APPEND|O_LARGEFILE; struct pid_namespace *ns = task_active_pid_ns(current); - struct filename *pathname __free(putname) = getname(name); struct file *original_file __free(fput) = NULL; // in that order struct path internal __free(path_put) = {}; // in that order struct file *file __free(fput_sync) = NULL; // in that order @@ -226,8 +225,7 @@ static int acct_on(const char __user *name) struct vfsmount *mnt; struct fs_pin *old; - if (IS_ERR(pathname)) - return PTR_ERR(pathname); + CLASS(filename, pathname)(name); original_file = file_open_name(pathname, open_flags, 0); if (IS_ERR(original_file)) return PTR_ERR(original_file); diff --git a/kernel/audit.c b/kernel/audit.c index 26a332ffb1b8..592d927e70f9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -32,6 +32,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/file.h> +#include <linux/hex.h> #include <linux/init.h> #include <linux/types.h> #include <linux/atomic.h> @@ -58,6 +59,9 @@ #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <net/netns/generic.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <linux/sctp.h> #include "audit.h" @@ -2488,6 +2492,162 @@ void audit_log_path_denied(int type, const char *operation) audit_log_end(ab); } +int audit_log_nf_skb(struct audit_buffer *ab, + const struct sk_buff *skb, u8 nfproto) +{ + /* find the IP protocol in the case of NFPROTO_BRIDGE */ + if (nfproto == NFPROTO_BRIDGE) { + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + nfproto = NFPROTO_IPV4; + break; + case htons(ETH_P_IPV6): + nfproto = NFPROTO_IPV6; + break; + default: + goto unknown_proto; + } + } + + switch (nfproto) { + case NFPROTO_IPV4: { + struct iphdr iph; + const struct iphdr *ih; + + ih = skb_header_pointer(skb, skb_network_offset(skb), + sizeof(iph), &iph); + if (!ih) + return -ENOMEM; + + switch (ih->protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph; + const struct tcphdr *th; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcph), &_tcph); + if (!th) + return -ENOMEM; + + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu sport=%hu 
dport=%hu", + &ih->saddr, &ih->daddr, ih->protocol, + ntohs(th->source), ntohs(th->dest)); + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + struct udphdr _udph; + const struct udphdr *uh; + + uh = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udph), &_udph); + if (!uh) + return -ENOMEM; + + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu sport=%hu dport=%hu", + &ih->saddr, &ih->daddr, ih->protocol, + ntohs(uh->source), ntohs(uh->dest)); + break; + } + case IPPROTO_SCTP: { + struct sctphdr _sctph; + const struct sctphdr *sh; + + sh = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_sctph), &_sctph); + if (!sh) + return -ENOMEM; + + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu sport=%hu dport=%hu", + &ih->saddr, &ih->daddr, ih->protocol, + ntohs(sh->source), ntohs(sh->dest)); + break; + } + default: + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu", + &ih->saddr, &ih->daddr, ih->protocol); + } + + break; + } + case NFPROTO_IPV6: { + struct ipv6hdr iph; + const struct ipv6hdr *ih; + u8 nexthdr; + __be16 frag_off; + + ih = skb_header_pointer(skb, skb_network_offset(skb), + sizeof(iph), &iph); + if (!ih) + return -ENOMEM; + + nexthdr = ih->nexthdr; + ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(iph), + &nexthdr, &frag_off); + + switch (nexthdr) { + case IPPROTO_TCP: { + struct tcphdr _tcph; + const struct tcphdr *th; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcph), &_tcph); + if (!th) + return -ENOMEM; + + audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu sport=%hu dport=%hu", + &ih->saddr, &ih->daddr, nexthdr, + ntohs(th->source), ntohs(th->dest)); + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + struct udphdr _udph; + const struct udphdr *uh; + + uh = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udph), &_udph); + if (!uh) + return -ENOMEM; + + audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu sport=%hu dport=%hu", + &ih->saddr, &ih->daddr, nexthdr, + ntohs(uh->source), ntohs(uh->dest)); + break; + } + case IPPROTO_SCTP: { + struct sctphdr _sctph; + const struct sctphdr *sh; + + sh = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_sctph), &_sctph); + if (!sh) + return -ENOMEM; + + audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu sport=%hu dport=%hu", + &ih->saddr, &ih->daddr, nexthdr, + ntohs(sh->source), ntohs(sh->dest)); + break; + } + default: + audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", + &ih->saddr, &ih->daddr, nexthdr); + } + + break; + } + default: + goto unknown_proto; + } + + return 0; + +unknown_proto: + audit_log_format(ab, " saddr=? daddr=? proto=?"); + return -EPFNOSUPPORT; +} +EXPORT_SYMBOL(audit_log_nf_skb); + /* global counter which is incremented every time something logs in */ static atomic_t session_id = ATOMIC_INIT(0); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index dd0563a8e0be..86a44b162a87 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2170,29 +2170,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, } /** - * __audit_reusename - fill out filename with info from existing entry - * @uptr: userland ptr to pathname - * - * Search the audit_names list for the current audit context. If there is an - * existing entry with a matching "uptr" then return the filename - * associated with that audit_name. If not, return NULL. 
- */ -struct filename * -__audit_reusename(const __user char *uptr) -{ - struct audit_context *context = audit_context(); - struct audit_names *n; - - list_for_each_entry(n, &context->names_list, list) { - if (!n->name) - continue; - if (n->name->uptr == uptr) - return refname(n->name); - } - return NULL; -} - -/** * __audit_getname - add a name to the list * @name: name to add * @@ -2214,7 +2191,7 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; - refname(name); + name->refcnt++; } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2346,7 +2323,7 @@ out_alloc: return; if (name) { n->name = name; - refname(name); + name->refcnt++; } out: @@ -2468,7 +2445,7 @@ void __audit_inode_child(struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - refname(found_child->name); + found_child->name->refcnt++; } } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 232cbc97434d..79cf22860a99 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -42,7 +42,17 @@ endif ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-$(CONFIG_BPF_SYSCALL) += cpumask.o -obj-${CONFIG_BPF_LSM} += bpf_lsm.o +# bpf_lsm_proto.o must precede bpf_lsm.o. The current pahole logic +# deduplicates function prototypes within +# btf_encoder__add_saved_func() by keeping the first instance seen. We +# need the function prototype(s) in bpf_lsm_proto.o to take precedence +# over those within bpf_lsm.o. Having bpf_lsm_proto.o precede +# bpf_lsm.o ensures its DWARF CU is processed early, forcing the +# generated BTF to contain the overrides. +# +# Notably, this is a temporary workaround whilst the deduplication +# semantics within pahole are revisited accordingly. +obj-${CONFIG_BPF_LSM} += bpf_lsm_proto.o bpf_lsm.o endif ifneq ($(CONFIG_CRYPTO),) obj-$(CONFIG_BPF_SYSCALL) += crypto.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 872dc0e41c65..42fae0a9f314 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -2,11 +2,15 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/cacheflush.h> #include <linux/err.h> +#include <linux/irq_work.h> #include "linux/filter.h" +#include <linux/llist.h> #include <linux/btf_ids.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> +#include <asm/tlbflush.h> #include "range_tree.h" /* @@ -42,14 +46,31 @@ #define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable); + struct bpf_arena { struct bpf_map map; u64 user_vm_start; u64 user_vm_end; struct vm_struct *kern_vm; struct range_tree rt; + /* protects rt */ + rqspinlock_t spinlock; struct list_head vma_list; + /* protects vma_list */ struct mutex lock; + struct irq_work free_irq; + struct work_struct free_work; + struct llist_head free_spans; +}; + +static void arena_free_worker(struct work_struct *work); +static void arena_free_irq(struct irq_work *iw); + +struct arena_free_span { + struct llist_node node; + unsigned long uaddr; + u32 page_cnt; }; u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) @@ -92,6 +113,66 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr) return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT; } +struct apply_range_data { + struct page **pages; + int i; +}; + +static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) +{ + struct apply_range_data *d = data; + struct page *page; + + if (!data) + return 0; + /* sanity check */ + if (unlikely(!pte_none(ptep_get(pte)))) + return -EBUSY; + + page = d->pages[d->i]; + /* paranoia, similar to vmap_pages_pte_range() */ + if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page)))) + return -EINVAL; + + set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); + d->i++; + return 0; +} + +static void flush_vmap_cache(unsigned long start, unsigned long size) +{ + flush_cache_vmap(start, start + size); +} + +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages) +{ + pte_t old_pte; + struct page *page; + + /* sanity check */ + old_pte = ptep_get(pte); + if (pte_none(old_pte) || !pte_present(old_pte)) + return 0; /* nothing to do */ + + page = pte_page(old_pte); + if (WARN_ON_ONCE(!page)) + return -EINVAL; + + pte_clear(&init_mm, addr, pte); + + /* Add page to the list so it is freed later */ + if (free_pages) + __llist_add(&page->pcp_llist, free_pages); + + return 0; +} + +static int populate_pgtable_except_pte(struct bpf_arena *arena) +{ + return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), + KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL); +} + static struct bpf_map *arena_map_alloc(union bpf_attr *attr) { struct vm_struct *kern_vm; @@ -136,6 +217,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) arena->user_vm_end = arena->user_vm_start + vm_range; INIT_LIST_HEAD(&arena->vma_list); + init_llist_head(&arena->free_spans); + init_irq_work(&arena->free_irq, arena_free_irq); + INIT_WORK(&arena->free_work, arena_free_worker); bpf_map_init_from_attr(&arena->map, attr); range_tree_init(&arena->rt); err = range_tree_set(&arena->rt, 0, attr->max_entries); @@ -144,6 +228,13 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) goto err; } mutex_init(&arena->lock); + raw_res_spin_lock_init(&arena->spinlock); + err = populate_pgtable_except_pte(arena); + if (err) { + range_tree_destroy(&arena->rt); + bpf_map_area_free(arena); + goto err; + } return &arena->map; err: @@ -184,6 +275,10 
@@ static void arena_map_free(struct bpf_map *map) if (WARN_ON_ONCE(!list_empty(&arena->vma_list))) return; + /* Ensure no pending deferred frees */ + irq_work_sync(&arena->free_irq); + flush_work(&arena->free_work); + /* * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area(). * It unmaps everything from vmalloc area and clears pgtables. @@ -265,44 +360,59 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) { struct bpf_map *map = vmf->vma->vm_file->private_data; struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + struct mem_cgroup *new_memcg, *old_memcg; struct page *page; long kbase, kaddr; + unsigned long flags; int ret; kbase = bpf_arena_get_kern_vm_start(arena); kaddr = kbase + (u32)(vmf->address); - guard(mutex)(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + /* Make a reasonable effort to address impossible case */ + return VM_FAULT_RETRY; + page = vmalloc_to_page((void *)kaddr); if (page) /* already have a page vmap-ed */ goto out; + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); + if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; + struct apply_range_data data = { .pages = &page, .i = 0 }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; } - ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page); + ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - __free_page(page); - return VM_FAULT_SIGSEGV; + free_pages_nolock(page, 0); + goto out_unlock_sigsegv; } + flush_vmap_cache(kaddr, PAGE_SIZE); + bpf_map_memcg_exit(old_memcg, new_memcg); out: page_ref_add(page, 1); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); vmf->page = page; return 0; +out_unlock_sigsegv: + bpf_map_memcg_exit(old_memcg, new_memcg); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + return VM_FAULT_SIGSEGV; } static const struct vm_operations_struct arena_vm_ops = { @@ -423,12 +533,18 @@ static u64 clear_lo32(u64 val) * Allocate pages and vmap them into kernel vmalloc area. * Later the pages will be mmaped into user space vma. 
*/ -static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id) +static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id, + bool sleepable) { /* user_vm_end/start are fixed before bpf prog runs */ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); - struct page **pages; + struct mem_cgroup *new_memcg, *old_memcg; + struct apply_range_data data; + struct page **pages = NULL; + long remaining, mapped = 0; + long alloc_pages; + unsigned long flags; long pgoff = 0; u32 uaddr32; int ret, i; @@ -445,17 +561,23 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } - /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */ - pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); - if (!pages) + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); + /* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */ + alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *)); + pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE); + if (!pages) { + bpf_map_memcg_exit(old_memcg, new_memcg); return 0; + } + data.pages = pages; - guard(mutex)(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + goto out_free_pages; if (uaddr) { ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); if (ret) - goto out_free_pages; + goto out_unlock_free_pages; ret = range_tree_clear(&arena->rt, pgoff, page_cnt); } else { ret = pgoff = range_tree_find(&arena->rt, page_cnt); @@ -463,33 +585,62 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt ret = range_tree_clear(&arena->rt, pgoff, page_cnt); } if (ret) - goto out_free_pages; - - ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); - if (ret) - goto out; + goto out_unlock_free_pages; + remaining = page_cnt; uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE); - /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1 - * will not overflow 32-bit. Lower 32-bit need to represent - * contiguous user address range. - * Map these pages at kern_vm_start base. - * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow - * lower 32-bit and it's ok. - */ - ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32, - kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages); - if (ret) { - for (i = 0; i < page_cnt; i++) - __free_page(pages[i]); - goto out; + + while (remaining) { + long this_batch = min(remaining, alloc_pages); + + /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */ + memset(pages, 0, this_batch * sizeof(struct page *)); + + ret = bpf_map_alloc_pages(&arena->map, node_id, this_batch, pages); + if (ret) + goto out; + + /* + * Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1 + * will not overflow 32-bit. Lower 32-bit need to represent + * contiguous user address range. + * Map these pages at kern_vm_start base. + * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow + * lower 32-bit and it's ok. 
+ */ + data.i = 0; + ret = apply_to_page_range(&init_mm, + kern_vm_start + uaddr32 + (mapped << PAGE_SHIFT), + this_batch << PAGE_SHIFT, apply_range_set_cb, &data); + if (ret) { + /* data.i pages were mapped, account them and free the remaining */ + mapped += data.i; + for (i = data.i; i < this_batch; i++) + free_pages_nolock(pages[i], 0); + goto out; + } + + mapped += this_batch; + remaining -= this_batch; } - kvfree(pages); + flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + kfree_nolock(pages); + bpf_map_memcg_exit(old_memcg, new_memcg); return clear_lo32(arena->user_vm_start) + uaddr32; out: - range_tree_set(&arena->rt, pgoff, page_cnt); + range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + if (mapped) { + flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); + arena_free_pages(arena, uaddr32, mapped, sleepable); + } + goto out_free_pages; +out_unlock_free_pages: + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); out_free_pages: - kvfree(pages); + kfree_nolock(pages); + bpf_map_memcg_exit(old_memcg, new_memcg); return 0; } @@ -502,42 +653,66 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { struct vma_list *vml; + guard(mutex)(&arena->lock); + /* iterate link list under lock */ list_for_each_entry(vml, &arena->vma_list, head) zap_page_range_single(vml->vma, uaddr, PAGE_SIZE * page_cnt, NULL); } -static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) { + struct mem_cgroup *new_memcg, *old_memcg; u64 full_uaddr, uaddr_end; - long kaddr, pgoff, i; + long kaddr, pgoff; struct page *page; + struct llist_head free_pages; + struct llist_node *pos, *t; + struct arena_free_span *s; + unsigned long flags; + int ret = 0; /* only aligned lower 32-bit are relevant */ uaddr = (u32)uaddr; uaddr &= PAGE_MASK; + kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; full_uaddr = clear_lo32(arena->user_vm_start) + uaddr; uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT)); if (full_uaddr >= uaddr_end) return; page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; + pgoff = compute_pgoff(arena, uaddr); + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); - guard(mutex)(&arena->lock); + if (!sleepable) + goto defer; + + ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags); + + /* Can't proceed without holding the spinlock so defer the free */ + if (ret) + goto defer; - pgoff = compute_pgoff(arena, uaddr); - /* clear range */ range_tree_set(&arena->rt, pgoff, page_cnt); + init_llist_head(&free_pages); + /* clear ptes and collect struct pages */ + apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, + apply_range_clear_cb, &free_pages); + + /* drop the lock to do the tlb flush and zap pages */ + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + + /* ensure no stale TLB entries */ + flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE)); + if (page_cnt > 1) /* bulk zap if multiple pages being freed */ zap_pages(arena, full_uaddr, page_cnt); - kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; - for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) { - page = vmalloc_to_page((void *)kaddr); - if (!page) - continue; + llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { + page = llist_entry(pos, struct page, 
pcp_llist); if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ /* Optimization for the common case of page_cnt==1: * If page wasn't mapped into some user vma there @@ -545,9 +720,27 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) * page_cnt is big it's faster to do the batched zap. */ zap_pages(arena, full_uaddr, 1); - vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE); __free_page(page); } + bpf_map_memcg_exit(old_memcg, new_memcg); + + return; + +defer: + s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1); + bpf_map_memcg_exit(old_memcg, new_memcg); + if (!s) + /* + * If allocation fails in non-sleepable context, pages are intentionally left + * inaccessible (leaked) until the arena is destroyed. Cleanup or retries are not + * possible here, so we intentionally omit them for safety. + */ + return; + + s->page_cnt = page_cnt; + s->uaddr = uaddr; + llist_add(&s->node, &arena->free_spans); + irq_work_queue(&arena->free_irq); } /* @@ -557,6 +750,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt) { long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + struct mem_cgroup *new_memcg, *old_memcg; + unsigned long flags; long pgoff; int ret; @@ -567,15 +762,94 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt if (pgoff + page_cnt > page_cnt_max) return -EINVAL; - guard(mutex)(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + return -EBUSY; /* Cannot guard already allocated pages. */ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); - if (ret) - return -EBUSY; + if (ret) { + ret = -EBUSY; + goto out; + } /* "Allocate" the region to prevent it from being allocated. 
*/ - return range_tree_clear(&arena->rt, pgoff, page_cnt); + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); + ret = range_tree_clear(&arena->rt, pgoff, page_cnt); + bpf_map_memcg_exit(old_memcg, new_memcg); +out: + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + return ret; +} + +static void arena_free_worker(struct work_struct *work) +{ + struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work); + struct mem_cgroup *new_memcg, *old_memcg; + struct llist_node *list, *pos, *t; + struct arena_free_span *s; + u64 arena_vm_start, user_vm_start; + struct llist_head free_pages; + struct page *page; + unsigned long full_uaddr; + long kaddr, page_cnt, pgoff; + unsigned long flags; + + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) { + schedule_work(work); + return; + } + + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); + + init_llist_head(&free_pages); + arena_vm_start = bpf_arena_get_kern_vm_start(arena); + user_vm_start = bpf_arena_get_user_vm_start(arena); + + list = llist_del_all(&arena->free_spans); + llist_for_each(pos, list) { + s = llist_entry(pos, struct arena_free_span, node); + page_cnt = s->page_cnt; + kaddr = arena_vm_start + s->uaddr; + pgoff = compute_pgoff(arena, s->uaddr); + + /* clear ptes and collect pages in free_pages llist */ + apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, + apply_range_clear_cb, &free_pages); + + range_tree_set(&arena->rt, pgoff, page_cnt); + } + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + + /* Iterate the list again without holding spinlock to do the tlb flush and zap_pages */ + llist_for_each_safe(pos, t, list) { + s = llist_entry(pos, struct arena_free_span, node); + page_cnt = s->page_cnt; + full_uaddr = clear_lo32(user_vm_start) + s->uaddr; + kaddr = arena_vm_start + s->uaddr; + + /* ensure no stale TLB entries */ + flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE)); + + /* remove pages from user vmas */ + zap_pages(arena, full_uaddr, page_cnt); + + kfree_nolock(s); + } + + /* free all pages collected by apply_to_existing_page_range() in the first loop */ + llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { + page = llist_entry(pos, struct page, pcp_llist); + __free_page(page); + } + + bpf_map_memcg_exit(old_memcg, new_memcg); +} + +static void arena_free_irq(struct irq_work *iw) +{ + struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq); + + schedule_work(&arena->free_work); } __bpf_kfunc_start_defs(); @@ -589,9 +863,20 @@ __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) return NULL; - return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id); + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true); } +void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false); +} __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) { struct bpf_map *map = p__map; @@ -599,7 +884,17 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) return; - 
arena_free_pages(arena, (long)ptr__ign, page_cnt); + arena_free_pages(arena, (long)ptr__ign, page_cnt, true); +} + +void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) + return; + arena_free_pages(arena, (long)ptr__ign, page_cnt, false); } __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt) @@ -618,9 +913,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1eeb31c5b317..67e9e811de3a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -307,7 +307,7 @@ static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); } -int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; @@ -325,11 +325,18 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(map, value, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value); + goto unlock; + } for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } +unlock: rcu_read_unlock(); return 0; } @@ -398,10 +405,11 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; - int cpu, off = 0; + void *ptr, *val; u32 size; + int cpu; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS)) /* unknown flags */ return -EINVAL; @@ -422,11 +430,20 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + ptr = per_cpu_ptr(pptr, cpu); + copy_map_value(map, ptr, value); + bpf_obj_free_fields(array->map.record, ptr); + goto unlock; + } for_each_possible_cpu(cpu) { - copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); - bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); - off += size; + ptr = per_cpu_ptr(pptr, cpu); + val = (map_flags & BPF_F_ALL_CPUS) ? 
value : value + size * cpu; + copy_map_value(map, ptr, val); + bpf_obj_free_fields(array->map.record, ptr); } +unlock: rcu_read_unlock(); return 0; } diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 0687a760974a..c2a2ead1f466 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -11,29 +11,6 @@ DEFINE_BPF_STORAGE_CACHE(cgroup_cache); -static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); - -static void bpf_cgrp_storage_lock(void) -{ - cant_migrate(); - this_cpu_inc(bpf_cgrp_storage_busy); -} - -static void bpf_cgrp_storage_unlock(void) -{ - this_cpu_dec(bpf_cgrp_storage_busy); -} - -static bool bpf_cgrp_storage_trylock(void) -{ - cant_migrate(); - if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { - this_cpu_dec(bpf_cgrp_storage_busy); - return false; - } - return true; -} - static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) { struct cgroup *cg = owner; @@ -45,16 +22,14 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; - rcu_read_lock_dont_migrate(); + rcu_read_lock(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) goto out; - bpf_cgrp_storage_lock(); bpf_local_storage_destroy(local_storage); - bpf_cgrp_storage_unlock(); out: - rcu_read_unlock_migrate(); + rcu_read_unlock(); } static struct bpf_local_storage_data * @@ -83,9 +58,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) if (IS_ERR(cgroup)) return ERR_CAST(cgroup); - bpf_cgrp_storage_lock(); sdata = cgroup_storage_lookup(cgroup, map, true); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return sdata ? sdata->data : NULL; } @@ -102,10 +75,8 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, if (IS_ERR(cgroup)) return PTR_ERR(cgroup); - bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, map_flags, false, GFP_ATOMIC); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); } @@ -118,8 +89,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - return 0; + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) @@ -132,9 +102,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) if (IS_ERR(cgroup)) return PTR_ERR(cgroup); - bpf_cgrp_storage_lock(); err = cgroup_storage_delete(cgroup, map); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return err; } @@ -151,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy); + bpf_local_storage_map_free(map, &cgroup_cache); } /* *gfp_flags* is a hidden argument provided by the verifier */ @@ -159,7 +127,6 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; - bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -168,38 +135,27 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!cgroup) return (unsigned long)NULL; - nobusy = bpf_cgrp_storage_trylock(); - - sdata = cgroup_storage_lookup(cgroup, map, nobusy); + sdata = cgroup_storage_lookup(cgroup, map, true); if (sdata) 
- goto unlock; + goto out; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); -unlock: - if (nobusy) - bpf_cgrp_storage_unlock(); +out: return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) { - int ret; - WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!cgroup) return -EINVAL; - if (!bpf_cgrp_storage_trylock()) - return -EBUSY; - - ret = cgroup_storage_delete(cgroup, map); - bpf_cgrp_storage_unlock(); - return ret; + return cgroup_storage_delete(cgroup, map); } const struct bpf_map_ops cgrp_storage_map_ops = { diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e54cce2b9175..e86734609f3d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -110,9 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) @@ -186,7 +184,7 @@ static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) static void inode_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &inode_cache, NULL); + bpf_local_storage_map_free(map, &inode_cache); } const struct bpf_map_ops inode_storage_map_ops = { diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c index c96630cb75bf..c0286f25ca3c 100644 --- a/kernel/bpf/bpf_insn_array.c +++ b/kernel/bpf/bpf_insn_array.c @@ -123,10 +123,10 @@ static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, if ((off % sizeof(long)) != 0 || (off / sizeof(long)) >= map->max_entries) - return -EINVAL; + return -EACCES; /* from BPF's point of view, this map is a jump table */ - *imm = (unsigned long)insn_array->ips + off; + *imm = (unsigned long)insn_array->ips; return 0; } diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index eec60b57bd3d..4b58d56ecab1 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -86,7 +86,7 @@ static bool bpf_iter_support_resched(struct seq_file *seq) /* bpf_seq_read, a customized and simpler version for bpf iterator. * The following are differences from seq_read(): - * . fixed buffer size (PAGE_SIZE) + * . fixed buffer size (PAGE_SIZE << 3) * . assuming NULL ->llseek() * . 
stop() may call bpf program, handling potential overflow there */ diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index e2fe6c32822b..b28f07d3a0db 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -19,9 +19,9 @@ static struct bpf_local_storage_map_bucket * select_bucket(struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *selem) + struct bpf_local_storage *local_storage) { - return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; + return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)]; } static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) @@ -61,11 +61,6 @@ static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) return !hlist_unhashed(&selem->snode); } -static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem) -{ - return !hlist_unhashed_lockless(&selem->map_node); -} - static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->map_node); @@ -90,6 +85,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); + atomic_set(&selem->state, 0); + selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; if (value) { /* No need to call check_and_init_map_value as memory is zero init */ @@ -198,9 +195,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); - migrate_disable(); - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - migrate_enable(); + if (smap) { + migrate_disable(); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + migrate_enable(); + } kfree_nolock(selem); } @@ -219,13 +218,14 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - if (!smap->use_kmalloc_nolock) { + if (!selem->use_kmalloc_nolock) { /* * No uptr will be unpin even when reuse_now == false since uptr * is only supported in task local storage, where * smap->use_kmalloc_nolock == true. */ - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + if (smap) + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } @@ -256,6 +256,36 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) bpf_selem_free(selem, reuse_now); } +static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + bool free_local_storage, bool pin_owner) +{ + void *owner = local_storage->owner; + u32 uncharge = smap->elem_size; + + if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == + SDATA(selem)) + RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + + if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt)) + return; + + uncharge += free_local_storage ? sizeof(*local_storage) : 0; + mem_uncharge(smap, local_storage->owner, uncharge); + local_storage->mem_charge -= uncharge; + + if (free_local_storage) { + local_storage->owner = NULL; + + /* After this RCU_INIT, owner may be freed and cannot be used */ + RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + } + + if (pin_owner) + refcount_dec(&local_storage->owner_refcnt); +} + /* local_storage->lock must be held and selem->local_storage == local_storage. 
* The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. @@ -266,124 +296,219 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor { struct bpf_local_storage_map *smap; bool free_local_storage; - void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - owner = local_storage->owner; - - /* All uncharging on the owner must be done first. - * The owner may be freed once the last selem is unlinked - * from local_storage. - */ - mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); - if (free_local_storage) { - mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); - local_storage->owner = NULL; - /* After this RCU_INIT, owner may be freed and cannot be used */ - RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, + free_local_storage, false); - /* local_storage is not freed now. local_storage->lock is - * still held and raw_spin_unlock_bh(&local_storage->lock) - * will be done by the caller. - * - * Although the unlock will be done under - * rcu_read_lock(), it is more intuitive to - * read if the freeing of the storage is done - * after the raw_spin_unlock_bh(&local_storage->lock). - * - * Hence, a "bool free_local_storage" is returned - * to the caller which then calls then frees the storage after - * all the RCU grace periods have expired. - */ - } hlist_del_init_rcu(&selem->snode); - if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == - SDATA(selem)) - RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); hlist_add_head(&selem->free_node, free_selem_list); - if (rcu_access_pointer(local_storage->smap) == smap) - RCU_INIT_POINTER(local_storage->smap, NULL); - return free_local_storage; } -static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, - bool reuse_now) +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage_map *smap; + + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + local_storage->mem_charge += smap->elem_size; + + RCU_INIT_POINTER(selem->local_storage, local_storage); + hlist_add_head_rcu(&selem->snode, &local_storage->list); +} + +static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage *local_storage; + struct bpf_local_storage_map *smap; + struct bpf_local_storage_map_bucket *b; + unsigned long flags; + int err; + + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + b = select_bucket(smap, local_storage); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; + + hlist_del_init_rcu(&selem->map_node); + raw_res_spin_unlock_irqrestore(&b->lock, flags); + + return 0; +} + +static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem) +{ + hlist_del_init_rcu(&selem->map_node); +} + +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage_map_bucket *b; + unsigned long flags; + int err; + + b = select_bucket(smap, local_storage); + + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; + + hlist_add_head_rcu(&selem->map_node, &b->list); + 
raw_res_spin_unlock_irqrestore(&b->lock, flags); + + return 0; +} + +static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, + struct bpf_local_storage_elem *selem) +{ + hlist_add_head_rcu(&selem->map_node, &b->list); +} + +/* + * Unlink an selem from map and local storage with lock held. + * This is the common path used by local storages to delete an selem. + */ +int bpf_selem_unlink(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; + int err; if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ - return; + return 0; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); - raw_spin_lock_irqsave(&local_storage->lock, flags); - if (likely(selem_linked_to_storage(selem))) + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return err; + + if (likely(selem_linked_to_storage(selem))) { + /* Always unlink from map before unlinking from local_storage + * because selem will be freed after successfully unlinked from + * the local_storage. + */ + err = bpf_selem_unlink_map(selem); + if (err) + goto out; + free_local_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, &selem_free_list); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + } +out: + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); - bpf_selem_free_list(&selem_free_list, reuse_now); + bpf_selem_free_list(&selem_free_list, false); if (free_local_storage) - bpf_local_storage_free(local_storage, reuse_now); -} + bpf_local_storage_free(local_storage, false); -void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem) -{ - RCU_INIT_POINTER(selem->local_storage, local_storage); - hlist_add_head_rcu(&selem->snode, &local_storage->list); + return err; } -static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +/* + * Unlink an selem from map and local storage with lockless fallback if callers + * are racing or rqspinlock returns error. It should only be called by + * bpf_local_storage_destroy() or bpf_local_storage_map_free(). + */ +static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map_bucket *b) { + bool in_map_free = !!b, free_storage = false; + struct bpf_local_storage *local_storage; struct bpf_local_storage_map *smap; - struct bpf_local_storage_map_bucket *b; unsigned long flags; + int err, unlink = 0; - if (unlikely(!selem_linked_to_map_lockless(selem))) - /* selem has already be unlinked from smap */ - return; - + local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - b = select_bucket(smap, selem); - raw_spin_lock_irqsave(&b->lock, flags); - if (likely(selem_linked_to_map(selem))) - hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_irqrestore(&b->lock, flags); -} -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *selem) -{ - struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); - unsigned long flags; + if (smap) { + b = b ? : select_bucket(smap, local_storage); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (!err) { + /* + * Call bpf_obj_free_fields() under b->lock to make sure it is done + * exactly once for an selem. 
Safe to free special fields immediately + * as no BPF program should be referencing the selem. + */ + if (likely(selem_linked_to_map(selem))) { + hlist_del_init_rcu(&selem->map_node); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + unlink++; + } + raw_res_spin_unlock_irqrestore(&b->lock, flags); + } + /* + * Highly unlikely scenario: resource leak + * + * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing + * and both selem belong to the same bucket, if destroy(selem2) acquired + * b->lock and block for too long, neither map_free(selem1) and + * destroy(selem1) will be able to free the special field associated + * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT. + */ + WARN_ON_ONCE(err && in_map_free); + if (!err || in_map_free) + RCU_INIT_POINTER(SDATA(selem)->smap, NULL); + } - raw_spin_lock_irqsave(&b->lock, flags); - hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_irqrestore(&b->lock, flags); -} + if (local_storage) { + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (!err) { + if (likely(selem_linked_to_storage(selem))) { + free_storage = hlist_is_singular_node(&selem->snode, + &local_storage->list); + /* + * Okay to skip clearing owner_storage and storage->owner in + * destroy() since the owner is going away. No user or bpf + * programs should be able to reference it. + */ + if (smap && in_map_free) + bpf_selem_unlink_storage_nolock_misc( + selem, smap, local_storage, + free_storage, true); + hlist_del_init_rcu(&selem->snode); + unlink++; + } + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); + } + if (!err || !in_map_free) + RCU_INIT_POINTER(selem->local_storage, NULL); + } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) -{ - /* Always unlink from map before unlinking from local_storage - * because selem will be freed after successfully unlinked from - * the local_storage. + if (unlink != 2) + atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state); + + /* + * Normally, an selem can be unlinked under local_storage->lock and b->lock, and + * then freed after an RCU grace period. However, if destroy() and map_free() are + * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free + * the selem only after both map_free() and destroy() see the selem. */ - bpf_selem_unlink_map(selem); - bpf_selem_unlink_storage(selem, reuse_now); + if (unlink == 2 || + atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED) + bpf_selem_free(selem, true); + + if (free_storage) + bpf_local_storage_free(local_storage, true); } void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, @@ -391,16 +516,20 @@ void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { unsigned long flags; + int err; /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). 
*/ - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return; + if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem)); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); } static int check_flags(const struct bpf_local_storage_data *old_sdata, @@ -424,6 +553,8 @@ int bpf_local_storage_alloc(void *owner, { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; + struct bpf_local_storage_map_bucket *b; + unsigned long flags; int err; err = mem_charge(smap, owner, sizeof(*storage)); @@ -441,14 +572,21 @@ int bpf_local_storage_alloc(void *owner, goto uncharge; } - RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); - raw_spin_lock_init(&storage->lock); + raw_res_spin_lock_init(&storage->lock); storage->owner = owner; + storage->mem_charge = sizeof(*storage); storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; + refcount_set(&storage->owner_refcnt, 1); bpf_selem_link_storage_nolock(storage, first_selem); - bpf_selem_link_map(smap, first_selem); + + b = select_bucket(smap, storage); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + goto uncharge; + + bpf_selem_link_map_nolock(b, first_selem); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); @@ -464,10 +602,12 @@ int bpf_local_storage_alloc(void *owner, */ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { - bpf_selem_unlink_map(first_selem); + bpf_selem_unlink_map_nolock(first_selem); + raw_res_spin_unlock_irqrestore(&b->lock, flags); err = -EAGAIN; goto uncharge; } + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; @@ -489,8 +629,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; + struct bpf_local_storage_map_bucket *b; HLIST_HEAD(old_selem_free_list); - unsigned long flags; + unsigned long flags, b_flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ @@ -549,7 +690,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (!alloc_selem) return ERR_PTR(-ENOMEM); - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + goto free_selem; /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { @@ -574,22 +717,30 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } + b = select_bucket(smap, local_storage); + + err = raw_res_spin_lock_irqsave(&b->lock, b_flags); + if (err) + goto unlock; + alloc_selem = NULL; /* First, link the new selem to the map */ - bpf_selem_link_map(smap, selem); + bpf_selem_link_map_nolock(b, selem); /* Second, link (and publish) the new selem to local_storage */ bpf_selem_link_storage_nolock(local_storage, selem); /* Third, remove old selem, SELEM(old_sdata) */ if (old_sdata) { - bpf_selem_unlink_map(SELEM(old_sdata)); + bpf_selem_unlink_map_nolock(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), &old_selem_free_list); } + raw_res_spin_unlock_irqrestore(&b->lock, b_flags); unlock: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + 
raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); +free_selem: bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); @@ -657,13 +808,13 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) +/* + * Destroy local storage when the owner is going away. Caller must uncharge memory + * if memory charging is used. + */ +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_elem *selem; - bool free_storage = false; - HLIST_HEAD(free_selem_list); - struct hlist_node *n; - unsigned long flags; /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. @@ -674,27 +825,20 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_spin_lock_irqsave(&local_storage->lock, flags); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - bpf_selem_unlink_map(selem); - /* If local_storage list has only one element, the - * bpf_selem_unlink_storage_nolock() will return true. - * Otherwise, it will return false. The current loop iteration - * intends to remove all local storage. So the last iteration - * of the loop will set the free_cgroup_storage to true. + hlist_for_each_entry_rcu(selem, &local_storage->list, snode) + bpf_selem_unlink_nofail(selem, NULL); + + if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { + while (refcount_read(&local_storage->owner_refcnt)) + cpu_relax(); + /* + * Paired with refcount_dec() in bpf_selem_unlink_nofail() + * to make sure destroy() sees the correct local_storage->mem_charge. 
*/ - free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, &free_selem_list); + smp_mb(); } - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - - bpf_selem_free_list(&free_selem_list, true); - if (free_storage) - bpf_local_storage_free(local_storage, true); + return local_storage->mem_charge; } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) @@ -736,7 +880,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); - raw_spin_lock_init(&smap->buckets[i].lock); + raw_res_spin_lock_init(&smap->buckets[i].lock); } smap->elem_size = offsetof(struct bpf_local_storage_elem, @@ -758,8 +902,7 @@ free_smap: } void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu *busy_counter) + struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map_bucket *b; struct bpf_local_storage_elem *selem; @@ -789,15 +932,14 @@ void bpf_local_storage_map_free(struct bpf_map *map, rcu_read_lock(); /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_local_storage_elem, map_node))) { - if (busy_counter) - this_cpu_inc(*busy_counter); - bpf_selem_unlink(selem, true); - if (busy_counter) - this_cpu_dec(*busy_counter); - cond_resched_rcu(); +restart: + hlist_for_each_entry_rcu(selem, &b->list, map_node) { + bpf_selem_unlink_nofail(selem, b); + + if (need_resched()) { + cond_resched_rcu(); + goto restart; + } } rcu_read_unlock(); } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 7cb6e8d4282c..0c4a0c8e6f70 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -18,10 +18,11 @@ #include <linux/bpf-cgroup.h> /* For every LSM hook that allows attachment of BPF programs, declare a nop - * function where a BPF program can be attached. + * function where a BPF program can be attached. Notably, we qualify each with + * weak linkage such that strong overrides can be implemented if need be. */ #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ -noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ +__weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ { \ return DEFAULT; \ } diff --git a/kernel/bpf/bpf_lsm_proto.c b/kernel/bpf/bpf_lsm_proto.c new file mode 100644 index 000000000000..44a54fd8045e --- /dev/null +++ b/kernel/bpf/bpf_lsm_proto.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2025 Google LLC. + */ + +#include <linux/fs.h> +#include <linux/bpf_lsm.h> + +/* + * Strong definition of the mmap_file() BPF LSM hook. The __nullable suffix on + * the struct file pointer parameter name marks it as PTR_MAYBE_NULL. This + * explicitly enforces that BPF LSM programs check for NULL before attempting to + * dereference it. 
+ */ +int bpf_lsm_mmap_file(struct file *file__nullable, unsigned long reqprot, + unsigned long prot, unsigned long flags) +{ + return 0; +} diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 278490683d28..c43346cb3d76 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -533,6 +533,17 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) } } +static void bpf_struct_ops_map_dissoc_progs(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + break; + bpf_prog_disassoc_struct_ops(st_map->links[i]->prog); + } +} + static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map) { int i; @@ -801,6 +812,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } + /* Poison pointer on error instead of return for backward compatibility */ + bpf_prog_assoc_struct_ops(prog, &st_map->map); + link = kzalloc(sizeof(*link), GFP_USER); if (!link) { bpf_prog_put(prog); @@ -980,6 +994,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) if (btf_is_module(st_map->btf)) module_put(st_map->st_ops_desc->st_ops->owner); + bpf_struct_ops_map_dissoc_progs(st_map); + bpf_struct_ops_map_del_ksyms(st_map); /* The struct_ops's function may switch to another struct_ops. @@ -1396,6 +1412,78 @@ err_out: return err; } +int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_map *st_ops_assoc; + + guard(mutex)(&prog->aux->st_ops_assoc_mutex); + + st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc, + lockdep_is_held(&prog->aux->st_ops_assoc_mutex)); + if (st_ops_assoc && st_ops_assoc == map) + return 0; + + if (st_ops_assoc) { + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + return -EBUSY; + + rcu_assign_pointer(prog->aux->st_ops_assoc, BPF_PTR_POISON); + } else { + /* + * struct_ops map does not track associated non-struct_ops programs. + * Bump the refcount to make sure st_ops_assoc is always valid. + */ + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + bpf_map_inc(map); + + rcu_assign_pointer(prog->aux->st_ops_assoc, map); + } + + return 0; +} + +void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) +{ + struct bpf_map *st_ops_assoc; + + guard(mutex)(&prog->aux->st_ops_assoc_mutex); + + st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc, + lockdep_is_held(&prog->aux->st_ops_assoc_mutex)); + if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON) + return; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + bpf_map_put(st_ops_assoc); + + RCU_INIT_POINTER(prog->aux->st_ops_assoc, NULL); +} + +/* + * Get a reference to the struct_ops struct (i.e., kdata) associated with a + * program. Should only be called in BPF program context (e.g., in a kfunc). + * + * If the returned pointer is not NULL, it must points to a valid struct_ops. + * The struct_ops map is not guaranteed to be initialized nor attached. + * Kernel struct_ops implementers are responsible for tracking and checking + * the state of the struct_ops if the use case requires an initialized or + * attached struct_ops. 
+ */ +void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) +{ + struct bpf_struct_ops_map *st_map; + struct bpf_map *st_ops_assoc; + + st_ops_assoc = rcu_dereference_check(aux->st_ops_assoc, bpf_rcu_lock_held()); + if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON) + return NULL; + + st_map = (struct bpf_struct_ops_map *)st_ops_assoc; + + return &st_map->kvalue.data; +} +EXPORT_SYMBOL_GPL(bpf_prog_get_assoc_struct_ops); + void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index a1dc1bf0848a..605506792b5b 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -20,29 +20,6 @@ DEFINE_BPF_STORAGE_CACHE(task_cache); -static DEFINE_PER_CPU(int, bpf_task_storage_busy); - -static void bpf_task_storage_lock(void) -{ - cant_migrate(); - this_cpu_inc(bpf_task_storage_busy); -} - -static void bpf_task_storage_unlock(void) -{ - this_cpu_dec(bpf_task_storage_busy); -} - -static bool bpf_task_storage_trylock(void) -{ - cant_migrate(); - if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { - this_cpu_dec(bpf_task_storage_busy); - return false; - } - return true; -} - static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) { struct task_struct *task = owner; @@ -70,17 +47,15 @@ void bpf_task_storage_free(struct task_struct *task) { struct bpf_local_storage *local_storage; - rcu_read_lock_dont_migrate(); + rcu_read_lock(); local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) goto out; - bpf_task_storage_lock(); bpf_local_storage_destroy(local_storage); - bpf_task_storage_unlock(); out: - rcu_read_unlock_migrate(); + rcu_read_unlock(); } static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) @@ -106,9 +81,7 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) goto out; } - bpf_task_storage_lock(); sdata = task_storage_lookup(task, map, true); - bpf_task_storage_unlock(); put_pid(pid); return sdata ? 
sdata->data : NULL; out: @@ -143,11 +116,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, goto out; } - bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, true, GFP_ATOMIC); - bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); out: @@ -155,8 +126,7 @@ out: return err; } -static int task_storage_delete(struct task_struct *task, struct bpf_map *map, - bool nobusy) +static int task_storage_delete(struct task_struct *task, struct bpf_map *map) { struct bpf_local_storage_data *sdata; @@ -164,12 +134,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, if (!sdata) return -ENOENT; - if (!nobusy) - return -EBUSY; - - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) @@ -194,111 +159,50 @@ static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) goto out; } - bpf_task_storage_lock(); - err = task_storage_delete(task, map, true); - bpf_task_storage_unlock(); + err = task_storage_delete(task, map); out: put_pid(pid); return err; } -/* Called by bpf_task_storage_get*() helpers */ -static void *__bpf_task_storage_get(struct bpf_map *map, - struct task_struct *task, void *value, - u64 flags, gfp_t gfp_flags, bool nobusy) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; - sdata = task_storage_lookup(task, map, nobusy); + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) + return (unsigned long)NULL; + + sdata = task_storage_lookup(task, map, true); if (sdata) - return sdata->data; + return (unsigned long)sdata->data; /* only allocate new storage, when the task is refcounted */ if (refcount_read(&task->usage) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); - return IS_ERR(sdata) ? NULL : sdata->data; + return IS_ERR(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } - return NULL; -} - -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags, gfp_t, gfp_flags) -{ - bool nobusy; - void *data; - - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) - return (unsigned long)NULL; - - nobusy = bpf_task_storage_trylock(); - data = __bpf_task_storage_get(map, task, value, flags, - gfp_flags, nobusy); - if (nobusy) - bpf_task_storage_unlock(); - return (unsigned long)data; -} - -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags, gfp_t, gfp_flags) -{ - void *data; - - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) - return (unsigned long)NULL; - - bpf_task_storage_lock(); - data = __bpf_task_storage_get(map, task, value, flags, - gfp_flags, true); - bpf_task_storage_unlock(); - return (unsigned long)data; -} - -BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *, - task) -{ - bool nobusy; - int ret; - - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (!task) - return -EINVAL; - - nobusy = bpf_task_storage_trylock(); - /* This helper must only be called from places where the lifetime of the task - * is guaranteed. Either by being refcounted or by being protected - * by an RCU read-side critical section. - */ - ret = task_storage_delete(task, map, nobusy); - if (nobusy) - bpf_task_storage_unlock(); - return ret; + return (unsigned long)NULL; } BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { - int ret; - WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!task) return -EINVAL; - bpf_task_storage_lock(); /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. 
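With the per-CPU busy counter and the *_recur helper variants removed, task storage behaves the same from every attach point, so a program simply calls the plain helpers. A minimal BPF-side sketch, assuming the usual vmlinux.h and libbpf scaffolding; the map and section names are illustrative:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct counters {
        __u64 switches;
};

struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, struct counters);
} task_cnt SEC(".maps");

SEC("tp_btf/sched_switch")
int BPF_PROG(count_switch, bool preempt, struct task_struct *prev,
             struct task_struct *next)
{
        struct counters *c;

        /* create-on-miss lookup; no trylock/busy fallback is needed anymore */
        c = bpf_task_storage_get(&task_cnt, next, 0,
                                 BPF_LOCAL_STORAGE_GET_F_CREATE);
        if (c)
                __sync_fetch_and_add(&c->switches, 1);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";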
*/ - ret = task_storage_delete(task, map, true); - bpf_task_storage_unlock(); - return ret; + return task_storage_delete(task, map); } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -313,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) static void task_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); + bpf_local_storage_map_free(map, &task_cache); } BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) @@ -332,17 +236,6 @@ const struct bpf_map_ops task_storage_map_ops = { .map_owner_storage_ptr = task_storage_ptr, }; -const struct bpf_func_proto bpf_task_storage_get_recur_proto = { - .func = bpf_task_storage_get_recur, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, - .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], - .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, - .arg4_type = ARG_ANYTHING, -}; - const struct bpf_func_proto bpf_task_storage_get_proto = { .func = bpf_task_storage_get, .gpl_only = false, @@ -354,15 +247,6 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .arg4_type = ARG_ANYTHING, }; -const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { - .func = bpf_task_storage_delete_recur, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, - .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], -}; - const struct bpf_func_proto bpf_task_storage_delete_proto = { .func = bpf_task_storage_delete, .gpl_only = false, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0de8fc8a0e0b..7708958e3fb8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -25,6 +25,7 @@ #include <linux/perf_event.h> #include <linux/bsearch.h> #include <linux/kobject.h> +#include <linux/string.h> #include <linux/sysfs.h> #include <linux/overflow.h> @@ -259,6 +260,7 @@ struct btf { void *nohdr_data; struct btf_header hdr; u32 nr_types; /* includes VOID for base BTF */ + u32 named_start_id; u32 types_size; u32 data_size; refcount_t refcnt; @@ -494,6 +496,11 @@ static bool btf_type_is_modifier(const struct btf_type *t) return false; } +static int btf_start_id(const struct btf *btf) +{ + return btf->start_id + (btf->base_btf ? 0 : 1); +} + bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; @@ -544,21 +551,125 @@ u32 btf_nr_types(const struct btf *btf) return total; } +/* + * Note that vmlinux and kernel module BTFs are always sorted + * during the building phase. 
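That sort order is what the new lower-bound lookup below relies on. As a rough standalone illustration of the same search shape, over a plain array of names rather than BTF type ids (bounds and sentinel handling simplified):

#include <string.h>

/*
 * Find the first index whose name compares >= @name in a name-sorted table;
 * the caller then scans forward while the name still matches, filtering on
 * kind, which is what btf_find_by_name_kind() does below. Illustrative only.
 */
static int name_lower_bound(const char * const *names, int n, const char *name)
{
        int l = 0, r = n - 1;

        if (n <= 0)
                return -1;

        while (l < r) {
                int m = l + (r - l) / 2;

                if (strcmp(names[m], name) >= 0)
                        r = m;
                else
                        l = m + 1;
        }
        /* may still not match; caller must check strcmp(names[l], name) == 0 */
        return l;
}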
+ */ +static void btf_check_sorted(struct btf *btf) +{ + u32 i, n, named_start_id = 0; + + n = btf_nr_types(btf); + if (btf_is_vmlinux(btf)) { + for (i = btf_start_id(btf); i < n; i++) { + const struct btf_type *t = btf_type_by_id(btf, i); + const char *n = btf_name_by_offset(btf, t->name_off); + + if (n[0] != '\0') { + btf->named_start_id = i; + return; + } + } + return; + } + + for (i = btf_start_id(btf) + 1; i < n; i++) { + const struct btf_type *ta = btf_type_by_id(btf, i - 1); + const struct btf_type *tb = btf_type_by_id(btf, i); + const char *na = btf_name_by_offset(btf, ta->name_off); + const char *nb = btf_name_by_offset(btf, tb->name_off); + + if (strcmp(na, nb) > 0) + return; + + if (named_start_id == 0 && na[0] != '\0') + named_start_id = i - 1; + if (named_start_id == 0 && nb[0] != '\0') + named_start_id = i; + } + + if (named_start_id) + btf->named_start_id = named_start_id; +} + +/* + * btf_named_start_id - Get the named starting ID for the BTF + * @btf: Pointer to the target BTF object + * @own: Flag indicating whether to query only the current BTF (true = current BTF only, + * false = recursively traverse the base BTF chain) + * + * Return value rules: + * 1. For a sorted btf, return its named_start_id + * 2. Else for a split BTF, return its start_id + * 3. Else for a base BTF, return 1 + */ +u32 btf_named_start_id(const struct btf *btf, bool own) +{ + const struct btf *base_btf = btf; + + while (!own && base_btf->base_btf) + base_btf = base_btf->base_btf; + + return base_btf->named_start_id ?: (base_btf->start_id ?: 1); +} + +static s32 btf_find_by_name_kind_bsearch(const struct btf *btf, const char *name) +{ + const struct btf_type *t; + const char *tname; + s32 l, r, m; + + l = btf_named_start_id(btf, true); + r = btf_nr_types(btf) - 1; + while (l <= r) { + m = l + (r - l) / 2; + t = btf_type_by_id(btf, m); + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) >= 0) { + if (l == r) + return r; + r = m; + } else { + l = m + 1; + } + } + + return btf_nr_types(btf); +} + s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) { + const struct btf *base_btf = btf_base_btf(btf); const struct btf_type *t; const char *tname; - u32 i, total; + s32 id, total; - total = btf_nr_types(btf); - for (i = 1; i < total; i++) { - t = btf_type_by_id(btf, i); - if (BTF_INFO_KIND(t->info) != kind) - continue; + if (base_btf) { + id = btf_find_by_name_kind(base_btf, name, kind); + if (id > 0) + return id; + } - tname = btf_name_by_offset(btf, t->name_off); - if (!strcmp(tname, name)) - return i; + total = btf_nr_types(btf); + if (btf->named_start_id > 0 && name[0]) { + id = btf_find_by_name_kind_bsearch(btf, name); + for (; id < total; id++) { + t = btf_type_by_id(btf, id); + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) != 0) + return -ENOENT; + if (BTF_INFO_KIND(t->info) == kind) + return id; + } + } else { + for (id = btf_start_id(btf); id < total; id++) { + t = btf_type_by_id(btf, id); + if (BTF_INFO_KIND(t->info) != kind) + continue; + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) == 0) + return id; + } } return -ENOENT; @@ -3424,7 +3535,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type const struct btf_type *t; int len, id; - id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0); + id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, + btf_named_start_id(btf, false) - 1); if (id < 0) return ERR_PTR(id); @@ -5791,6 +5903,7 @@ static struct btf 
*btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat goto errout; } env->btf = btf; + btf->named_start_id = 0; data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { @@ -6107,6 +6220,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_TRACE_FSESSION: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; @@ -6210,7 +6324,8 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name btf->data = data; btf->data_size = data_size; btf->kernel_btf = true; - snprintf(btf->name, sizeof(btf->name), "%s", name); + btf->named_start_id = 0; + strscpy(btf->name, name); err = btf_parse_hdr(env); if (err) @@ -6230,6 +6345,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name if (err) goto errout; + btf_check_sorted(btf); refcount_set(&btf->refcnt, 1); return btf; @@ -6327,7 +6443,8 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, btf->start_id = base_btf->nr_types; btf->start_str_off = base_btf->hdr.str_len; btf->kernel_btf = true; - snprintf(btf->name, sizeof(btf->name), "%s", module_name); + btf->named_start_id = 0; + strscpy(btf->name, module_name); btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN); if (!btf->data) { @@ -6363,6 +6480,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, } btf_verifier_env_free(env); + btf_check_sorted(btf); refcount_set(&btf->refcnt, 1); return btf; @@ -6704,6 +6822,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, fallthrough; case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to * int LSM hooks, they use MODIFY_RETURN trampolines. @@ -7729,12 +7848,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tname); return -EINVAL; } + /* Convert BTF function arguments into verifier types. * Only PTR_TO_CTX and SCALAR are supported atm. 
*/ for (i = 0; i < nargs; i++) { u32 tags = 0; - int id = 0; + int id = btf_named_start_id(btf, false) - 1; /* 'arg:<tag>' decl_tag takes precedence over derivation of * register type from BTF type itself @@ -8640,24 +8760,17 @@ end: return ret; } -static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, - enum btf_kfunc_hook hook, - u32 kfunc_btf_id, - const struct bpf_prog *prog) +static u32 *btf_kfunc_id_set_contains(const struct btf *btf, + enum btf_kfunc_hook hook, + u32 kfunc_btf_id) { - struct btf_kfunc_hook_filter *hook_filter; struct btf_id_set8 *set; - u32 *id, i; + u32 *id; if (hook >= BTF_KFUNC_HOOK_MAX) return NULL; if (!btf->kfunc_set_tab) return NULL; - hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; - for (i = 0; i < hook_filter->nr_filters; i++) { - if (hook_filter->filters[i](prog, kfunc_btf_id)) - return NULL; - } set = btf->kfunc_set_tab->sets[hook]; if (!set) return NULL; @@ -8668,6 +8781,28 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, return id + 1; } +static bool __btf_kfunc_is_allowed(const struct btf *btf, + enum btf_kfunc_hook hook, + u32 kfunc_btf_id, + const struct bpf_prog *prog) +{ + struct btf_kfunc_hook_filter *hook_filter; + int i; + + if (hook >= BTF_KFUNC_HOOK_MAX) + return false; + if (!btf->kfunc_set_tab) + return false; + + hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; + for (i = 0; i < hook_filter->nr_filters; i++) { + if (hook_filter->filters[i](prog, kfunc_btf_id)) + return false; + } + + return true; +} + static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) { switch (prog_type) { @@ -8681,6 +8816,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_STRUCT_OPS; case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_LSM: return BTF_KFUNC_HOOK_TRACING; @@ -8714,6 +8850,26 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) } } +bool btf_kfunc_is_allowed(const struct btf *btf, + u32 kfunc_btf_id, + const struct bpf_prog *prog) +{ + enum bpf_prog_type prog_type = resolve_prog_type(prog); + enum btf_kfunc_hook hook; + u32 *kfunc_flags; + + kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); + if (kfunc_flags && __btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog)) + return true; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + kfunc_flags = btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); + if (kfunc_flags && __btf_kfunc_is_allowed(btf, hook, kfunc_btf_id, prog)) + return true; + + return false; +} + /* Caution: * Reference to the module (obtained using btf_try_get_module) corresponding to * the struct btf *MUST* be held when calling this function from verifier @@ -8721,26 +8877,27 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) * keeping the reference for the duration of the call provides the necessary * protection for looking up a well-formed btf->kfunc_set_tab. 
*/ -u32 *btf_kfunc_id_set_contains(const struct btf *btf, - u32 kfunc_btf_id, - const struct bpf_prog *prog) +u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); enum btf_kfunc_hook hook; u32 *kfunc_flags; - kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog); + kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); if (kfunc_flags) return kfunc_flags; hook = bpf_prog_type_to_kfunc_hook(prog_type); - return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog); + return btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { - return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog); + if (!__btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog)) + return NULL; + + return btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); } static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, @@ -8845,6 +9002,13 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc */ if (!t || !btf_type_is_ptr(t)) return -EINVAL; + + if (IS_ENABLED(CONFIG_CFI_CLANG)) { + /* Ensure the destructor kfunc type matches btf_dtor_kfunc_t */ + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_void(t)) + return -EINVAL; + } } return 0; } @@ -9215,7 +9379,7 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) } /* Attempt to find target candidates in vmlinux BTF first */ - cands = bpf_core_add_cands(cands, main_btf, 1); + cands = bpf_core_add_cands(cands, main_btf, btf_named_start_id(main_btf, true)); if (IS_ERR(cands)) return ERR_CAST(cands); @@ -9247,7 +9411,7 @@ check_modules: */ btf_get(mod_btf); spin_unlock_bh(&btf_idr_lock); - cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf)); + cands = bpf_core_add_cands(cands, mod_btf, btf_named_start_id(mod_btf, true)); btf_put(mod_btf); if (IS_ERR(cands)) return ERR_CAST(cands); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 69988af44b37..b029f0369ecf 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1680,11 +1680,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct cgroup *cgrp; int ret; - /* Check socket family since not all sockets represent network - * endpoint (e.g. AF_UNIX). - */ - if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 && - sk->sk_family != AF_UNIX) + if (!sk_is_inet(sk) && !sk_is_unix(sk)) return 0; if (!ctx.uaddr) { diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index f04a468cf6a7..fd51fe3d92cc 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -8,12 +8,13 @@ #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ -/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. +/* cgroup_iter provides five modes of traversal to the cgroup hierarchy. * * 1. Walk the descendants of a cgroup in pre-order. * 2. Walk the descendants of a cgroup in post-order. * 3. Walk the ancestors of a cgroup. * 4. Show the given cgroup only. + * 5. Walk the children of a given parent cgroup. * * For walking descendants, cgroup_iter can walk in either pre-order or * post-order. 
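User space opts into the new children order the same way as the existing ones, through bpf_iter_link_info at link-creation time. A small libbpf sketch, assuming a UAPI header that already carries BPF_CGROUP_ITER_CHILDREN; the program handle and cgroup fd are placeholders:

#include <bpf/libbpf.h>
#include <linux/bpf.h>

static struct bpf_link *attach_children_iter(struct bpf_program *prog,
                                             int cgroup_fd)
{
        union bpf_iter_link_info linfo = {};
        LIBBPF_OPTS(bpf_iter_attach_opts, opts);

        linfo.cgroup.cgroup_fd = cgroup_fd;
        linfo.cgroup.order = BPF_CGROUP_ITER_CHILDREN; /* new traversal order */
        opts.link_info = &linfo;
        opts.link_info_len = sizeof(linfo);

        /* the iterator visits only the direct children of @cgroup_fd */
        return bpf_program__attach_iter(prog, &opts);
}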
For walking ancestors, the iter walks up from a cgroup to @@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) return css_next_descendant_pre(NULL, p->start_css); else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(NULL, p->start_css); + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(NULL, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ return p->start_css; } @@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) return css_next_descendant_post(curr, p->start_css); else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return curr->parent; + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(curr, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY */ return NULL; } @@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, int order = linfo->cgroup.order; struct cgroup *cgrp; - if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && - order != BPF_CGROUP_ITER_DESCENDANTS_POST && - order != BPF_CGROUP_ITER_ANCESTORS_UP && - order != BPF_CGROUP_ITER_SELF_ONLY) + switch (order) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + case BPF_CGROUP_ITER_DESCENDANTS_POST: + case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_SELF_ONLY: + case BPF_CGROUP_ITER_CHILDREN: + break; + default: return -EINVAL; + } if (fd && id) return -EINVAL; @@ -257,6 +267,8 @@ show_order: seq_puts(seq, "order: descendants_post\n"); else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) seq_puts(seq, "order: ancestors_up\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN) + seq_puts(seq, "order: children\n"); else /* BPF_CGROUP_ITER_SELF_ONLY */ seq_puts(seq, "order: self_only\n"); } @@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, case BPF_CGROUP_ITER_DESCENDANTS_PRE: case BPF_CGROUP_ITER_DESCENDANTS_POST: case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_CHILDREN: break; default: return -EINVAL; @@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i case BPF_CGROUP_ITER_DESCENDANTS_POST: kit->pos = css_next_descendant_post(kit->pos, kit->start); break; + case BPF_CGROUP_ITER_CHILDREN: + kit->pos = css_next_child(kit->pos, kit->start); + break; case BPF_CGROUP_ITER_ANCESTORS_UP: kit->pos = kit->pos ? 
kit->pos->parent : kit->start; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1b9b18e5b03c..5ab6bace7d0d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -25,6 +25,7 @@ #include <linux/prandom.h> #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/hex.h> #include <linux/objtool.h> #include <linux/overflow.h> #include <linux/rbtree_latch.h> @@ -112,7 +113,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag vfree(fp); return NULL; } - fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); + fp->active = __alloc_percpu_gfp(sizeof(u8[BPF_NR_CONTEXTS]), 4, + bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (!fp->active) { vfree(fp); kfree(aux); @@ -136,6 +138,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); + mutex_init(&fp->aux->st_ops_assoc_mutex); #ifdef CONFIG_BPF_SYSCALL bpf_prog_stream_init(fp); @@ -286,6 +289,7 @@ void __bpf_prog_free(struct bpf_prog *fp) if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); + mutex_destroy(&fp->aux->st_ops_assoc_mutex); kfree(fp->aux->poke_tab); kfree(fp->aux); } @@ -713,8 +717,8 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr) return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } -int __bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char *sym) +int bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) { struct bpf_ksym *ksym; int ret = 0; @@ -2398,6 +2402,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, map->owner->type = prog_type; map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->sleepable = fp->sleepable; map->owner->expected_attach_type = fp->expected_attach_type; map->owner->attach_func_proto = aux->attach_func_proto; for_each_cgroup_storage_type(i) { @@ -2409,7 +2414,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, } else { ret = map->owner->type == prog_type && map->owner->jited == fp->jited && - map->owner->xdp_has_frags == aux->xdp_has_frags; + map->owner->xdp_has_frags == aux->xdp_has_frags && + map->owner->sleepable == fp->sleepable; if (ret && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && map->owner->expected_attach_type != fp->expected_attach_type) @@ -2912,6 +2918,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); + bpf_prog_disassoc_struct_ops(aux->prog); if (bpf_prog_is_dev_bound(aux)) bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS @@ -3138,6 +3145,11 @@ bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) return false; } +bool __weak bpf_jit_supports_fsession(void) +{ + return false; +} + u64 __weak bpf_arch_uaddress_limit(void) { #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 703e5df1f4ef..04171fbc39cb 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -430,7 +430,7 @@ static struct bpf_cpu_map_entry * __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, u32 cpu) { - int numa, err, i, fd = value->bpf_prog.fd; + int numa, err = -ENOMEM, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; @@ -440,7 
+440,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa); if (!rcpu) - return NULL; + return ERR_PTR(err); /* Alloc percpu bulkq */ rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq), @@ -468,16 +468,21 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->value.qsize = value->qsize; gro_init(&rcpu->gro); - if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) - goto free_ptr_ring; + if (fd > 0) { + err = __cpu_map_load_bpf_program(rcpu, map, fd); + if (err) + goto free_ptr_ring; + } /* Setup kthread */ init_completion(&rcpu->kthread_running); rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map->id); - if (IS_ERR(rcpu->kthread)) + if (IS_ERR(rcpu->kthread)) { + err = PTR_ERR(rcpu->kthread); goto free_prog; + } /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); @@ -503,7 +508,7 @@ free_bulkq: free_percpu(rcpu->bulkq); free_rcu: kfree(rcpu); - return NULL; + return ERR_PTR(err); } static void __cpu_map_entry_free(struct work_struct *work) @@ -596,8 +601,8 @@ static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value, } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu); - if (!rcpu) - return -ENOMEM; + if (IS_ERR(rcpu)) + return PTR_ERR(rcpu); } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 9876c5fe6c2a..b8c805b4b06a 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -477,7 +477,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU) diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 83c4d9943084..7e75a1936256 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -60,7 +60,7 @@ struct bpf_crypto_ctx { int bpf_crypto_register_type(const struct bpf_crypto_type *type) { struct bpf_crypto_type_list *node; - int err = -EEXIST; + int err = -EBUSY; down_write(&bpf_crypto_types_sem); list_for_each_entry(node, &bpf_crypto_types, list) { @@ -261,6 +261,12 @@ __bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) call_rcu(&ctx->rcu, crypto_free_cb); } +__bpf_kfunc void bpf_crypto_ctx_release_dtor(void *ctx) +{ + bpf_crypto_ctx_release(ctx); +} +CFI_NOSEAL(bpf_crypto_ctx_release_dtor); + static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, const struct bpf_dynptr_kern *src, const struct bpf_dynptr_kern *dst, @@ -368,7 +374,7 @@ static const struct btf_kfunc_id_set crypt_kfunc_set = { BTF_ID_LIST(bpf_crypto_dtor_ids) BTF_ID(struct, bpf_crypto_ctx) -BTF_ID(func, bpf_crypto_ctx_release) +BTF_ID(func, bpf_crypto_ctx_release_dtor) static int __init crypto_kfunc_init(void) { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c8a9b27f8663..3b9d297a53be 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -82,9 +82,6 @@ struct bucket { rqspinlock_t raw_lock; }; -#define HASHTAB_MAP_LOCK_COUNT 8 -#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) - struct 
bpf_htab { struct bpf_map map; struct bpf_mem_alloc ma; @@ -932,7 +929,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, - void *value, bool onallcpus) + void *value, bool onallcpus, u64 map_flags) { void *ptr; @@ -943,19 +940,28 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, bpf_obj_free_fields(htab->map.record, ptr); } else { u32 size = round_up(htab->map.value_size, 8); - int off = 0, cpu; + void *val; + int cpu; + + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + ptr = per_cpu_ptr(pptr, cpu); + copy_map_value(&htab->map, ptr, value); + bpf_obj_free_fields(htab->map.record, ptr); + return; + } for_each_possible_cpu(cpu) { ptr = per_cpu_ptr(pptr, cpu); - copy_map_value_long(&htab->map, ptr, value + off); + val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; + copy_map_value(&htab->map, ptr, val); bpf_obj_free_fields(htab->map.record, ptr); - off += size; } } } static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, - void *value, bool onallcpus) + void *value, bool onallcpus, u64 map_flags) { /* When not setting the initial value on all cpus, zero-fill element * values for other cpus. Otherwise, bpf program has no way to ensure @@ -973,7 +979,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); } } else { - pcpu_copy_value(htab, pptr, value, onallcpus); + pcpu_copy_value(htab, pptr, value, onallcpus, map_flags); } } @@ -985,7 +991,7 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, - struct htab_elem *old_elem) + struct htab_elem *old_elem, u64 map_flags) { u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); @@ -1043,7 +1049,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, pptr = *(void __percpu **)ptr; } - pcpu_init_value(htab, pptr, value, onallcpus); + pcpu_init_value(htab, pptr, value, onallcpus, map_flags); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); @@ -1147,7 +1153,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, } l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, - l_old); + l_old, map_flags); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -1249,6 +1255,15 @@ err_lock_bucket: return ret; } +static int htab_map_check_update_flags(bool onallcpus, u64 map_flags) +{ + if (unlikely(!onallcpus && map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(onallcpus && ((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS))) + return -EINVAL; + return 0; +} + static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, bool percpu, bool onallcpus) @@ -1262,9 +1277,9 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) - /* unknown flags */ - return -EINVAL; + ret = htab_map_check_update_flags(onallcpus, map_flags); + if (unlikely(ret)) + return ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); @@ -1289,7 +1304,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, /* Update value in-place */ if (percpu) { pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - 
value, onallcpus); + value, onallcpus, map_flags); } else { void **inner_map_pptr = htab_elem_value(l_old, key_size); @@ -1298,7 +1313,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, percpu, onallcpus, NULL); + hash, percpu, onallcpus, NULL, map_flags); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; @@ -1324,9 +1339,9 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) - /* unknown flags */ - return -EINVAL; + ret = htab_map_check_update_flags(onallcpus, map_flags); + if (unlikely(ret)) + return ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); @@ -1363,10 +1378,10 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* per-cpu hash map can update value in-place */ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + value, onallcpus, map_flags); } else { pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), - value, onallcpus); + value, onallcpus, map_flags); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; } @@ -1678,9 +1693,9 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; + u64 elem_map_flags, map_flags, allowed_flags; u32 bucket_cnt, total, key_size, value_size; struct htab_elem *node_to_free = NULL; - u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; unsigned long flags = 0; @@ -1690,9 +1705,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, int ret = 0; elem_map_flags = attr->batch.elem_flags; - if ((elem_map_flags & ~BPF_F_LOCK) || - ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) - return -EINVAL; + allowed_flags = BPF_F_LOCK; + if (!do_delete && is_percpu) + allowed_flags |= BPF_F_CPU; + ret = bpf_map_check_op_flags(map, elem_map_flags, allowed_flags); + if (ret) + return ret; map_flags = attr->batch.flags; if (map_flags) @@ -1715,7 +1733,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, key_size = htab->map.key_size; value_size = htab->map.value_size; size = round_up(value_size, 8); - if (is_percpu) + if (is_percpu && !(elem_map_flags & BPF_F_CPU)) value_size = size * num_possible_cpus(); total = 0; /* while experimenting with hash tables with sizes ranging from 10 to @@ -1798,10 +1816,17 @@ again_nocopy: void __percpu *pptr; pptr = htab_elem_get_ptr(l, map->key_size); - for_each_possible_cpu(cpu) { - copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu)); - check_and_init_map_value(&htab->map, dst_val + off); - off += size; + if (elem_map_flags & BPF_F_CPU) { + cpu = elem_map_flags >> 32; + copy_map_value(&htab->map, dst_val, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, dst_val); + } else { + for_each_possible_cpu(cpu) { + copy_map_value_long(&htab->map, dst_val + off, + per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, dst_val + off); + off += size; + } } } else { value = htab_elem_value(l, key_size); @@ -2209,11 +2234,11 @@ static u64 htab_map_mem_usage(const struct bpf_map *map) bool prealloc = htab_is_prealloc(htab); bool percpu = htab_is_percpu(htab); bool lru = htab_is_lru(htab); - u64 num_entries; - u64 usage = sizeof(struct bpf_htab); + u64 num_entries, usage; + + usage = 
sizeof(struct bpf_htab) + + sizeof(struct bucket) * htab->n_buckets; - usage += sizeof(struct bucket) * htab->n_buckets; - usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; if (prealloc) { num_entries = map->max_entries; if (htab_has_extra_elems(htab)) @@ -2357,7 +2382,7 @@ static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *k return NULL; } -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct htab_elem *l; void __percpu *pptr; @@ -2374,16 +2399,22 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; + ret = 0; /* We do not mark LRU map element here in order to not mess up * eviction heuristics when user space does a map walk. */ pptr = htab_elem_get_ptr(l, map->key_size); + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(map, value, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value); + goto out; + } for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } - ret = 0; out: rcu_read_unlock(); return ret; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index db72b96f9c8c..7ac32798eb04 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1077,7 +1077,7 @@ const struct bpf_func_proto bpf_snprintf_proto = { .func = bpf_snprintf, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_CONST_STR, .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, @@ -1095,16 +1095,34 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) return (void *)value - round_up(map->key_size, 8); } +enum bpf_async_type { + BPF_ASYNC_TYPE_TIMER = 0, + BPF_ASYNC_TYPE_WQ, +}; + +enum bpf_async_op { + BPF_ASYNC_START, + BPF_ASYNC_CANCEL +}; + +struct bpf_async_cmd { + struct llist_node node; + u64 nsec; + u32 mode; + enum bpf_async_op op; +}; + struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; void __rcu *callback_fn; void *value; - union { - struct rcu_head rcu; - struct work_struct delete_work; - }; + struct rcu_head rcu; u64 flags; + struct irq_work worker; + refcount_t refcnt; + enum bpf_async_type type; + struct llist_head async_cmds; }; /* BPF map elements can contain 'struct bpf_timer'. @@ -1132,7 +1150,6 @@ struct bpf_hrtimer { struct bpf_work { struct bpf_async_cb cb; struct work_struct work; - struct work_struct delete_work; }; /* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ @@ -1142,20 +1159,12 @@ struct bpf_async_kern { struct bpf_hrtimer *timer; struct bpf_work *work; }; - /* bpf_spin_lock is used here instead of spinlock_t to make - * sure that it always fits into space reserved by struct bpf_timer - * regardless of LOCKDEP and spinlock debug flags. 
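For the per-CPU hash paths above, the new flags are driven from user space: BPF_F_CPU carries the target CPU in the upper 32 bits of the u64 flags and shrinks the value buffer to a single element, while BPF_F_ALL_CPUS replicates one value to every CPU. A hedged user-space sketch; it assumes headers that already define the new flags and a libbpf that passes the 64-bit flags through unchanged:

#include <bpf/bpf.h>

struct val { __u64 cnt; };

static int update_one_cpu(int map_fd, __u32 key, __u32 cpu, struct val *v)
{
        /* BPF_F_CPU: the target CPU rides in the upper 32 bits of the flags */
        __u64 flags = BPF_F_CPU | ((__u64)cpu << 32);

        return bpf_map_update_elem(map_fd, &key, v, flags);
}

static int read_one_cpu(int map_fd, __u32 key, __u32 cpu, struct val *v)
{
        /* value buffer holds a single element, not one per possible CPU */
        __u64 flags = BPF_F_CPU | ((__u64)cpu << 32);

        return bpf_map_lookup_elem_flags(map_fd, &key, v, flags);
}

static int update_all_cpus(int map_fd, __u32 key, struct val *v)
{
        /* one value replicated into every CPU's slot in a single call */
        return bpf_map_update_elem(map_fd, &key, v, BPF_F_ALL_CPUS);
}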
- */ - struct bpf_spin_lock lock; } __attribute__((aligned(8))); -enum bpf_async_type { - BPF_ASYNC_TYPE_TIMER = 0, - BPF_ASYNC_TYPE_WQ, -}; - static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); +static void bpf_async_refcount_put(struct bpf_async_cb *cb); + static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); @@ -1219,45 +1228,85 @@ static void bpf_async_cb_rcu_free(struct rcu_head *rcu) { struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + /* + * Drop the last reference to prog only after RCU GP, as set_callback() + * may race with cancel_and_free() + */ + if (cb->prog) + bpf_prog_put(cb->prog); + kfree_nolock(cb); } -static void bpf_wq_delete_work(struct work_struct *work) +/* Callback from call_rcu_tasks_trace, chains to call_rcu for final free */ +static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu) { - struct bpf_work *w = container_of(work, struct bpf_work, delete_work); + struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); + struct bpf_work *w = container_of(cb, struct bpf_work, cb); + bool retry = false; - cancel_work_sync(&w->work); + /* + * bpf_async_cancel_and_free() tried to cancel timer/wq, but it + * could have raced with timer/wq_start. Now refcnt is zero and + * srcu/rcu GP completed. Cancel timer/wq again. + */ + switch (cb->type) { + case BPF_ASYNC_TYPE_TIMER: + if (hrtimer_try_to_cancel(&t->timer) < 0) + retry = true; + break; + case BPF_ASYNC_TYPE_WQ: + if (!cancel_work(&w->work) && work_busy(&w->work)) + retry = true; + break; + } + if (retry) { + /* + * hrtimer or wq callback may still be running. It must be + * in rcu_tasks_trace or rcu CS, so wait for GP again. + * It won't retry forever, since refcnt zero prevents all + * operations on timer/wq. + */ + call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); + return; + } - call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free); + /* rcu_trace_implies_rcu_gp() is true and will remain so */ + bpf_async_cb_rcu_free(rcu); } -static void bpf_timer_delete_work(struct work_struct *work) +static void worker_for_call_rcu(struct irq_work *work) { - struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work); + struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker); - /* Cancel the timer and wait for callback to complete if it was running. - * If hrtimer_cancel() can be safely called it's safe to call - * call_rcu() right after for both preallocated and non-preallocated - * maps. The async->cb = NULL was already done and no code path can see - * address 't' anymore. Timer if armed for existing bpf_hrtimer before - * bpf_timer_cancel_and_free will have been cancelled. 
- */ - hrtimer_cancel(&t->timer); - call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); + call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); } +static void bpf_async_refcount_put(struct bpf_async_cb *cb) +{ + if (!refcount_dec_and_test(&cb->refcnt)) + return; + + if (irqs_disabled()) { + cb->worker = IRQ_WORK_INIT(worker_for_call_rcu); + irq_work_queue(&cb->worker); + } else { + call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); + } +} + +static void bpf_async_cancel_and_free(struct bpf_async_kern *async); +static void bpf_async_irq_worker(struct irq_work *work); + static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { - struct bpf_async_cb *cb; + struct bpf_async_cb *cb, *old_cb; struct bpf_hrtimer *t; struct bpf_work *w; clockid_t clockid; size_t size; - int ret = 0; - - if (in_nmi()) - return -EOPNOTSUPP; switch (type) { case BPF_ASYNC_TYPE_TIMER: @@ -1270,18 +1319,13 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u return -EINVAL; } - __bpf_spin_lock_irqsave(&async->lock); - t = async->timer; - if (t) { - ret = -EBUSY; - goto out; - } + old_cb = READ_ONCE(async->cb); + if (old_cb) + return -EBUSY; cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); - if (!cb) { - ret = -ENOMEM; - goto out; - } + if (!cb) + return -ENOMEM; switch (type) { case BPF_ASYNC_TYPE_TIMER: @@ -1289,7 +1333,6 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u t = (struct bpf_hrtimer *)cb; atomic_set(&t->cancelling, 0); - INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; @@ -1297,16 +1340,24 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u w = (struct bpf_work *)cb; INIT_WORK(&w->work, bpf_wq_work); - INIT_WORK(&w->delete_work, bpf_wq_delete_work); cb->value = (void *)async - map->record->wq_off; break; } cb->map = map; cb->prog = NULL; cb->flags = flags; + cb->worker = IRQ_WORK_INIT(bpf_async_irq_worker); + init_llist_head(&cb->async_cmds); + refcount_set(&cb->refcnt, 1); /* map's reference */ + cb->type = type; rcu_assign_pointer(cb->callback_fn, NULL); - WRITE_ONCE(async->cb, cb); + old_cb = cmpxchg(&async->cb, NULL, cb); + if (old_cb) { + /* Lost the race to initialize this bpf_async_kern, drop the allocated object */ + kfree_nolock(cb); + return -EBUSY; + } /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL @@ -1317,13 +1368,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u /* maps with timers must be either held by user space * or pinned in bpffs. 
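The init path above is lock-free: instead of taking the old bpf_spin_lock embedded in the map value, it publishes the allocation with a single cmpxchg() and the loser frees its copy. The same shape on a generic object, as a sketch only (helper names invented, kernel headers assumed):

struct obj;

static int publish_once(struct obj **slot, struct obj *(*alloc)(void),
                        void (*free_fn)(struct obj *))
{
        struct obj *new, *old;

        if (READ_ONCE(*slot))
                return -EBUSY;          /* fast path: already initialized */

        new = alloc();
        if (!new)
                return -ENOMEM;

        old = cmpxchg(slot, NULL, new); /* only one caller wins */
        if (old) {
                free_fn(new);           /* lost the race: drop our copy */
                return -EBUSY;
        }
        return 0;
}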
*/ - WRITE_ONCE(async->cb, NULL); - kfree_nolock(cb); - ret = -EPERM; + bpf_async_cancel_and_free(async); + return -EPERM; } -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return ret; + + return 0; } BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, @@ -1354,56 +1403,90 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; -static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, - struct bpf_prog_aux *aux, unsigned int flags, - enum bpf_async_type type) +static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, + struct bpf_prog *prog, + void *callback_fn) { - struct bpf_prog *prev, *prog = aux->prog; - struct bpf_async_cb *cb; - int ret = 0; + struct bpf_prog *prev; - if (in_nmi()) - return -EOPNOTSUPP; - __bpf_spin_lock_irqsave(&async->lock); - cb = async->cb; - if (!cb) { - ret = -EINVAL; - goto out; - } - if (!atomic64_read(&cb->map->usercnt)) { - /* maps with timers must be either held by user space - * or pinned in bpffs. Otherwise timer might still be - * running even when bpf prog is detached and user space - * is gone, since map_release_uref won't ever be called. - */ - ret = -EPERM; - goto out; + /* Acquire a guard reference on prog to prevent it from being freed during the loop */ + if (prog) { + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) + return PTR_ERR(prog); } - prev = cb->prog; - if (prev != prog) { - /* Bump prog refcnt once. Every bpf_timer_set_callback() - * can pick different callback_fn-s within the same prog. + + do { + if (prog) + prog = bpf_prog_inc_not_zero(prog); + prev = xchg(&cb->prog, prog); + rcu_assign_pointer(cb->callback_fn, callback_fn); + + /* + * Release previous prog, make sure that if other CPU is contending, + * to set bpf_prog, references are not leaked as each iteration acquires and + * releases one reference. */ - prog = bpf_prog_inc_not_zero(prog); - if (IS_ERR(prog)) { - ret = PTR_ERR(prog); - goto out; - } if (prev) - /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); - cb->prog = prog; + + } while (READ_ONCE(cb->prog) != prog || + (void __force *)READ_ONCE(cb->callback_fn) != callback_fn); + + if (prog) + bpf_prog_put(prog); + + return 0; +} + +static DEFINE_PER_CPU(struct bpf_async_cb *, async_cb_running); + +static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op, + u64 nsec, u32 timer_mode) +{ + /* + * Do not schedule another operation on this cpu if it's in irq_work + * callback that is processing async_cmds queue. Otherwise the following + * loop is possible: + * bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue(). + * irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start(). 
+ */ + if (this_cpu_read(async_cb_running) == cb) { + bpf_async_refcount_put(cb); + return -EDEADLK; } - rcu_assign_pointer(cb->callback_fn, callback_fn); -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return ret; + + struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE); + + if (!cmd) { + bpf_async_refcount_put(cb); + return -ENOMEM; + } + init_llist_node(&cmd->node); + cmd->nsec = nsec; + cmd->mode = timer_mode; + cmd->op = op; + if (llist_add(&cmd->node, &cb->async_cmds)) + irq_work_queue(&cb->worker); + return 0; +} + +static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, + struct bpf_prog *prog) +{ + struct bpf_async_cb *cb; + + cb = READ_ONCE(async->cb); + if (!cb) + return -EINVAL; + + return bpf_async_update_prog_callback(cb, prog, callback_fn); } BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { - return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER); + return __bpf_async_set_callback(timer, callback_fn, aux->prog); } static const struct bpf_func_proto bpf_timer_set_callback_proto = { @@ -1414,22 +1497,22 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; -BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags) +static bool defer_timer_wq_op(void) +{ + return in_hardirq() || irqs_disabled(); +} + +BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; - int ret = 0; - enum hrtimer_mode mode; + u32 mode; - if (in_nmi()) - return -EOPNOTSUPP; if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t || !t->cb.prog) { - ret = -EINVAL; - goto out; - } + + t = READ_ONCE(async->timer); + if (!t || !READ_ONCE(t->cb.prog)) + return -EINVAL; if (flags & BPF_F_TIMER_ABS) mode = HRTIMER_MODE_ABS_SOFT; @@ -1439,10 +1522,20 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla if (flags & BPF_F_TIMER_CPU_PIN) mode |= HRTIMER_MODE_PINNED; - hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); -out: - __bpf_spin_unlock_irqrestore(&timer->lock); - return ret; + /* + * bpf_async_cancel_and_free() could have dropped refcnt to zero. In + * such case BPF progs are not allowed to arm the timer to prevent UAF. 
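The deferral machinery introduced here, a command pushed onto a lockless llist with an irq_work draining it once the hrtimer or workqueue can safely be touched, follows a common kernel pattern. A generic sketch of that pattern with invented names and the real operation elided; the irq_work is assumed to have been initialized with IRQ_WORK_INIT(drain_cmds) at setup:

#include <linux/irq_work.h>
#include <linux/llist.h>
#include <linux/slab.h>

struct deferred_cmd {
        struct llist_node node;
        int op;
};

struct deferred_ctx {
        struct llist_head cmds;
        struct irq_work work;   /* IRQ_WORK_INIT(drain_cmds) at setup time */
};

static void drain_cmds(struct irq_work *work)
{
        struct deferred_ctx *ctx = container_of(work, struct deferred_ctx, work);
        struct llist_node *pos, *n, *list;

        list = llist_del_all(&ctx->cmds);
        list = llist_reverse_order(list);       /* restore submission order */
        llist_for_each_safe(pos, n, list) {
                struct deferred_cmd *cmd = container_of(pos, struct deferred_cmd, node);

                /* perform cmd->op here, now that it is safe to do so */
                kfree(cmd);
        }
}

static int defer_op(struct deferred_ctx *ctx, int op)
{
        struct deferred_cmd *cmd = kmalloc(sizeof(*cmd), GFP_ATOMIC);

        if (!cmd)
                return -ENOMEM;
        cmd->op = op;
        /* llist_add() returns true when the list was empty: kick only once */
        if (llist_add(&cmd->node, &ctx->cmds))
                irq_work_queue(&ctx->work);
        return 0;
}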
+ */ + if (!refcount_inc_not_zero(&t->cb.refcnt)) + return -ENOENT; + + if (!defer_timer_wq_op()) { + hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); + bpf_async_refcount_put(&t->cb); + return 0; + } else { + return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode); + } } static const struct bpf_func_proto bpf_timer_start_proto = { @@ -1454,32 +1547,18 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -static void drop_prog_refcnt(struct bpf_async_cb *async) -{ - struct bpf_prog *prog = async->prog; - - if (prog) { - bpf_prog_put(prog); - async->prog = NULL; - rcu_assign_pointer(async->callback_fn, NULL); - } -} - -BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) +BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async) { struct bpf_hrtimer *t, *cur_t; bool inc = false; int ret = 0; - if (in_nmi()) + if (defer_timer_wq_op()) return -EOPNOTSUPP; - rcu_read_lock(); - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t) { - ret = -EINVAL; - goto out; - } + + t = READ_ONCE(async->timer); + if (!t) + return -EINVAL; cur_t = this_cpu_read(hrtimer_running); if (cur_t == t) { @@ -1487,8 +1566,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) * its own timer the hrtimer_cancel() will deadlock * since it waits for callback_fn to finish. */ - ret = -EDEADLK; - goto out; + return -EDEADLK; } /* Only account in-flight cancellations when invoked from a timer @@ -1511,20 +1589,17 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) * cancelling and waiting for it synchronously, since it might * do the same. Bail! */ - ret = -EDEADLK; - goto out; + atomic_dec(&t->cancelling); + return -EDEADLK; } drop: - drop_prog_refcnt(&t->cb); -out: - __bpf_spin_unlock_irqrestore(&timer->lock); + bpf_async_update_prog_callback(&t->cb, NULL, NULL); /* Cancel the timer and wait for associated callback to finish * if it was running. */ - ret = ret ?: hrtimer_cancel(&t->timer); + ret = hrtimer_cancel(&t->timer); if (inc) atomic_dec(&t->cancelling); - rcu_read_unlock(); return ret; } @@ -1535,107 +1610,107 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { .arg1_type = ARG_PTR_TO_TIMER, }; -static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async) +static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op, + u64 timer_nsec, u32 timer_mode) { - struct bpf_async_cb *cb; + switch (cb->type) { + case BPF_ASYNC_TYPE_TIMER: { + struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); - /* Performance optimization: read async->cb without lock first. */ - if (!READ_ONCE(async->cb)) - return NULL; + switch (op) { + case BPF_ASYNC_START: + hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode); + break; + case BPF_ASYNC_CANCEL: + hrtimer_try_to_cancel(&t->timer); + break; + } + break; + } + case BPF_ASYNC_TYPE_WQ: { + struct bpf_work *w = container_of(cb, struct bpf_work, cb); + + switch (op) { + case BPF_ASYNC_START: + schedule_work(&w->work); + break; + case BPF_ASYNC_CANCEL: + cancel_work(&w->work); + break; + } + break; + } + } + bpf_async_refcount_put(cb); +} - __bpf_spin_lock_irqsave(&async->lock); - /* re-read it under lock */ - cb = async->cb; - if (!cb) - goto out; - drop_prog_refcnt(cb); - /* The subsequent bpf_timer_start/cancel() helpers won't be able to use - * this timer, since it won't be initialized. 
- */ - WRITE_ONCE(async->cb, NULL); -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return cb; +static void bpf_async_irq_worker(struct irq_work *work) +{ + struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker); + struct llist_node *pos, *n, *list; + + list = llist_del_all(&cb->async_cmds); + if (!list) + return; + + list = llist_reverse_order(list); + this_cpu_write(async_cb_running, cb); + llist_for_each_safe(pos, n, list) { + struct bpf_async_cmd *cmd; + + cmd = container_of(pos, struct bpf_async_cmd, node); + bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode); + kfree_nolock(cmd); + } + this_cpu_write(async_cb_running, NULL); } -/* This function is called by map_delete/update_elem for individual element and - * by ops->map_release_uref when the user space reference to a map reaches zero. - */ -void bpf_timer_cancel_and_free(void *val) +static void bpf_async_cancel_and_free(struct bpf_async_kern *async) { - struct bpf_hrtimer *t; + struct bpf_async_cb *cb; - t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); + if (!READ_ONCE(async->cb)) + return; - if (!t) + cb = xchg(&async->cb, NULL); + if (!cb) return; - /* We check that bpf_map_delete/update_elem() was called from timer - * callback_fn. In such case we don't call hrtimer_cancel() (since it - * will deadlock) and don't call hrtimer_try_to_cancel() (since it will - * just return -1). Though callback_fn is still running on this cpu it's - * safe to do kfree(t) because bpf_timer_cb() read everything it needed - * from 't'. The bpf subprog callback_fn won't be able to access 't', - * since async->cb = NULL was already done. The timer will be - * effectively cancelled because bpf_timer_cb() will return - * HRTIMER_NORESTART. - * - * However, it is possible the timer callback_fn calling us armed the - * timer _before_ calling us, such that failing to cancel it here will - * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. - * Therefore, we _need_ to cancel any outstanding timers before we do - * call_rcu, even though no more timers can be armed. - * - * Moreover, we need to schedule work even if timer does not belong to - * the calling callback_fn, as on two different CPUs, we can end up in a - * situation where both sides run in parallel, try to cancel one - * another, and we end up waiting on both sides in hrtimer_cancel - * without making forward progress, since timer1 depends on time2 - * callback to finish, and vice versa. - * - * CPU 1 (timer1_cb) CPU 2 (timer2_cb) - * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1) - * - * To avoid these issues, punt to workqueue context when we are in a - * timer callback. + + bpf_async_update_prog_callback(cb, NULL, NULL); + /* + * No refcount_inc_not_zero(&cb->refcnt) here. Dropping the last + * refcnt. Either synchronously or asynchronously in irq_work. */ - if (this_cpu_read(hrtimer_running)) { - queue_work(system_dfl_wq, &t->cb.delete_work); - return; - } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - /* If the timer is running on other CPU, also use a kworker to - * wait for the completion of the timer instead of trying to - * acquire a sleepable lock in hrtimer_cancel() to wait for its - * completion. 
- */ - if (hrtimer_try_to_cancel(&t->timer) >= 0) - call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); - else - queue_work(system_dfl_wq, &t->cb.delete_work); + if (!defer_timer_wq_op()) { + bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0); } else { - bpf_timer_delete_work(&t->cb.delete_work); + (void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); + /* + * bpf_async_schedule_op() either enqueues allocated cmd into llist + * or fails with ENOMEM and drops the last refcnt. + * This is unlikely, but safe, since bpf_async_cb_rcu_tasks_trace_free() + * callback will do additional timer/wq_cancel due to races anyway. + */ } } -/* This function is called by map_delete/update_elem for individual element and +/* + * This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ -void bpf_wq_cancel_and_free(void *val) +void bpf_timer_cancel_and_free(void *val) { - struct bpf_work *work; - - BTF_TYPE_EMIT(struct bpf_wq); + bpf_async_cancel_and_free(val); +} - work = (struct bpf_work *)__bpf_async_cancel_and_free(val); - if (!work) - return; - /* Trigger cancel of the sleepable work, but *do not* wait for - * it to finish if it was running as we might not be in a - * sleepable context. - * kfree will be called once the work has finished. - */ - schedule_work(&work->delete_work); +/* + * This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_wq_cancel_and_free(void *val) +{ + bpf_async_cancel_and_free(val); } BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr) @@ -2092,12 +2167,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif case BPF_FUNC_task_storage_get: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_get_recur_proto; return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_delete_recur_proto; return &bpf_task_storage_delete_proto; default: break; @@ -2709,14 +2780,14 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer__opt: User-provided buffer to copy contents into. May be NULL + * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * - * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * If buffer__nullable is NULL, the call will fail if a buffer was needed. * * If the intention is to write to the data slice, please use * bpf_dynptr_slice_rdwr.
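As an aside on the buffer__nullable contract documented just above, here is a minimal, illustrative BPF-side sketch (not part of the patch) of calling bpf_dynptr_slice() with a stack buffer as the fallback for non-linear skb data. The extern kfunc declarations are hand-written here to mirror the prototypes shown in this diff (real programs would normally pull them from the selftests' bpf_kfuncs.h), and the program name, section name and parsing logic are assumptions.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* Hand-rolled declarations, assumed to match the kernel prototypes above. */
extern int bpf_dynptr_from_skb(struct __sk_buff *skb, __u64 flags,
			       struct bpf_dynptr *ptr__uninit) __ksym;
extern void *bpf_dynptr_slice(const struct bpf_dynptr *p, __u64 offset,
			      void *buffer__nullable, __u64 buffer__szk) __ksym;

SEC("tc")
int slice_eth_header(struct __sk_buff *skb)
{
	struct ethhdr eth_copy;		/* fallback copy if the header is not linear */
	struct bpf_dynptr ptr;
	const struct ethhdr *eth;

	if (bpf_dynptr_from_skb(skb, 0, &ptr))
		return 0;		/* TC_ACT_OK */

	/*
	 * Returns a direct pointer into the skb when the bytes are linear,
	 * otherwise copies them into eth_copy and returns &eth_copy.
	 * NULL means the slice was out of range, or that a bounce buffer
	 * was needed but none was supplied.
	 */
	eth = bpf_dynptr_slice(&ptr, 0, &eth_copy, sizeof(eth_copy));
	if (!eth)
		return 0;

	bpf_printk("h_proto=0x%04x", bpf_ntohs(eth->h_proto));
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

The same pattern applies to bpf_dynptr_slice_rdwr() in the hunks that follow: pass a scratch buffer when the data may be non-linear, and use bpf_dynptr_write() to commit any changes made through the bounce buffer.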
@@ -2734,7 +2805,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk) + void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; @@ -2755,8 +2826,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: - if (buffer__opt) - return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); + if (buffer__nullable) + return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__nullable); else return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len); case BPF_DYNPTR_TYPE_XDP: @@ -2765,16 +2836,16 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, if (!IS_ERR_OR_NULL(xdp_ptr)) return xdp_ptr; - if (!buffer__opt) + if (!buffer__nullable) return NULL; - bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); - return buffer__opt; + bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__nullable, len, false); + return buffer__nullable; } case BPF_DYNPTR_TYPE_SKB_META: return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); case BPF_DYNPTR_TYPE_FILE: - err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk); - return err ? NULL : buffer__opt; + err = bpf_file_fetch_bytes(ptr->data, offset, buffer__nullable, buffer__szk); + return err ? NULL : buffer__nullable; default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; @@ -2785,14 +2856,14 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer__opt: User-provided buffer to copy contents into. May be NULL + * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * - * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * If buffer__nullable is NULL, the call will fail if a buffer was needed. * * The returned pointer is writable and may point to either directly the dynptr * data at the requested offset or to the buffer if unable to obtain a direct @@ -2824,7 +2895,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk) + void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2853,7 +2924,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes.
*/ - return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); + return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk); } __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) @@ -3108,30 +3179,36 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_work *w; - if (in_nmi()) - return -EOPNOTSUPP; if (flags) return -EINVAL; + w = READ_ONCE(async->work); if (!w || !READ_ONCE(w->cb.prog)) return -EINVAL; - schedule_work(&w->work); - return 0; + if (!refcount_inc_not_zero(&w->cb.refcnt)) + return -ENOENT; + + if (!defer_timer_wq_op()) { + schedule_work(&w->work); + bpf_async_refcount_put(&w->cb); + return 0; + } else { + return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0); + } } -__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags, - void *aux__prog) +__bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, void *value), + unsigned int flags, + struct bpf_prog_aux *aux) { - struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog; struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) return -EINVAL; - return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); + return __bpf_async_set_callback(async, callback_fn, aux->prog); } __bpf_kfunc void bpf_preempt_disable(void) @@ -3406,7 +3483,7 @@ __bpf_kfunc void __bpf_trap(void) * __get_kernel_nofault instead of plain dereference to make them safe. */ -static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) +static int __bpf_strncasecmp(const char *s1, const char *s2, bool ignore_case, size_t len) { char c1, c2; int i; @@ -3417,7 +3494,7 @@ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) } guard(pagefault)(); - for (i = 0; i < XATTR_SIZE_MAX; i++) { + for (i = 0; i < len && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&c1, s1, char, err_out); __get_kernel_nofault(&c2, s2, char, err_out); if (ignore_case) { @@ -3431,7 +3508,7 @@ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) s1++; s2++; } - return -E2BIG; + return i == XATTR_SIZE_MAX ? 
-E2BIG : 0; err_out: return -EFAULT; } @@ -3451,7 +3528,7 @@ err_out: */ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) { - return __bpf_strcasecmp(s1__ign, s2__ign, false); + return __bpf_strncasecmp(s1__ign, s2__ign, false, XATTR_SIZE_MAX); } /** @@ -3469,7 +3546,26 @@ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) */ __bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) { - return __bpf_strcasecmp(s1__ign, s2__ign, true); + return __bpf_strncasecmp(s1__ign, s2__ign, true, XATTR_SIZE_MAX); +} + +/* + * bpf_strncasecmp - Compare two length-limited strings, ignoring case + * @s1__ign: One string + * @s2__ign: Another string + * @len: The maximum number of characters to compare + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of strings is too large + * * %-ERANGE - One of strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strncasecmp(const char *s1__ign, const char *s2__ign, size_t len) +{ + return __bpf_strncasecmp(s1__ign, s2__ign, true, len); } /** @@ -4275,41 +4371,39 @@ release_prog: } /** - * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call - * @aux__prog: user should pass NULL + * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, - struct bpf_task_work *tw, void *map__map, - bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + struct bpf_prog_aux *aux) { - return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); + return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL); } /** - * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call - * @aux__prog: user should pass NULL + * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, - struct bpf_task_work *tw, void *map__map, - bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + struct bpf_prog_aux *aux) { - return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); 
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME); } static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, @@ -4360,6 +4454,53 @@ __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) return 0; } +/** + * bpf_timer_cancel_async - try to deactivate a timer + * @timer: bpf_timer to stop + * + * Returns: + * + * * 0 when the timer was not active + * * 1 when the timer was active + * * -1 when the timer is currently executing the callback function and + * cannot be stopped + * * -ECANCELED when the timer will be cancelled asynchronously + * * -ENOMEM when out of memory + * * -EINVAL when the timer was not initialized + * * -ENOENT when this kfunc is racing with timer deletion + */ +__bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer) +{ + struct bpf_async_kern *async = (void *)timer; + struct bpf_async_cb *cb; + int ret; + + cb = READ_ONCE(async->cb); + if (!cb) + return -EINVAL; + + /* + * Unlike hrtimer_start() it's ok to synchronously call + * hrtimer_try_to_cancel() when refcnt reached zero, but deferring to + * irq_work is not, since irq callback may execute after RCU GP and + * cb could be freed at that time. Check for refcnt zero for + * consistency. + */ + if (!refcount_inc_not_zero(&cb->refcnt)) + return -ENOENT; + + if (!defer_timer_wq_op()) { + struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); + + ret = hrtimer_try_to_cancel(&t->timer); + bpf_async_refcount_put(cb); + return ret; + } else { + ret = bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); + return ret ? ret : -ECANCELED; + } +} + __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) @@ -4427,7 +4568,7 @@ BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS -BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_send_signal_task) #endif #ifdef CONFIG_KEYS BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) @@ -4467,14 +4608,14 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) #ifdef CONFIG_CGROUPS -BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) #endif -BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) @@ -4488,7 +4629,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_memset) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif BTF_ID_FLAGS(func, bpf_wq_init) -BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) +BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_wq_start) BTF_ID_FLAGS(func, 
bpf_preempt_disable) BTF_ID_FLAGS(func, bpf_preempt_enable) @@ -4510,8 +4651,8 @@ BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE) #endif #ifdef CONFIG_DMA_SHARED_BUFFER BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) @@ -4521,6 +4662,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, __bpf_trap) BTF_ID_FLAGS(func, bpf_strcmp); BTF_ID_FLAGS(func, bpf_strcasecmp); +BTF_ID_FLAGS(func, bpf_strncasecmp); BTF_ID_FLAGS(func, bpf_strchr); BTF_ID_FLAGS(func, bpf_strchrnul); BTF_ID_FLAGS(func, bpf_strnchr); @@ -4536,11 +4678,13 @@ BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) +BTF_ID_FLAGS(func, bpf_timer_cancel_async) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9f866a010dad..005ea3a2cda7 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -600,10 +600,17 @@ struct bpffs_btf_enums { static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) { + struct { + const struct btf_type **type; + const char *name; + } btf_enums[] = { + {&info->cmd_t, "bpf_cmd"}, + {&info->map_t, "bpf_map_type"}, + {&info->prog_t, "bpf_prog_type"}, + {&info->attach_t, "bpf_attach_type"}, + }; const struct btf *btf; - const struct btf_type *t; - const char *name; - int i, n; + int i, id; memset(info, 0, sizeof(*info)); @@ -615,31 +622,16 @@ static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) info->btf = btf; - for (i = 1, n = btf_nr_types(btf); i < n; i++) { - t = btf_type_by_id(btf, i); - if (!btf_type_is_enum(t)) - continue; - - name = btf_name_by_offset(btf, t->name_off); - if (!name) - continue; - - if (strcmp(name, "bpf_cmd") == 0) - info->cmd_t = t; - else if (strcmp(name, "bpf_map_type") == 0) - info->map_t = t; - else if (strcmp(name, "bpf_prog_type") == 0) - info->prog_t = t; - else if (strcmp(name, "bpf_attach_type") == 0) - info->attach_t = t; - else - continue; + for (i = 0; i < ARRAY_SIZE(btf_enums); i++) { + id = btf_find_by_name_kind(btf, btf_enums[i].name, + BTF_KIND_ENUM); + if (id < 0) + return -ESRCH; - if (info->cmd_t && info->map_t && info->prog_t && info->attach_t) - return 0; + *btf_enums[i].type = btf_type_by_id(btf, id); } - return -ESRCH; + return 0; } static bool find_btf_enum_const(const struct btf *btf, const struct btf_type 
*enum_t, diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index c93a756e035c..1ccbf28b2ad9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -180,7 +180,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key, } int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, - void *value) + void *value, u64 map_flags) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; @@ -198,12 +198,17 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(_map, value, per_cpu_ptr(storage->percpu_buf, cpu)); + goto unlock; + } size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, - per_cpu_ptr(storage->percpu_buf, cpu), size); + copy_map_value_long(_map, value + off, per_cpu_ptr(storage->percpu_buf, cpu)); off += size; } +unlock: rcu_read_unlock(); return 0; } @@ -213,10 +218,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; - int cpu, off = 0; + void *val; u32 size; + int cpu; - if (map_flags != BPF_ANY && map_flags != BPF_EXIST) + if ((u32)map_flags & ~(BPF_ANY | BPF_EXIST | BPF_F_CPU | BPF_F_ALL_CPUS)) return -EINVAL; rcu_read_lock(); @@ -232,12 +238,17 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), value); + goto unlock; + } size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), - value + off, size); - off += size; + val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; + copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), val); } +unlock: rcu_read_unlock(); return 0; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 9575314f40a6..261a03ea73d3 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -214,7 +214,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_map_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_map_sum_elem_count) BTF_KFUNCS_END(bpf_map_iter_kfunc_ids) static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 42ae8d595c2c..227f9b5f388b 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,16 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017-2018 Netronome Systems, Inc. - * - * This software is licensed under the GNU General License Version 2, - * June 1991 as shown in the file COPYING in the top-level directory of this - * source tree. - * - * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" - * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, - * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE - * OF THE PROGRAM IS WITH YOU. 
SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME - * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/bpf.h> diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c index 99c63d982c5d..2f28886f3ff7 100644 --- a/kernel/bpf/range_tree.c +++ b/kernel/bpf/range_tree.c @@ -149,7 +149,8 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) range_it_insert(rn, rt); /* Add a range */ - new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); + new_rn = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT, + NUMA_NO_NODE); if (!new_rn) return -ENOMEM; new_rn->rn_start = last + 1; @@ -234,7 +235,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) right->rn_start = start; range_it_insert(right, rt); } else { - left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); + left = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT, NUMA_NO_NODE); if (!left) return -ENOMEM; left->rn_start = start; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index f6a075ffac63..35ae64ade36b 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index f7d0c8d4644e..e4e338cdb437 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -265,10 +265,11 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) RES_INIT_TIMEOUT(ts); /* - * The fast path is not invoked for the TAS fallback, so we must grab - * the deadlock detection entry here. + * We are either called directly from res_spin_lock after grabbing the + * deadlock detection entry when queued spinlocks are disabled, or from + * resilient_queued_spin_lock_slowpath after grabbing the deadlock + * detection entry. No need to obtain it here. */ - grab_held_lock_entry(lock); /* * Since the waiting loop's time is dependent on the amount of @@ -694,7 +695,6 @@ __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) int ret; BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock)); - BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock)); preempt_disable(); ret = res_spin_lock((rqspinlock_t *)lock); diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index 0b6bc3f30335..be9ce98e9469 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -212,14 +212,13 @@ __bpf_kfunc_start_defs(); * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. */ -__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, - u32 len__sz, void *aux__prog) +__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, struct bpf_prog_aux *aux) { struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; - struct bpf_prog_aux *aux = aux__prog; u32 fmt_size = strlen(fmt__str) + 1; struct bpf_stream *stream; u32 data_len = len__sz; @@ -246,6 +245,25 @@ __bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, con return ret; } +/* Directly trigger a stack dump from the program. */ +__bpf_kfunc int bpf_stream_print_stack(int stream_id, struct bpf_prog_aux *aux) +{ + struct bpf_stream_stage ss; + struct bpf_prog *prog; + + /* Make sure the stream ID is valid. 
*/ + if (!bpf_stream_get(stream_id, aux)) + return -ENOENT; + + prog = aux->main_prog_aux->prog; + + bpf_stream_stage(ss, prog, stream_id, ({ + bpf_stream_dump_stack(ss); + })); + + return 0; +} + __bpf_kfunc_end_defs(); /* Added kfunc to common_btf_ids */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ff82144f885..dd89bf809772 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -9,6 +9,7 @@ #include <linux/bpf_verifier.h> #include <linux/bsearch.h> #include <linux/btf.h> +#include <linux/hex.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/sched/signal.h> @@ -133,12 +134,14 @@ bool bpf_map_write_active(const struct bpf_map *map) return atomic64_read(&map->writecnt) != 0; } -static u32 bpf_map_value_size(const struct bpf_map *map) +static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) { - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) + return map->value_size; + else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); @@ -314,11 +317,11 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_copy(map, key, value); + err = bpf_percpu_hash_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_copy(map, key, value); + err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_copy(map, key, value); + err = bpf_percpu_cgroup_storage_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_extract(map, key, value, false); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { @@ -505,17 +508,29 @@ static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) return root_mem_cgroup; } +void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg) +{ + *new_memcg = bpf_map_get_memcg(map); + *old_memcg = set_active_memcg(*new_memcg); +} + +void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *new_memcg) +{ + set_active_memcg(old_memcg); + mem_cgroup_put(new_memcg); +} + void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -526,11 +541,9 @@ void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); - 
set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -540,11 +553,9 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -555,11 +566,9 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -570,11 +579,9 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -612,12 +619,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long i, j; struct page *pg; int ret = 0; -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg, *old_memcg; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); -#endif for (i = 0; i < nr_pages; i++) { pg = __bpf_alloc_page(nid); @@ -631,10 +633,6 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, break; } -#ifdef CONFIG_MEMCG - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); -#endif return ret; } @@ -1366,11 +1364,6 @@ free_map_tab: return ret; } -static bool bpf_net_capable(void) -{ - return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); -} - #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ static int map_create(union bpf_attr *attr, bpfptr_t uattr) @@ -1734,7 +1727,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; @@ -1742,7 +1735,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(key)) return PTR_ERR(key); - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); @@ -1809,7 +1802,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); value = kvmemdup_bpfptr(uvalue, value_size); if (IS_ERR(value)) { err = PTR_ERR(value); @@ -2005,11 +1998,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, + BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2064,11 
+2058,11 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2190,7 +2184,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, 0); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); @@ -2820,6 +2814,13 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr void *sig; int err = 0; + /* + * Don't attempt to use kmalloc_large or vmalloc for signatures. + * Practical signature for BPF program should be below this limit. + */ + if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE) + return -EINVAL; + if (system_keyring_id_check(attr->keyring_id) == 0) key = bpf_lookup_system_key(attr->keyring_id); else @@ -3579,6 +3580,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_TRACE_FSESSION && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; @@ -3628,7 +3630,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - link = kzalloc(sizeof(*link), GFP_USER); + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + struct bpf_fsession_link *fslink; + + fslink = kzalloc(sizeof(*fslink), GFP_USER); + if (fslink) { + bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type); + fslink->fexit.cookie = bpf_cookie; + link = &fslink->link; + } else { + link = NULL; + } + } else { + link = kzalloc(sizeof(*link), GFP_USER); + } if (!link) { err = -ENOMEM; goto out_put_prog; @@ -4352,6 +4368,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: @@ -4565,6 +4582,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); + } else if (!bpf_mprog_detach_empty(ptype)) { + return -EPERM; } } else if (is_cgroup_prog_type(ptype, 0, false)) { if (attr->attach_flags || attr->relative_fd) @@ -5310,6 +5329,9 @@ static int bpf_map_get_info_by_fd(struct file *file, if (info.hash_size != SHA256_DIGEST_SIZE) return -EINVAL; + if (!READ_ONCE(map->frozen)) + return -EPERM; + err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); if (err != 0) return err; @@ -6122,6 +6144,49 @@ static int prog_stream_read(union bpf_attr *attr) return ret; } +#define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd + +static int prog_assoc_struct_ops(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + int ret; + + if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS)) + return -EINVAL; + + if (attr->prog_assoc_struct_ops.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + ret = 
-EINVAL; + goto put_prog; + } + + map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto put_prog; + } + + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto put_map; + } + + ret = bpf_prog_assoc_struct_ops(prog, map); + +put_map: + bpf_map_put(map); +put_prog: + bpf_prog_put(prog); + return ret; +} + static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -6261,6 +6326,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) case BPF_PROG_STREAM_READ_BY_FD: err = prog_stream_read(&attr); break; + case BPF_PROG_ASSOC_STRUCT_OPS: + err = prog_assoc_struct_ops(&attr); + break; default: err = -EINVAL; break; @@ -6407,7 +6475,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .func = bpf_kallsyms_lookup_name, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index f8e70e9c3998..26fbfbb01700 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -8,6 +8,7 @@ */ #include <linux/kernel.h> #include <linux/tnum.h> +#include <linux/swab.h> #define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} /* A completely unknown value */ @@ -253,3 +254,18 @@ struct tnum tnum_const_subreg(struct tnum a, u32 value) { return tnum_with_subreg(a, tnum_const(value)); } + +struct tnum tnum_bswap16(struct tnum a) +{ + return TNUM(swab16(a.value & 0xFFFF), swab16(a.mask & 0xFFFF)); +} + +struct tnum tnum_bswap32(struct tnum a) +{ + return TNUM(swab32(a.value & 0xFFFFFFFF), swab32(a.mask & 0xFFFFFFFF)); +} + +struct tnum tnum_bswap64(struct tnum a) +{ + return TNUM(swab64(a.value), swab64(a.mask)); +} diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index feecd8f4dbf9..7e4aa1e44b50 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> #include <linux/vmalloc.h> #include <linux/file.h> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 976d89011b15..952cd7932461 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -24,19 +24,49 @@ const struct bpf_prog_ops bpf_extension_prog_ops = { #define TRAMPOLINE_HASH_BITS 10 #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) -static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; +static struct hlist_head trampoline_key_table[TRAMPOLINE_TABLE_SIZE]; +static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE]; -/* serializes access to trampoline_table */ +/* serializes access to trampoline tables */ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); -static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) +#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS +static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip) { - struct bpf_trampoline *tr = ops->private; + struct hlist_head *head_ip; + struct bpf_trampoline *tr; + + mutex_lock(&trampoline_mutex); + head_ip = &trampoline_ip_table[hash_64(ip, TRAMPOLINE_HASH_BITS)]; + hlist_for_each_entry(tr, head_ip, hlist_ip) { + if (tr->ip == ip) + goto out; + } + tr = NULL; +out: + 
mutex_unlock(&trampoline_mutex); + return tr; +} +#else +static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip) +{ + return ops->private; +} +#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */ + +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, + enum ftrace_ops_cmd cmd) +{ + struct bpf_trampoline *tr; int ret = 0; + tr = direct_ops_ip_lookup(ops, ip); + if (!tr) + return -EINVAL; + if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { /* This is called inside register_ftrace_direct_multi(), so * tr->mutex is already locked. @@ -109,10 +139,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type ptype = prog->type; - return (ptype == BPF_PROG_TYPE_TRACING && - (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN)) || - (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); + switch (ptype) { + case BPF_PROG_TYPE_TRACING: + if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + return true; + return false; + case BPF_PROG_TYPE_LSM: + return eatype == BPF_LSM_MAC; + default: + return false; + } } void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) @@ -135,15 +172,171 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym) PAGE_SIZE, true, ksym->name); } -static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS +/* + * We have only single direct_ops which contains all the direct call + * sites and is the only global ftrace_ops for all trampolines. + * + * We use 'update_ftrace_direct_*' api for attachment. + */ +struct ftrace_ops direct_ops = { + .ops_func = bpf_tramp_ftrace_ops_func, +}; + +static int direct_ops_alloc(struct bpf_trampoline *tr) +{ + tr->fops = &direct_ops; + return 0; +} + +static void direct_ops_free(struct bpf_trampoline *tr) { } + +static struct ftrace_hash *hash_from_ip(struct bpf_trampoline *tr, void *ptr) +{ + unsigned long ip, addr = (unsigned long) ptr; + struct ftrace_hash *hash; + + ip = ftrace_location(tr->ip); + if (!ip) + return NULL; + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + if (!hash) + return NULL; + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + if (!add_ftrace_hash_entry_direct(hash, ip, addr)) { + free_ftrace_hash(hash); + return NULL; + } + return hash; +} + +static int direct_ops_add(struct bpf_trampoline *tr, void *addr) +{ + struct ftrace_hash *hash = hash_from_ip(tr, addr); + int err; + + if (!hash) + return -ENOMEM; + err = update_ftrace_direct_add(tr->fops, hash); + free_ftrace_hash(hash); + return err; +} + +static int direct_ops_del(struct bpf_trampoline *tr, void *addr) +{ + struct ftrace_hash *hash = hash_from_ip(tr, addr); + int err; + + if (!hash) + return -ENOMEM; + err = update_ftrace_direct_del(tr->fops, hash); + free_ftrace_hash(hash); + return err; +} + +static int direct_ops_mod(struct bpf_trampoline *tr, void *addr, bool lock_direct_mutex) +{ + struct ftrace_hash *hash = hash_from_ip(tr, addr); + int err; + + if (!hash) + return -ENOMEM; + err = update_ftrace_direct_mod(tr->fops, hash, lock_direct_mutex); + free_ftrace_hash(hash); + return err; +} +#else +/* + * We allocate ftrace_ops object for each trampoline and it contains + * call site specific for that trampoline. + * + * We use *_ftrace_direct api for attachment. 
+ */ +static int direct_ops_alloc(struct bpf_trampoline *tr) +{ + tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); + if (!tr->fops) + return -ENOMEM; + tr->fops->private = tr; + tr->fops->ops_func = bpf_tramp_ftrace_ops_func; + return 0; +} + +static void direct_ops_free(struct bpf_trampoline *tr) +{ + if (!tr->fops) + return; + ftrace_free_filter(tr->fops); + kfree(tr->fops); +} + +static int direct_ops_add(struct bpf_trampoline *tr, void *ptr) +{ + unsigned long addr = (unsigned long) ptr; + struct ftrace_ops *ops = tr->fops; + int ret; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ret = ftrace_set_filter_ip(ops, tr->ip, 0, 1); + if (ret) + return ret; + return register_ftrace_direct(ops, addr); +} + +static int direct_ops_del(struct bpf_trampoline *tr, void *addr) +{ + return unregister_ftrace_direct(tr->fops, (long)addr, false); +} + +static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex) +{ + unsigned long addr = (unsigned long) ptr; + struct ftrace_ops *ops = tr->fops; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + if (lock_direct_mutex) + return modify_ftrace_direct(ops, addr); + return modify_ftrace_direct_nolock(ops, addr); +} +#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */ +#else +static void direct_ops_free(struct bpf_trampoline *tr) { } + +static int direct_ops_alloc(struct bpf_trampoline *tr) +{ + return 0; +} + +static int direct_ops_add(struct bpf_trampoline *tr, void *addr) +{ + return -ENODEV; +} + +static int direct_ops_del(struct bpf_trampoline *tr, void *addr) +{ + return -ENODEV; +} + +static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex) +{ + return -ENODEV; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) { struct bpf_trampoline *tr; struct hlist_head *head; int i; mutex_lock(&trampoline_mutex); - head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; - hlist_for_each_entry(tr, head, hlist) { + head = &trampoline_key_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; + hlist_for_each_entry(tr, head, hlist_key) { if (tr->key == key) { refcount_inc(&tr->refcnt); goto out; @@ -152,20 +345,19 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) goto out; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS - tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); - if (!tr->fops) { + if (direct_ops_alloc(tr)) { kfree(tr); tr = NULL; goto out; } - tr->fops->private = tr; - tr->fops->ops_func = bpf_tramp_ftrace_ops_func; -#endif tr->key = key; - INIT_HLIST_NODE(&tr->hlist); - hlist_add_head(&tr->hlist, head); + tr->ip = ftrace_location(ip); + INIT_HLIST_NODE(&tr->hlist_key); + INIT_HLIST_NODE(&tr->hlist_ip); + hlist_add_head(&tr->hlist_key, head); + head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)]; + hlist_add_head(&tr->hlist_ip, head); refcount_set(&tr->refcnt, 1); mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) @@ -200,7 +392,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, int ret; if (tr->func.ftrace_managed) - ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false); + ret = direct_ops_del(tr, old_addr); else ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL); @@ -214,10 +406,7 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, int ret; if (tr->func.ftrace_managed) { - if 
(lock_direct_mutex) - ret = modify_ftrace_direct(tr->fops, (long)new_addr); - else - ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); + ret = direct_ops_mod(tr, new_addr, lock_direct_mutex); } else { ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, new_addr); @@ -240,10 +429,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } if (tr->func.ftrace_managed) { - ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); - if (ret) - return ret; - ret = register_ftrace_direct(tr->fops, (long)new_addr); + ret = direct_ops_add(tr, new_addr); } else { ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } @@ -499,13 +685,6 @@ again: if (err) goto out_free; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP - if (bpf_trampoline_use_jmp(tr->flags)) - tr->fops->flags |= FTRACE_OPS_FL_JMP; - else - tr->fops->flags &= ~FTRACE_OPS_FL_JMP; -#endif - WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ @@ -533,15 +712,8 @@ again: tr->cur_image = im; out: /* If any error happens, restore previous flags */ - if (err) { + if (err) tr->flags = orig_flags; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP - if (bpf_trampoline_use_jmp(tr->flags)) - tr->fops->flags |= FTRACE_OPS_FL_JMP; - else - tr->fops->flags &= ~FTRACE_OPS_FL_JMP; -#endif - } kfree(tlinks); return err; @@ -559,6 +731,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + case BPF_TRACE_FSESSION: + return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) /* The function returns void, we cannot modify its @@ -594,8 +768,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { + struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; + struct hlist_head *prog_list; int err = 0; int cnt = 0, i; @@ -621,24 +797,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } + if (kind == BPF_TRAMP_FSESSION) { + prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; + cnt++; + } else { + prog_list = &tr->progs_hlist[kind]; + } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); - tr->progs_cnt[kind]++; + hlist_add_head(&link->tramp_hlist, prog_list); + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]++; + fslink = container_of(link, struct bpf_fsession_link, link.link); + hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + tr->progs_cnt[BPF_TRAMP_FEXIT]++; + } else { + tr->progs_cnt[kind]++; + } err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]--; + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + } else { + tr->progs_cnt[kind]--; + } } return err; } @@ -672,6 +867,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, 
guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; + } else if (kind == BPF_TRAMP_FSESSION) { + struct bpf_fsession_link *fslink = + container_of(link, struct bpf_fsession_link, link.link); + + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; @@ -850,7 +1052,7 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) prog->aux->attach_btf_id); bpf_lsm_find_cgroup_shim(prog, &bpf_func); - tr = bpf_trampoline_lookup(key); + tr = bpf_trampoline_lookup(key, 0); if (WARN_ON_ONCE(!tr)) return; @@ -870,7 +1072,7 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, { struct bpf_trampoline *tr; - tr = bpf_trampoline_lookup(key); + tr = bpf_trampoline_lookup(key, tgt_info->tgt_addr); if (!tr) return NULL; @@ -906,11 +1108,9 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) * fexit progs. The fentry-only trampoline will be freed via * multiple rcu callbacks. */ - hlist_del(&tr->hlist); - if (tr->fops) { - ftrace_free_filter(tr->fops); - kfree(tr->fops); - } + hlist_del(&tr->hlist_key); + hlist_del(&tr->hlist_ip); + direct_ops_free(tr); kfree(tr); out: mutex_unlock(&trampoline_mutex); @@ -949,7 +1149,7 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); @@ -993,7 +1193,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); rcu_read_unlock_migrate(); } @@ -1029,7 +1229,7 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); @@ -1044,7 +1244,7 @@ void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); migrate_enable(); rcu_read_unlock_trace(); } @@ -1179,7 +1379,9 @@ static int __init init_trampolines(void) int i; for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) - INIT_HLIST_HEAD(&trampoline_table[i]); + INIT_HLIST_HEAD(&trampoline_key_table[i]); + for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&trampoline_ip_table[i]); return 0; } late_initcall(init_trampolines); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3135643d5695..edf5342b982f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -272,8 +272,13 @@ static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_KFUNC_CALL; } +struct bpf_map_desc { + struct bpf_map *ptr; + int uid; +}; + struct bpf_call_arg_meta { - struct bpf_map *map_ptr; + struct bpf_map_desc map; bool raw_mode; bool pkt_access; u8 release_regno; @@ -283,7 +288,6 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int dynptr_id; - int map_uid; int func_id; struct btf *btf; u32 
btf_id; @@ -294,6 +298,14 @@ struct bpf_call_arg_meta { s64 const_map_key; }; +struct bpf_kfunc_meta { + struct btf *btf; + const struct btf_type *proto; + const char *name; + const u32 *flags; + s32 id; +}; + struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; @@ -343,10 +355,7 @@ struct bpf_kfunc_call_arg_meta { u8 spi; u8 frameno; } iter; - struct { - struct bpf_map *ptr; - int uid; - } map; + struct bpf_map_desc map; u64 mem_size; }; @@ -512,7 +521,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id); static bool is_task_work_add_kfunc(u32 func_id); static bool is_sync_callback_calling_function(enum bpf_func_id func_id) @@ -554,7 +563,7 @@ static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn /* bpf_wq and bpf_task_work callbacks are always sleepable. */ if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 && - (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) + (is_bpf_wq_set_callback_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) return true; verifier_bug(env, "unhandled async callback in is_async_cb_sleepable"); @@ -2341,6 +2350,18 @@ static void __mark_reg32_unbounded(struct bpf_reg_state *reg) reg->u32_max_value = U32_MAX; } +static void reset_reg64_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg64_unbounded(reg); + reg->var_off = tnum_unknown; +} + +static void reset_reg32_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg32_unbounded(reg); + reg->var_off = tnum_unknown; +} + static void __update_reg32_bounds(struct bpf_reg_state *reg) { struct tnum var32_off = tnum_subreg(reg->var_off); @@ -3263,16 +3284,105 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) return btf_vmlinux ?: ERR_PTR(-ENOENT); } -static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) +#define KF_IMPL_SUFFIX "_impl" + +static const struct btf_type *find_kfunc_impl_proto(struct bpf_verifier_env *env, + struct btf *btf, + const char *func_name) +{ + char *buf = env->tmp_str_buf; + const struct btf_type *func; + s32 impl_id; + int len; + + len = snprintf(buf, TMP_STR_BUF_LEN, "%s%s", func_name, KF_IMPL_SUFFIX); + if (len < 0 || len >= TMP_STR_BUF_LEN) { + verbose(env, "function name %s%s is too long\n", func_name, KF_IMPL_SUFFIX); + return NULL; + } + + impl_id = btf_find_by_name_kind(btf, buf, BTF_KIND_FUNC); + if (impl_id <= 0) { + verbose(env, "cannot find function %s in BTF\n", buf); + return NULL; + } + + func = btf_type_by_id(btf, impl_id); + + return btf_type_by_id(btf, func->type); +} + +static int fetch_kfunc_meta(struct bpf_verifier_env *env, + s32 func_id, + s16 offset, + struct bpf_kfunc_meta *kfunc) { const struct btf_type *func, *func_proto; + const char *func_name; + u32 *kfunc_flags; + struct btf *btf; + + if (func_id <= 0) { + verbose(env, "invalid kernel function btf_id %d\n", func_id); + return -EINVAL; + } + + btf = find_kfunc_desc_btf(env, offset); + if (IS_ERR(btf)) { + verbose(env, "failed to find BTF for kernel function\n"); + return PTR_ERR(btf); + } + + /* + * Note that kfunc_flags may be NULL at this point, which + * means that we couldn't find func_id in any relevant + * kfunc_id_set. This most likely indicates an invalid kfunc + * call. 
However we don't fail with an error here, + * and let the caller decide what to do with NULL kfunc->flags. + */ + kfunc_flags = btf_kfunc_flags(btf, func_id, env->prog); + + func = btf_type_by_id(btf, func_id); + if (!func || !btf_type_is_func(func)) { + verbose(env, "kernel btf_id %d is not a function\n", func_id); + return -EINVAL; + } + + func_name = btf_name_by_offset(btf, func->name_off); + + /* + * An actual prototype of a kfunc with KF_IMPLICIT_ARGS flag + * can be found through the counterpart _impl kfunc. + */ + if (kfunc_flags && (*kfunc_flags & KF_IMPLICIT_ARGS)) + func_proto = find_kfunc_impl_proto(env, btf, func_name); + else + func_proto = btf_type_by_id(btf, func->type); + + if (!func_proto || !btf_type_is_func_proto(func_proto)) { + verbose(env, "kernel function btf_id %d does not have a valid func_proto\n", + func_id); + return -EINVAL; + } + + memset(kfunc, 0, sizeof(*kfunc)); + kfunc->btf = btf; + kfunc->id = func_id; + kfunc->name = func_name; + kfunc->proto = func_proto; + kfunc->flags = kfunc_flags; + + return 0; +} + +static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) +{ struct bpf_kfunc_btf_tab *btf_tab; struct btf_func_model func_model; struct bpf_kfunc_desc_tab *tab; struct bpf_prog_aux *prog_aux; + struct bpf_kfunc_meta kfunc; struct bpf_kfunc_desc *desc; - const char *func_name; - struct btf *desc_btf; unsigned long addr; int err; @@ -3322,12 +3432,6 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, offset); - if (IS_ERR(desc_btf)) { - verbose(env, "failed to find BTF for kernel function\n"); - return PTR_ERR(desc_btf); - } - if (find_kfunc_desc(env->prog, func_id, offset)) return 0; @@ -3336,24 +3440,13 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -E2BIG; } - func = btf_type_by_id(desc_btf, func_id); - if (!func || !btf_type_is_func(func)) { - verbose(env, "kernel btf_id %u is not a function\n", - func_id); - return -EINVAL; - } - func_proto = btf_type_by_id(desc_btf, func->type); - if (!func_proto || !btf_type_is_func_proto(func_proto)) { - verbose(env, "kernel function btf_id %u does not have a valid func_proto\n", - func_id); - return -EINVAL; - } + err = fetch_kfunc_meta(env, func_id, offset, &kfunc); + if (err) + return err; - func_name = btf_name_by_offset(desc_btf, func->name_off); - addr = kallsyms_lookup_name(func_name); + addr = kallsyms_lookup_name(kfunc.name); if (!addr) { - verbose(env, "cannot find address for kernel function %s\n", - func_name); + verbose(env, "cannot find address for kernel function %s\n", kfunc.name); return -EINVAL; } @@ -3363,9 +3456,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return err; } - err = btf_distill_func_proto(&env->log, desc_btf, - func_proto, func_name, - &func_model); + err = btf_distill_func_proto(&env->log, kfunc.btf, kfunc.proto, kfunc.name, &func_model); if (err) return err; @@ -5427,6 +5518,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, */ s32 subreg_def = state->regs[dst_regno].subreg_def; + if (env->bpf_capable && size == 4 && spill_size == 4 && + get_reg_width(reg) <= 32) + /* Ensure stack slot has an ID to build a relation + * with the destination register on fill. 
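[Editor's note] One way to read KF_IMPLICIT_ARGS plus the "_impl" lookup is as a naming contract. A hypothetical pair (both names invented here, purely to illustrate; how the kernel wires the two together is not shown in this hunk):

        /* exposed to programs and flagged KF_IMPLICIT_ARGS in its kfunc id set */
        int bpf_foo(int x);

        /* the twin the verifier resolves via the "_impl" suffix; its BTF
         * prototype supplies the hidden trailing argument, which is now
         * recognized by type (struct bpf_prog_aux, KF_ARG_PROG_AUX_ID below)
         * rather than by a "__prog" name suffix */
        int bpf_foo_impl(int x, struct bpf_prog_aux *aux);

The renames later in this patch (bpf_wq_set_callback_impl becoming bpf_wq_set_callback, likewise the task_work schedule kfuncs) fit that reading: programs call the short name, the verifier fetches the full prototype from the _impl counterpart.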
+ */ + assign_scalar_id_before_mov(env, reg); copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; @@ -5472,6 +5569,11 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, } } else if (dst_regno >= 0) { /* restore register state from stack */ + if (env->bpf_capable) + /* Ensure stack slot has an ID to build a relation + * with the destination register on fill. + */ + assign_scalar_id_before_mov(env, reg); copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() @@ -5654,8 +5756,8 @@ static int check_stack_write(struct bpf_verifier_env *env, static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, int off, int size, enum bpf_access_type type) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_map *map = reg->map_ptr; u32 cap = bpf_map_flags_to_cap(map); if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { @@ -6168,8 +6270,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); int err; /* We may have added a variable offset to the packet pointer; but any @@ -6256,8 +6357,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int size, enum bpf_access_type t) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_insn_access_aux info = {}; bool valid; @@ -7453,8 +7553,7 @@ static int check_stack_access_within_bounds( int regno, int off, int access_size, enum bpf_access_type type) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); s64 min_off, max_off; int err; @@ -8408,7 +8507,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) { bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; const char *lock_str = is_res_lock ? 
"bpf_res_spin" : "bpf_spin"; - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); bool is_irq = flags & PROCESS_LOCK_IRQ; @@ -8522,9 +8621,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) /* Check if @regno is a pointer to a specific field in a map value */ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, - enum btf_field_type field_type) + enum btf_field_type field_type, + struct bpf_map_desc *map_desc) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; @@ -8565,78 +8665,41 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, val + reg->off, struct_name, field_off); return -EINVAL; } + if (map_desc->ptr) { + verifier_bug(env, "Two map pointers in a %s helper", struct_name); + return -EFAULT; + } + map_desc->uid = reg->map_uid; + map_desc->ptr = map; return 0; } static int process_timer_func(struct bpf_verifier_env *env, int regno, - struct bpf_call_arg_meta *meta) + struct bpf_map_desc *map) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_map *map = reg->map_ptr; - int err; - - err = check_map_field_pointer(env, regno, BPF_TIMER); - if (err) - return err; - - if (meta->map_ptr) { - verifier_bug(env, "Two map pointers in a timer helper"); - return -EFAULT; - } if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - meta->map_uid = reg->map_uid; - meta->map_ptr = map; - return 0; + return check_map_field_pointer(env, regno, BPF_TIMER, map); } -static int process_wq_func(struct bpf_verifier_env *env, int regno, - struct bpf_kfunc_call_arg_meta *meta) +static int process_timer_helper(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_map *map = reg->map_ptr; - int err; - - err = check_map_field_pointer(env, regno, BPF_WORKQUEUE); - if (err) - return err; - - if (meta->map.ptr) { - verifier_bug(env, "Two map pointers in a bpf_wq helper"); - return -EFAULT; - } - - meta->map.uid = reg->map_uid; - meta->map.ptr = map; - return 0; + return process_timer_func(env, regno, &meta->map); } -static int process_task_work_func(struct bpf_verifier_env *env, int regno, - struct bpf_kfunc_call_arg_meta *meta) +static int process_timer_kfunc(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_map *map = reg->map_ptr; - int err; - - err = check_map_field_pointer(env, regno, BPF_TASK_WORK); - if (err) - return err; - - if (meta->map.ptr) { - verifier_bug(env, "Two map pointers in a bpf_task_work helper"); - return -EFAULT; - } - meta->map.uid = reg->map_uid; - meta->map.ptr = map; - return 0; + return process_timer_func(env, regno, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct btf_field *kptr_field; struct bpf_map *map_ptr; struct btf_record *rec; @@ -8652,7 +8715,7 @@ static int 
process_kptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } rec = map_ptr->record; - meta->map_ptr = map_ptr; + meta->map.ptr = map_ptr; } if (!tnum_is_const(reg->var_off)) { @@ -8709,7 +8772,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { @@ -8829,7 +8892,7 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); const struct btf_type *t; int spi, err, i, nr_slots, btf_id; @@ -8944,15 +9007,24 @@ static bool regs_exact(const struct bpf_reg_state *rold, const struct bpf_reg_state *rcur, struct bpf_idmap *idmap); +/* + * Check if scalar registers are exact for the purpose of not widening. + * More lenient than regs_exact() + */ +static bool scalars_exact_for_widen(const struct bpf_reg_state *rold, + const struct bpf_reg_state *rcur) +{ + return !memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)); +} + static void maybe_widen_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct bpf_idmap *idmap) + struct bpf_reg_state *rold, struct bpf_reg_state *rcur) { if (rold->type != SCALAR_VALUE) return; if (rold->type != rcur->type) return; - if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap)) + if (rold->precise || rcur->precise || scalars_exact_for_widen(rold, rcur)) return; __mark_reg_unknown(env, rcur); } @@ -8964,7 +9036,6 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, struct bpf_func_state *fold, *fcur; int i, fr, num_slots; - reset_idmap_scratch(env); for (fr = old->curframe; fr >= 0; fr--) { fold = old->frame[fr]; fcur = cur->frame[fr]; @@ -8972,8 +9043,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) maybe_widen_reg(env, &fold->regs[i], - &fcur->regs[i], - &env->idmap_scratch); + &fcur->regs[i]); num_slots = min(fold->allocated_stack / BPF_REG_SIZE, fcur->allocated_stack / BPF_REG_SIZE); @@ -8984,8 +9054,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, maybe_widen_reg(env, &fold->stack[i].spilled_ptr, - &fcur->stack[i].spilled_ptr, - &env->idmap_scratch); + &fcur->stack[i].spilled_ptr); } } return 0; @@ -9159,13 +9228,13 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_arg_type *arg_type) { - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* kernel subsystem misconfigured verifier */ verifier_bug(env, "invalid map_ptr to access map->type"); return -EFAULT; } - switch (meta->map_ptr->map_type) { + switch (meta->map.ptr->map_type) { case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: if (*arg_type == ARG_PTR_TO_MAP_VALUE) { @@ -9301,7 +9370,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, const u32 *arg_btf_id, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const 
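[Editor's note] On the program side, the case the map_uid comparison (together with the BPF_TIMER field check consolidated above) guards against looks roughly like this; a sketch in selftest style that the verifier is expected to reject at the final call, because the timer lives in the map returned by one lookup while R2 carries the map from another. The attach point is incidental, and per the hunk above bpf_timer is unavailable on PREEMPT_RT altogether.

        #include <vmlinux.h>
        #include <bpf/bpf_helpers.h>

        struct elem {
                __u64 cnt;
                struct bpf_timer t;     /* BPF_TIMER field at a fixed offset */
        };

        struct inner_map {
                __uint(type, BPF_MAP_TYPE_ARRAY);
                __uint(max_entries, 1);
                __type(key, __u32);
                __type(value, struct elem);
        } inner_map1 SEC(".maps");

        struct {
                __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
                __uint(max_entries, 2);
                __type(key, __u32);
                __array(values, struct inner_map);
        } outer SEC(".maps") = {
                .values = { &inner_map1 },
        };

        SEC("fentry/bpf_fentry_test1")
        int mismatched_timer_init(void *ctx)
        {
                __u32 key1 = 0, key2 = 1, zero = 0;
                void *m1 = bpf_map_lookup_elem(&outer, &key1);  /* map_uid A */
                void *m2 = bpf_map_lookup_elem(&outer, &key2);  /* map_uid B */
                struct elem *e;

                if (!m1 || !m2)
                        return 0;
                e = bpf_map_lookup_elem(m1, &zero);
                if (!e)
                        return 0;
                /* rejected: &e->t lives in the map behind m1, but R2 is m2;
                 * both lookups share the same inner map template, so only
                 * map_uid tells them apart */
                bpf_timer_init(&e->t, m2, 1 /* CLOCK_MONOTONIC */);
                return 0;
        }

        char LICENSE[] SEC("license") = "GPL";

Passing m1 instead of m2 to bpf_timer_init() would make the program acceptable.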
struct bpf_reg_types *compatible; int i, j; @@ -9719,7 +9788,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, int insn_idx) { u32 regno = BPF_REG_1 + arg; - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_arg_type arg_type = fn->arg_type[arg]; enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; @@ -9819,7 +9888,7 @@ skip_type_check: switch (base_type(arg_type)) { case ARG_CONST_MAP_PTR: /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ - if (meta->map_ptr) { + if (meta->map.ptr) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -9832,23 +9901,23 @@ skip_type_check: * * Comparing map_ptr is enough to distinguish normal and outer maps. */ - if (meta->map_ptr != reg->map_ptr || - meta->map_uid != reg->map_uid) { + if (meta->map.ptr != reg->map_ptr || + meta->map.uid != reg->map_uid) { verbose(env, "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", - meta->map_uid, reg->map_uid); + meta->map.uid, reg->map_uid); return -EINVAL; } } - meta->map_ptr = reg->map_ptr; - meta->map_uid = reg->map_uid; + meta->map.ptr = reg->map_ptr; + meta->map.uid = reg->map_uid; break; case ARG_PTR_TO_MAP_KEY: /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized */ - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* in function declaration map_ptr must come before * map_key, so that it's verified and known before * we have to check map_key here. Otherwise it means @@ -9857,11 +9926,11 @@ skip_type_check: verifier_bug(env, "invalid map_ptr to access map->key"); return -EFAULT; } - key_size = meta->map_ptr->key_size; + key_size = meta->map.ptr->key_size; err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); if (err) return err; - if (can_elide_value_nullness(meta->map_ptr->map_type)) { + if (can_elide_value_nullness(meta->map.ptr->map_type)) { err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); if (err < 0) { meta->const_map_key = -1; @@ -9879,13 +9948,13 @@ skip_type_check: /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* kernel subsystem misconfigured verifier */ verifier_bug(env, "invalid map_ptr to access map->value"); return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, + err = check_helper_mem_access(env, regno, meta->map.ptr->value_size, arg_type & MEM_WRITE ? 
BPF_WRITE : BPF_READ, false, meta); break; @@ -9916,7 +9985,7 @@ skip_type_check: } break; case ARG_PTR_TO_TIMER: - err = process_timer_func(env, regno, meta); + err = process_timer_helper(env, regno, meta); if (err) return err; break; @@ -10354,10 +10423,27 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) return true; } -static int check_func_proto(const struct bpf_func_proto *fn, int func_id) +static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { + enum bpf_arg_type arg_type = fn->arg_type[i]; + + if (base_type(arg_type) != ARG_PTR_TO_MEM) + continue; + if (!(arg_type & (MEM_WRITE | MEM_RDONLY))) + return false; + } + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && + check_mem_arg_rw_flag_ok(fn) && check_btf_id_ok(fn) ? 0 : -EINVAL; } @@ -11206,7 +11292,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; - struct bpf_map *map = meta->map_ptr; + struct bpf_map *map = meta->map.ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && @@ -11239,11 +11325,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, } if (!aux->map_ptr_state.map_ptr) - bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1, false); - else if (aux->map_ptr_state.map_ptr != meta->map_ptr) - bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1, true); + bpf_map_ptr_store(aux, meta->map.ptr, + !meta->map.ptr->bypass_spec_v1, false); + else if (aux->map_ptr_state.map_ptr != meta->map.ptr) + bpf_map_ptr_store(aux, meta->map.ptr, + !meta->map.ptr->bypass_spec_v1, true); return 0; } @@ -11252,8 +11338,8 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; - struct bpf_reg_state *regs = cur_regs(env), *reg; - struct bpf_map *map = meta->map_ptr; + struct bpf_reg_state *reg; + struct bpf_map *map = meta->map.ptr; u64 val, max; int err; @@ -11264,7 +11350,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EINVAL; } - reg = ®s[BPF_REG_3]; + reg = reg_state(env, BPF_REG_3); val = reg->var_off.value; max = map->max_entries; @@ -11410,8 +11496,7 @@ static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env) static bool loop_flag_is_zero(struct bpf_verifier_env *env) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[BPF_REG_4]; + struct bpf_reg_state *reg = reg_state(env, BPF_REG_4); bool reg_is_null = register_is_null(reg); if (reg_is_null) @@ -11471,6 +11556,7 @@ static inline bool in_sleepable_context(struct bpf_verifier_env *env) { return !env->cur_state->active_rcu_locks && !env->cur_state->active_preempt_locks && + !env->cur_state->active_locks && !env->cur_state->active_irq_id && in_sleepable(env); } @@ -11529,7 +11615,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn, func_id); + err = check_func_proto(fn); if (err) { verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; @@ -11809,22 +11895,22 @@ static int check_helper_call(struct bpf_verifier_env *env, struct 
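[Editor's note] check_mem_arg_rw_flag_ok() turns an implicit convention into a hard rule: a helper proto must now state the direction of every ARG_PTR_TO_MEM argument. A hypothetical (invented) helper that satisfies the check:

        /* invented helper, only to show a proto that passes the new check */
        BPF_CALL_2(bpf_example_sum, const u8 *, data, u32, size)
        {
                u64 sum = 0;
                u32 i;

                for (i = 0; i < size; i++)
                        sum += data[i];
                return sum;
        }

        static const struct bpf_func_proto bpf_example_sum_proto = {
                .func           = bpf_example_sum,
                .gpl_only       = false,
                .ret_type       = RET_INTEGER,
                .arg1_type      = ARG_PTR_TO_MEM | MEM_RDONLY,  /* bare ARG_PTR_TO_MEM
                                                                 * would now be rejected */
                .arg2_type      = ARG_CONST_SIZE,
        };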
bpf_insn *insn * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ - if (meta.map_ptr == NULL) { + if (meta.map.ptr == NULL) { verifier_bug(env, "unexpected null map_ptr"); return -EFAULT; } if (func_id == BPF_FUNC_map_lookup_elem && - can_elide_value_nullness(meta.map_ptr->map_type) && + can_elide_value_nullness(meta.map.ptr->map_type) && meta.const_map_key >= 0 && - meta.const_map_key < meta.map_ptr->max_entries) + meta.const_map_key < meta.map.ptr->max_entries) ret_flag &= ~PTR_MAYBE_NULL; - regs[BPF_REG_0].map_ptr = meta.map_ptr; - regs[BPF_REG_0].map_uid = meta.map_uid; + regs[BPF_REG_0].map_ptr = meta.map.ptr; + regs[BPF_REG_0].map_uid = meta.map.uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_flag) && - btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { + btf_record_has_field(meta.map.ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -11927,7 +12013,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; - if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) { + if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) { verifier_bug(env, "func %s#%d sets ref_obj_id more than once", func_id_name(func_id), func_id); return -EFAULT; @@ -11939,7 +12025,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id, meta.map_ptr)) { + } else if (is_acquire_function(func_id, meta.map.ptr)) { int id = acquire_reference(env, insn_idx); if (id < 0) @@ -11954,7 +12040,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (err) return err; - err = check_map_func_compatibility(env, meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map.ptr, func_id); if (err) return err; @@ -12045,11 +12131,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RELEASE; } -static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta) -{ - return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta); -} - static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_SLEEPABLE; @@ -12096,11 +12177,6 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf, return btf_param_match_suffix(btf, arg, "__szk"); } -static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg) -{ - return btf_param_match_suffix(btf, arg, "__opt"); -} - static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__k"); @@ -12146,11 +12222,6 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__irq_flag"); } -static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg) -{ - return btf_param_match_suffix(btf, arg, "__prog"); -} - static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -12179,6 +12250,8 @@ enum { KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, KF_ARG_TASK_WORK_ID, + KF_ARG_PROG_AUX_ID, + KF_ARG_TIMER_ID }; BTF_ID_LIST(kf_arg_btf_ids) @@ 
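[Editor's note] The nullness elision referenced here is what lets a pattern like the following load without the usual NULL check, assuming can_elide_value_nullness() covers the map type (plain arrays) and the key is an in-range constant on the stack; a sketch in selftest style:

        #include <vmlinux.h>
        #include <bpf/bpf_helpers.h>

        struct {
                __uint(type, BPF_MAP_TYPE_ARRAY);
                __uint(max_entries, 4);
                __type(key, __u32);
                __type(value, __u64);
        } counters SEC(".maps");

        SEC("fentry/bpf_fentry_test1")
        int bump(void *ctx)
        {
                __u32 key = 1;  /* constant and < max_entries */
                __u64 *val = bpf_map_lookup_elem(&counters, &key);

                /* no NULL check: the verifier already knows an in-bounds
                 * constant key on a plain array cannot miss */
                __sync_fetch_and_add(val, 1);
                return 0;
        }

        char LICENSE[] SEC("license") = "GPL";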
-12190,6 +12263,8 @@ BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) BTF_ID(struct, bpf_task_work) +BTF_ID(struct, bpf_prog_aux) +BTF_ID(struct, bpf_timer) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12233,6 +12308,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } +static bool is_kfunc_arg_timer(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TIMER_ID); +} + static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); @@ -12270,6 +12350,11 @@ static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf return true; } +static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_PROG_AUX_ID); +} + /* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, const struct btf *btf, @@ -12327,6 +12412,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, + KF_ARG_PTR_TO_TIMER, KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, @@ -12363,7 +12449,7 @@ enum special_kfunc_type { KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, - KF_bpf_wq_set_callback_impl, + KF_bpf_wq_set_callback, KF_bpf_preempt_disable, KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, @@ -12383,8 +12469,14 @@ enum special_kfunc_type { KF_bpf_dynptr_from_file, KF_bpf_dynptr_file_discard, KF___bpf_trap, - KF_bpf_task_work_schedule_signal_impl, - KF_bpf_task_work_schedule_resume_impl, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, + KF_bpf_arena_alloc_pages, + KF_bpf_arena_free_pages, + KF_bpf_arena_reserve_pages, + KF_bpf_session_is_return, + KF_bpf_stream_vprintk, + KF_bpf_stream_print_stack, }; BTF_ID_LIST(special_kfunc_list) @@ -12424,7 +12516,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) -BTF_ID(func, bpf_wq_set_callback_impl) +BTF_ID(func, bpf_wq_set_callback) BTF_ID(func, bpf_preempt_disable) BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS @@ -12457,13 +12549,19 @@ BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, bpf_dynptr_from_file) BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) -BTF_ID(func, bpf_task_work_schedule_signal_impl) -BTF_ID(func, bpf_task_work_schedule_resume_impl) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) +BTF_ID(func, bpf_arena_alloc_pages) +BTF_ID(func, bpf_arena_free_pages) +BTF_ID(func, bpf_arena_reserve_pages) +BTF_ID(func, bpf_session_is_return) +BTF_ID(func, bpf_stream_vprintk) +BTF_ID(func, bpf_stream_print_stack) static bool is_task_work_add_kfunc(u32 func_id) { - return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || - func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) @@ -12513,9 +12611,16 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct 
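[Editor's note] bpf_session_is_return() and bpf_session_cookie() already exist for kprobe.session programs; the new BPF_TRACE_FSESSION attach type appears to bring the same entry/exit model to fentry, though the section name for that flavor is not visible in this hunk. The established kprobe.session usage, roughly as in the selftests:

        #include <vmlinux.h>
        #include <bpf/bpf_helpers.h>

        extern bool bpf_session_is_return(void) __ksym;
        extern __u64 *bpf_session_cookie(void) __ksym;

        SEC("kprobe.session/do_unlinkat")
        int handle(struct pt_regs *ctx)
        {
                __u64 *cookie = bpf_session_cookie();

                if (!bpf_session_is_return()) {
                        *cookie = bpf_ktime_get_ns();   /* stash at entry */
                        return 0;                       /* 0: run the return probe too */
                }

                bpf_printk("latency %llu ns", bpf_ktime_get_ns() - *cookie);
                return 0;
        }

        char LICENSE[] SEC("license") = "GPL";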
bpf_reg_state *reg = ®s[regno]; bool arg_mem_size = false; - if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) + if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + meta->func_id == special_kfunc_list[KF_bpf_session_is_return] || + meta->func_id == special_kfunc_list[KF_bpf_session_cookie]) return KF_ARG_PTR_TO_CTX; + if (argno + 1 < nargs && + (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || + is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) + arg_mem_size = true; + /* In this function, we verify the kfunc's BTF as per the argument type, * leaving the rest of the verification with respect to the register * type to our caller. When a set of conditions hold in the BTF type of @@ -12524,7 +12629,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) return KF_ARG_PTR_TO_CTX; - if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) + if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg) && + !arg_mem_size) return KF_ARG_PTR_TO_NULL; if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) @@ -12560,6 +12666,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_timer(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TIMER; + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) return KF_ARG_PTR_TO_TASK_WORK; @@ -12581,11 +12690,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; - if (argno + 1 < nargs && - (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || - is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) - arg_mem_size = true; - /* This is the catch all argument type of register types supported by * check_helper_mem_access. However, we only allow when argument type is * pointer to scalar, or struct composed (recursively) of scalars. When @@ -12625,7 +12729,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, /* Enforce strict type matching for calls to kfuncs that are acquiring * or releasing a reference, or are no-cast aliases. We do _not_ - * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default, + * enforce strict matching for kfuncs by default, * as we want to enable BPF programs to pass types that are bitwise * equivalent without forcing them to explicitly cast with something * like bpf_cast_to_kern_ctx(). 
@@ -12675,7 +12779,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, static int process_irq_flag(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); int err, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; @@ -12893,10 +12997,24 @@ static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; } +static bool is_bpf_arena_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_arena_alloc_pages] || + btf_id == special_kfunc_list[KF_bpf_arena_free_pages] || + btf_id == special_kfunc_list[KF_bpf_arena_reserve_pages]; +} + +static bool is_bpf_stream_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_stream_vprintk] || + btf_id == special_kfunc_list[KF_bpf_stream_print_stack]; +} + static bool kfunc_spin_allowed(u32 btf_id) { return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || - is_bpf_res_spin_lock_kfunc(btf_id); + is_bpf_res_spin_lock_kfunc(btf_id) || is_bpf_arena_kfunc(btf_id) || + is_bpf_stream_kfunc(btf_id); } static bool is_sync_callback_calling_kfunc(u32 btf_id) @@ -12906,7 +13024,7 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + return is_bpf_wq_set_callback_kfunc(btf_id) || is_task_work_add_kfunc(btf_id); } @@ -12916,9 +13034,9 @@ static bool is_bpf_throw_kfunc(struct bpf_insn *insn) insn->imm == special_kfunc_list[KF_bpf_throw]; } -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id) +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback]; } static bool is_callback_calling_kfunc(u32 btf_id) @@ -13192,8 +13310,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_ignore(btf, &args[i])) continue; - if (is_kfunc_arg_prog(btf, &args[i])) { - /* Used to reject repeated use of __prog. 
*/ + if (is_kfunc_arg_prog_aux(btf, &args[i])) { + /* Reject repeated use bpf_prog_aux */ if (meta->arg_prog) { verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); return -EFAULT; @@ -13254,9 +13372,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && - (register_is_null(reg) || type_may_be_null(reg->type)) && - !is_kfunc_arg_nullable(meta->btf, &args[i])) { + if ((register_is_null(reg) || type_may_be_null(reg->type)) && + !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; } @@ -13321,9 +13438,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: - if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) - break; - if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "R%d must be referenced or trusted\n", regno); @@ -13348,6 +13462,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TIMER: case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: @@ -13575,7 +13690,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ struct bpf_reg_state *size_reg = ®s[regno + 1]; const struct btf_param *size_arg = &args[i + 1]; - if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) { + if (!register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); if (ret < 0) { verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); @@ -13643,7 +13758,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = process_wq_func(env, regno, meta); + ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_TIMER: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_timer_kfunc(env, regno, meta); if (ret < 0) return ret; break; @@ -13652,7 +13776,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = process_task_work_func(env, regno, meta); + ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map); if (ret < 0) return ret; break; @@ -13699,44 +13823,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return 0; } -static int fetch_kfunc_meta(struct bpf_verifier_env *env, - struct bpf_insn *insn, - struct bpf_kfunc_call_arg_meta *meta, - const char **kfunc_name) +static int fetch_kfunc_arg_meta(struct bpf_verifier_env *env, + s32 func_id, + s16 offset, + struct bpf_kfunc_call_arg_meta *meta) { - const struct btf_type *func, *func_proto; - u32 func_id, *kfunc_flags; - const char *func_name; - struct btf *desc_btf; - - if (kfunc_name) - *kfunc_name = NULL; + struct bpf_kfunc_meta kfunc; + int err; - if (!insn->imm) - return -EINVAL; + err = fetch_kfunc_meta(env, func_id, offset, &kfunc); + if (err) + return err; - desc_btf = 
find_kfunc_desc_btf(env, insn->off); - if (IS_ERR(desc_btf)) - return PTR_ERR(desc_btf); + memset(meta, 0, sizeof(*meta)); + meta->btf = kfunc.btf; + meta->func_id = kfunc.id; + meta->func_proto = kfunc.proto; + meta->func_name = kfunc.name; - func_id = insn->imm; - func = btf_type_by_id(desc_btf, func_id); - func_name = btf_name_by_offset(desc_btf, func->name_off); - if (kfunc_name) - *kfunc_name = func_name; - func_proto = btf_type_by_id(desc_btf, func->type); - - kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog); - if (!kfunc_flags) { + if (!kfunc.flags || !btf_kfunc_is_allowed(kfunc.btf, kfunc.id, env->prog)) return -EACCES; - } - memset(meta, 0, sizeof(*meta)); - meta->btf = desc_btf; - meta->func_id = func_id; - meta->kfunc_flags = *kfunc_flags; - meta->func_proto = func_proto; - meta->func_name = func_name; + meta->kfunc_flags = *kfunc.flags; return 0; } @@ -13941,12 +14049,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (!insn->imm) return 0; - err = fetch_kfunc_meta(env, insn, &meta, &func_name); - if (err == -EACCES && func_name) - verbose(env, "calling kernel function %s is not allowed\n", func_name); + err = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); + if (err == -EACCES && meta.func_name) + verbose(env, "calling kernel function %s is not allowed\n", meta.func_name); if (err) return err; desc_btf = meta.btf; + func_name = meta.func_name; insn_aux = &env->insn_aux_data[insn_idx]; insn_aux->is_iter_next = is_iter_next_kfunc(&meta); @@ -14016,7 +14125,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.r0_rdonly = false; } - if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { + if (is_bpf_wq_set_callback_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_timer_callback_state); if (err) { @@ -14154,8 +14263,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - for (i = 0; i < CALLER_SAVED_REGS; i++) - mark_reg_not_init(env, regs, caller_saved[i]); + for (i = 0; i < CALLER_SAVED_REGS; i++) { + u32 regno = caller_saved[i]; + + mark_reg_not_init(env, regs, regno); + regs[regno].subreg_def = DEF_NOT_SUBREG; + } /* Check return type */ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); @@ -14220,26 +14333,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; } else { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].type = PTR_TO_BTF_ID; - regs[BPF_REG_0].btf_id = ptr_type_id; + enum bpf_reg_type type = PTR_TO_BTF_ID; if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) - regs[BPF_REG_0].type |= PTR_UNTRUSTED; - else if (is_kfunc_rcu_protected(&meta)) - regs[BPF_REG_0].type |= MEM_RCU; - - if (is_iter_next_kfunc(&meta)) { - struct bpf_reg_state *cur_iter; - - cur_iter = get_iter_from_state(env->cur_state, &meta); - - if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */ - regs[BPF_REG_0].type |= MEM_RCU; - else - regs[BPF_REG_0].type |= PTR_TRUSTED; + type |= PTR_UNTRUSTED; + else if (is_kfunc_rcu_protected(&meta) || + (is_iter_next_kfunc(&meta) && + (get_iter_from_state(env->cur_state, &meta) + ->type & MEM_RCU))) { + /* + * If the iterator's constructor (the _new + * function e.g., bpf_iter_task_new) has been + * annotated with BPF kfunc flag + * KF_RCU_PROTECTED and was called within a RCU + * read-side critical section, 
also propagate + * the MEM_RCU flag to the pointer returned from + * the iterator's next function (e.g., + * bpf_iter_task_next). + */ + type |= MEM_RCU; + } else { + /* + * Any PTR_TO_BTF_ID that is returned from a BPF + * kfunc should by default be treated as + * implicitly trusted. + */ + type |= PTR_TRUSTED; } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].type = type; + regs[BPF_REG_0].btf_id = ptr_type_id; } if (is_kfunc_ret_null(&meta)) { @@ -14295,6 +14420,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } + if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) + env->prog->call_session_cookie = true; + return 0; } @@ -15081,6 +15209,252 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, } } +static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + + *dst_umin = *dst_umin / src_val; + *dst_umax = *dst_umax / src_val; + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + + *dst_umin = div64_u64(*dst_umin, src_val); + *dst_umax = div64_u64(*dst_umax, src_val); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 res1, res2; + + /* BPF div specification: S32_MIN / -1 = S32_MIN */ + if (*dst_smin == S32_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S32_MIN, + * we cannot precisely track the result, so it becomes unbounded. + * e.g., [S32_MIN, S32_MIN+10]/(-1), + * = {S32_MIN} U [-(S32_MIN+10), -(S32_MIN+1)] + * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX] + * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN. + */ + if (*dst_smax != S32_MIN) { + *dst_smin = S32_MIN; + *dst_smax = S32_MAX; + } + goto reset; + } + + res1 = *dst_smin / src_val; + res2 = *dst_smax / src_val; + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 res1, res2; + + /* BPF div specification: S64_MIN / -1 = S64_MIN */ + if (*dst_smin == S64_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S64_MIN, + * we cannot precisely track the result, so it becomes unbounded. 
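[Editor's note] Concretely, the MEM_RCU propagation described in the comment above corresponds to the open-coded task iterator pattern from the selftests, roughly as below; the attach point and the x86-64 syscall symbol are incidental, and the kfunc declarations follow the usual __ksym convention:

        #include <vmlinux.h>
        #include <bpf/bpf_helpers.h>

        extern void bpf_rcu_read_lock(void) __ksym;
        extern void bpf_rcu_read_unlock(void) __ksym;
        extern int bpf_iter_task_new(struct bpf_iter_task *it,
                                     struct task_struct *task, unsigned int flags) __ksym;
        extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __ksym;
        extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __ksym;

        SEC("fentry.s/__x64_sys_getpgid")
        int count_procs(void *ctx)
        {
                struct task_struct *cur = bpf_get_current_task_btf();
                struct bpf_iter_task it;
                struct task_struct *p;
                int n = 0;

                bpf_rcu_read_lock();
                /* the constructor is RCU-protected and we are inside an RCU
                 * section, so every pointer returned by bpf_iter_task_next()
                 * carries MEM_RCU: usable freely, but only until the unlock */
                bpf_iter_task_new(&it, cur, BPF_TASK_ITER_ALL_PROCS);
                while ((p = bpf_iter_task_next(&it)) != NULL)
                        n++;
                bpf_iter_task_destroy(&it);
                bpf_rcu_read_unlock();

                bpf_printk("procs: %d", n);
                return 0;
        }

        char LICENSE[] SEC("license") = "GPL";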
+ * e.g., [S64_MIN, S64_MIN+10]/(-1), + * = {S64_MIN} U [-(S64_MIN+10), -(S64_MIN+1)] + * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX] + * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN. + */ + if (*dst_smax != S64_MIN) { + *dst_smin = S64_MIN; + *dst_smax = S64_MAX; + } + goto reset; + } + + res1 = div64_s64(*dst_smin, src_val); + res2 = div64_s64(*dst_smax, src_val); + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S32_MIN (-2147483648), src_abs becomes 2147483648. + * Here use unsigned integer to avoid overflow. + */ + u32 src_abs = (src_val > 0) ? (u32)src_val : -(u32)src_val; + + /* + * Calculate the maximum possible absolute value of the result. + * Even if src_abs is 2147483648 (S32_MIN), subtracting 1 gives + * 2147483647 (S32_MAX), which fits perfectly in s32. + */ + s32 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. */ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. 
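[Editor's note] The two corner cases called out in these comments are easy to reproduce outside the verifier; a throwaway userspace check (not verifier code, it only exercises the arithmetic; BPF defines S32_MIN / -1 = S32_MIN, which doing the division in 64 bits and truncating reproduces):

        #include <stdio.h>
        #include <stdint.h>

        static int32_t bpf_sdiv32(int32_t x, int32_t y)
        {
                if (y == 0)             /* BPF: x / 0 = 0 */
                        return 0;
                return (int32_t)((int64_t)x / (int64_t)y);
        }

        int main(void)
        {
                int32_t lo, hi;

                /* sdiv: a dividend range that merely contains S32_MIN, divided
                 * by -1, straddles both ends of s32 -- hence the widening. */
                lo = INT32_MAX; hi = INT32_MIN;
                for (int64_t x = INT32_MIN; x <= (int64_t)INT32_MIN + 10; x++) {
                        int32_t r = bpf_sdiv32((int32_t)x, -1);
                        if (r < lo) lo = r;
                        if (r > hi) hi = r;
                }
                printf("sdiv: [%d, %d]\n", lo, hi);     /* [-2147483648, 2147483647] */

                /* smod: with a constant divisor d the result keeps the
                 * dividend's sign and |result| <= |d| - 1, so [-100, 100] % -7
                 * stays within [-6, 6]. */
                lo = INT32_MAX; hi = INT32_MIN;
                for (int32_t x = -100; x <= 100; x++) {
                        int32_t r = x % -7;     /* C truncated modulo == BPF smod */
                        if (r < lo) lo = r;
                        if (r > hi) hi = r;
                }
                printf("smod: [%d, %d]\n", lo, hi);     /* [-6, 6] */
                return 0;
        }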
*/ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S64_MIN (-2^63), src_abs becomes 2^63. + * Here use unsigned integer to avoid overflow. + */ + u64 src_abs = (src_val > 0) ? (u64)src_val : -(u64)src_val; + + /* + * Calculate the maximum possible absolute value of the result. + * Even if src_abs is 2^63 (S64_MIN), subtracting 1 gives + * 2^63 - 1 (S64_MAX), which fits perfectly in s64. + */ + s64 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. */ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { @@ -15305,21 +15679,17 @@ static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { /* Special case <<32 because it is a common compiler pattern to sign - * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are - * positive we know this shift will also be positive so we can track - * bounds correctly. Otherwise we lose all sign bit information except - * what we can pick up from var_off. Perhaps we can generalize this - * later to shifts of any length. + * extend subreg by doing <<32 s>>32. smin/smax assignments are correct + * because s32 bounds don't flip sign when shifting to the left by + * 32bits. */ - if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0) + if (umin_val == 32 && umax_val == 32) { dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; - else - dst_reg->smax_value = S64_MAX; - - if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0) dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; - else + } else { + dst_reg->smax_value = S64_MAX; dst_reg->smin_value = S64_MIN; + } /* If we might shift our top bit out, then we know nothing */ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { @@ -15462,6 +15832,48 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *insn) +{ + /* + * Byte swap operation - update var_off using tnum_bswap. + * Three cases: + * 1. bswap(16|32|64): opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE) + * unconditional swap + * 2. to_le(16|32|64): opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE) + * swap on big-endian, truncation or no-op on little-endian + * 3. 
to_be(16|32|64): opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE) + * swap on little-endian, truncation or no-op on big-endian + */ + + bool alu64 = BPF_CLASS(insn->code) == BPF_ALU64; + bool to_le = BPF_SRC(insn->code) == BPF_TO_LE; + bool is_big_endian; +#ifdef CONFIG_CPU_BIG_ENDIAN + is_big_endian = true; +#else + is_big_endian = false; +#endif + /* Apply bswap if alu64 or switch between big-endian and little-endian machines */ + bool need_bswap = alu64 || (to_le == is_big_endian); + + if (need_bswap) { + if (insn->imm == 16) + dst_reg->var_off = tnum_bswap16(dst_reg->var_off); + else if (insn->imm == 32) + dst_reg->var_off = tnum_bswap32(dst_reg->var_off); + else if (insn->imm == 64) + dst_reg->var_off = tnum_bswap64(dst_reg->var_off); + /* + * Byteswap scrambles the range, so we must reset bounds. + * Bounds will be re-derived from the new tnum later. + */ + __mark_reg_unbounded(dst_reg); + } + /* For bswap16/32, truncate dst register to match the swapped size */ + if (insn->imm == 16 || insn->imm == 32) + coerce_reg_to_size(dst_reg, insn->imm / 8); +} + static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, const struct bpf_reg_state *src_reg) { @@ -15488,8 +15900,17 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_XOR: case BPF_OR: case BPF_MUL: + case BPF_END: return true; + /* + * Division and modulo operators range is only safe to compute when the + * divisor is a constant. + */ + case BPF_DIV: + case BPF_MOD: + return src_is_const; + /* Shift operators range is only computable if shift dimension operand * is a constant. Shifts greater than 31 or 63 are undefined. This * includes shifts by a negative number. @@ -15503,6 +15924,35 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, } } +static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *insn, + struct bpf_reg_state *dst_reg) +{ + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + bool alu32; + + if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0) + alu32 = false; + else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0) + alu32 = true; + else + return 0; + + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + + regs = branch->frame[branch->curframe]->regs; + if (alu32) { + __mark_reg32_known(®s[insn->dst_reg], 0); + __mark_reg32_known(dst_reg, -1ull); + } else { + __mark_reg_known(®s[insn->dst_reg], 0); + __mark_reg_known(dst_reg, -1ull); + } + return 0; +} + /* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. Therefore, things like bitshifts * need extra checks in the 32-bit case. 
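[Editor's note] Why the byte-swap handling above drops min/max knowledge wholesale can be seen with a trivial userspace experiment (GCC/Clang builtin): even a tight input range scatters across the 32-bit space, and only byte-level (tnum) knowledge survives, to be re-derived into bounds later.

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                uint32_t lo = UINT32_MAX, hi = 0;

                for (uint32_t x = 0; x <= 255; x++) {   /* tight input range [0, 255] */
                        uint32_t y = __builtin_bswap32(x);
                        if (y < lo) lo = y;
                        if (y > hi) hi = y;
                }
                /* prints 0x0 .. 0xff000000: the range explodes, but the swapped
                 * tnum still records that the three low bytes are all zero */
                printf("0x%x .. 0x%x\n", lo, hi);
                return 0;
        }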
@@ -15513,6 +15963,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state src_reg) { u8 opcode = BPF_OP(insn->code); + s16 off = insn->off; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret; @@ -15564,12 +16015,54 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar32_min_max_mul(dst_reg, &src_reg); scalar_min_max_mul(dst_reg, &src_reg); break; + case BPF_DIV: + /* BPF div specification: x / 0 = 0 */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) { + ___mark_reg_known(dst_reg, 0); + break; + } + if (alu32) + if (off == 1) + scalar32_min_max_sdiv(dst_reg, &src_reg); + else + scalar32_min_max_udiv(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_sdiv(dst_reg, &src_reg); + else + scalar_min_max_udiv(dst_reg, &src_reg); + break; + case BPF_MOD: + /* BPF mod specification: x % 0 = x */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) + break; + if (alu32) + if (off == 1) + scalar32_min_max_smod(dst_reg, &src_reg); + else + scalar32_min_max_umod(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_smod(dst_reg, &src_reg); + else + scalar_min_max_umod(dst_reg, &src_reg); + break; case BPF_AND: + if (tnum_is_const(src_reg.var_off)) { + ret = maybe_fork_scalars(env, insn, dst_reg); + if (ret) + return ret; + } dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); scalar32_min_max_and(dst_reg, &src_reg); scalar_min_max_and(dst_reg, &src_reg); break; case BPF_OR: + if (tnum_is_const(src_reg.var_off)) { + ret = maybe_fork_scalars(env, insn, dst_reg); + if (ret) + return ret; + } dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); scalar32_min_max_or(dst_reg, &src_reg); scalar_min_max_or(dst_reg, &src_reg); @@ -15597,12 +16090,23 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, else scalar_min_max_arsh(dst_reg, &src_reg); break; + case BPF_END: + scalar_byte_swap(dst_reg, insn); + break; default: break; } - /* ALU32 ops are zero extended into 64bit register */ - if (alu32) + /* + * ALU32 ops are zero extended into 64bit register. + * + * BPF_END is already handled inside the helper (truncation), + * so skip zext here to avoid unexpected zero extension. + * e.g., le64: opcode=(BPF_END|BPF_ALU|BPF_TO_LE), imm=0x40 + * This is a 64bit byte swap operation with alu32==true, + * but we should not zero extend the result. + */ + if (alu32 && opcode != BPF_END) zext_32_to_64(dst_reg); reg_bounds_sync(dst_reg); return 0; @@ -15705,6 +16209,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, verbose(env, "verifier internal error: no src_reg\n"); return -EFAULT; } + /* + * For alu32 linked register tracking, we need to check dst_reg's + * umax_value before the ALU operation. After adjust_scalar_min_max_vals(), + * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX. + */ + u64 dst_umax = dst_reg->umax_value; + err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) return err; @@ -15714,26 +16225,44 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access - * So for 64-bit alu remember constant delta between r2 and r1 and - * update r1 after 'if' condition. + * So remember constant delta between r2 and r1 and update r1 after + * 'if' condition. 
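[Editor's note] In C source, the pattern this comment describes (and that the BPF_ADD_CONST32/BPF_ADD_CONST64 tracking added below benefits) looks roughly like the sketch here; whether the compiler really keeps two separate registers is of course codegen-dependent:

        #include <vmlinux.h>
        #include <bpf/bpf_helpers.h>

        __u8 buf[1024];

        SEC("tc")
        int linked_regs(struct __sk_buff *skb)
        {
                __u32 idx = bpf_get_prandom_u32();      /* unknown scalar, say r1     */
                __u32 copy = idx;                       /* r2 = r1: same id           */

                idx += 1;                               /* r1 += 1: delta +1 recorded */
                if (copy < 1000)                        /* range learned on r2 ...    */
                        return buf[idx];                /* ... reaches r1 via the
                                                         * delta, so idx <= 1000 and
                                                         * the access is in bounds    */
                return 0;
        }

        char LICENSE[] SEC("license") = "GPL";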
*/ if (env->bpf_capable && - BPF_OP(insn->code) == BPF_ADD && !alu32 && - dst_reg->id && is_reg_const(src_reg, false)) { - u64 val = reg_const_value(src_reg, false); + (BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) && + dst_reg->id && is_reg_const(src_reg, alu32)) { + u64 val = reg_const_value(src_reg, alu32); + s32 off; + + if (!alu32 && ((s64)val < S32_MIN || (s64)val > S32_MAX)) + goto clear_id; + + if (alu32 && (dst_umax > U32_MAX)) + goto clear_id; - if ((dst_reg->id & BPF_ADD_CONST) || - /* prevent overflow in sync_linked_regs() later */ - val > (u32)S32_MAX) { + off = (s32)val; + + if (BPF_OP(insn->code) == BPF_SUB) { + /* Negating S32_MIN would overflow */ + if (off == S32_MIN) + goto clear_id; + off = -off; + } + + if (dst_reg->id & BPF_ADD_CONST) { /* * If the register already went through rX += val * we cannot accumulate another val into rx->off. */ +clear_id: dst_reg->off = 0; dst_reg->id = 0; } else { - dst_reg->id |= BPF_ADD_CONST; - dst_reg->off = val; + if (alu32) + dst_reg->id |= BPF_ADD_CONST32; + else + dst_reg->id |= BPF_ADD_CONST64; + dst_reg->off = off; } } else { /* @@ -15782,7 +16311,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - if (opcode == BPF_NEG && + if ((opcode == BPF_NEG || opcode == BPF_END) && regs[insn->dst_reg].type == SCALAR_VALUE) { err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); err = err ?: adjust_scalar_min_max_vals(env, insn, @@ -16802,8 +17331,8 @@ static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id, /* For all R in linked_regs, copy known_reg range into R * if R->id == known_reg->id. */ -static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg, - struct linked_regs *linked_regs) +static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate, + struct bpf_reg_state *known_reg, struct linked_regs *linked_regs) { struct bpf_reg_state fake_reg; struct bpf_reg_state *reg; @@ -16827,23 +17356,32 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s } else { s32 saved_subreg_def = reg->subreg_def; s32 saved_off = reg->off; + u32 saved_id = reg->id; fake_reg.type = SCALAR_VALUE; - __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off); + __mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off); /* reg = known_reg; reg += delta */ copy_register_state(reg, known_reg); /* - * Must preserve off, id and add_const flag, + * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. 
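The BPF_ADD/BPF_SUB handling above extends linked-register tracking so that a constant delta between two scalars is remembered and re-applied once one of them is narrowed by a conditional jump. The s32 clamp on val (and the special case for negating S32_MIN) keeps that re-derivation overflow-free. A rough, illustrative-only model of what the later sync step re-computes for the linked register (the names here are not verifier API):

#include <stdint.h>

/*
 * Pseudo-BPF pattern being tracked:
 *   r1 = r2        ; r1 and r2 share an id
 *   r1 += 16       ; remember delta = 16 instead of dropping the id
 *   if r2 < 1000 goto out
 *   ...            ; r1 is now known to be < 1016
 */
struct bounds { uint64_t umin, umax; };

static struct bounds linked_after_branch(struct bounds r2, int32_t delta)
{
	struct bounds r1 = r2;

	r1.umin += (uint64_t)delta;	/* modular; the verifier keeps delta */
	r1.umax += (uint64_t)delta;	/* within s32 to stay exact here     */
	return r1;
}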
*/ reg->off = saved_off; + reg->id = saved_id; reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); + if (known_reg->id & BPF_ADD_CONST32) + zext_32_to_64(reg); + reg_bounds_sync(reg); } + if (e->is_reg) + mark_reg_scratched(env, e->regno); + else + mark_stack_slot_scratched(env, e->spi); } } @@ -17030,13 +17568,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - sync_linked_regs(this_branch, src_reg, &linked_regs); - sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs); + sync_linked_regs(env, this_branch, src_reg, &linked_regs); + sync_linked_regs(env, other_branch, &other_branch_regs[insn->src_reg], + &linked_regs); } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { - sync_linked_regs(this_branch, dst_reg, &linked_regs); - sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs); + sync_linked_regs(env, this_branch, dst_reg, &linked_regs); + sync_linked_regs(env, other_branch, &other_branch_regs[insn->dst_reg], + &linked_regs); } /* if one pointer register is compared to another pointer @@ -17411,6 +17951,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -17693,6 +18234,10 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) switch (imm) { #ifdef CONFIG_X86_64 case BPF_FUNC_get_smp_processor_id: +#ifdef CONFIG_SMP + case BPF_FUNC_get_current_task_btf: + case BPF_FUNC_get_current_task: +#endif return env->prog->jit_requested && bpf_jit_supports_percpu_insn(); #endif default: @@ -17737,7 +18282,7 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call if (bpf_pseudo_kfunc_call(call)) { int err; - err = fetch_kfunc_meta(env, call, &meta, NULL); + err = fetch_kfunc_arg_meta(env, call->imm, call->off, &meta); if (err < 0) /* error would be reported later */ return false; @@ -18245,7 +18790,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; - ret = fetch_kfunc_meta(env, insn, &meta, NULL); + ret = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); if (ret == 0 && is_iter_next_kfunc(&meta)) { mark_prune_point(env, t); /* Checking and saving state checkpoints at iter_next() call @@ -18948,30 +19493,49 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) if (old_id == 0) /* cur_id == 0 as well */ return true; - for (i = 0; i < BPF_ID_MAP_SIZE; i++) { - if (!map[i].old) { - /* Reached an empty slot; haven't seen this id before */ - map[i].old = old_id; - map[i].cur = cur_id; - return true; - } + for (i = 0; i < idmap->cnt; i++) { if (map[i].old == old_id) return map[i].cur == cur_id; if (map[i].cur == cur_id) return false; } + + /* Reached the end of known mappings; haven't seen this id before */ + if (idmap->cnt < BPF_ID_MAP_SIZE) { + map[idmap->cnt].old = old_id; + map[idmap->cnt].cur = cur_id; + idmap->cnt++; + return true; + } + /* We ran out of idmap slots, which should be impossible */ WARN_ON_ONCE(1); 
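check_ids() above now grows the mapping incrementally via idmap->cnt instead of scanning a zero-initialized array, but the invariant it enforces is unchanged: the relation between old and current ids must stay one-to-one. A compact stand-alone sketch of that invariant (illustrative only; the struct names are not the kernel's):

#include <stdbool.h>
#include <stdint.h>

#define MAP_SIZE 64

struct id_pair { uint32_t old, cur; };

struct id_map {
	struct id_pair pair[MAP_SIZE];
	unsigned int cnt;
};

/* Accept only if 'old' always maps to the same 'cur' and no other
 * 'old' has already claimed that 'cur' - i.e. the mapping is a bijection.
 */
static bool ids_match(struct id_map *m, uint32_t old, uint32_t cur)
{
	unsigned int i;

	for (i = 0; i < m->cnt; i++) {
		if (m->pair[i].old == old)
			return m->pair[i].cur == cur;
		if (m->pair[i].cur == cur)
			return false;
	}
	if (m->cnt == MAP_SIZE)
		return false;
	m->pair[m->cnt].old = old;
	m->pair[m->cnt].cur = cur;
	m->cnt++;
	return true;
}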
return false; } -/* Similar to check_ids(), but allocate a unique temporary ID - * for 'old_id' or 'cur_id' of zero. - * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid. +/* + * Compare scalar register IDs for state equivalence. + * + * When old_id == 0, the old register is independent - not linked to any + * other register. Any linking in the current state only adds constraints, + * making it more restrictive. Since the old state didn't rely on any ID + * relationships for this register, it's always safe to accept cur regardless + * of its ID. Hence, return true immediately. + * + * When old_id != 0 but cur_id == 0, we need to ensure that different + * independent registers in cur don't incorrectly satisfy the ID matching + * requirements of linked registers in old. + * + * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0 + * and r7.id=0 (both independent), without temp IDs both would map old_id=X + * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map + * X->temp2, but X is already mapped to temp1, so the check fails correctly. */ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) { - old_id = old_id ? old_id : ++idmap->tmp_id_gen; + if (!old_id) + return true; + cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen; return check_ids(old_id, cur_id, idmap); @@ -19045,6 +19609,72 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * doesn't meant that the states are DONE. The verifier has to compare * the callsites */ + +/* Find id in idset and increment its count, or add new entry */ +static void idset_cnt_inc(struct bpf_idset *idset, u32 id) +{ + u32 i; + + for (i = 0; i < idset->num_ids; i++) { + if (idset->entries[i].id == id) { + idset->entries[i].cnt++; + return; + } + } + /* New id */ + if (idset->num_ids < BPF_ID_MAP_SIZE) { + idset->entries[idset->num_ids].id = id; + idset->entries[idset->num_ids].cnt = 1; + idset->num_ids++; + } +} + +/* Find id in idset and return its count, or 0 if not found */ +static u32 idset_cnt_get(struct bpf_idset *idset, u32 id) +{ + u32 i; + + for (i = 0; i < idset->num_ids; i++) { + if (idset->entries[i].id == id) + return idset->entries[i].cnt; + } + return 0; +} + +/* + * Clear singular scalar ids in a state. + * A register with a non-zero id is called singular if no other register shares + * the same base id. Such registers can be treated as independent (id=0). 
+ */ +static void clear_singular_ids(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_idset *idset = &env->idset_scratch; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + + idset->num_ids = 0; + + bpf_for_each_reg_in_vstate(st, func, reg, ({ + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->id) + continue; + idset_cnt_inc(idset, reg->id & ~BPF_ADD_CONST); + })); + + bpf_for_each_reg_in_vstate(st, func, reg, ({ + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->id) + continue; + if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) { + reg->id = 0; + reg->off = 0; + } + })); +} + static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { @@ -19091,11 +19721,9 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (exact == EXACT) return regs_exact(rold, rcur, idmap); - if (rold->type == NOT_INIT) { - if (exact == NOT_EXACT || rcur->type == NOT_INIT) - /* explored state can't have used this */ - return true; - } + if (rold->type == NOT_INIT) + /* explored state can't have used this */ + return true; /* Enforce that register types have to match exactly, including their * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general @@ -19132,11 +19760,21 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } if (!rold->precise && exact == NOT_EXACT) return true; - if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST)) - return false; - if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off)) - return false; - /* Why check_ids() for scalar registers? + /* + * Linked register tracking uses rold->id to detect relationships. + * When rold->id == 0, the register is independent and any linking + * in rcur only adds constraints. When rold->id != 0, we must verify + * id mapping and (for BPF_ADD_CONST) offset consistency. + * + * +------------------+-----------+------------------+---------------+ + * | | rold->id | rold + ADD_CONST | rold->id == 0 | + * |------------------+-----------+------------------+---------------| + * | rcur->id | range,ids | false | range | + * | rcur + ADD_CONST | false | range,ids,off | range | + * | rcur->id == 0 | range,ids | false | range | + * +------------------+-----------+------------------+---------------+ + * + * Why check_ids() for scalar registers? * * Consider the following BPF code: * 1: r6 = ... unbound scalar, ID=a ... @@ -19160,9 +19798,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * --- * Also verify that new value satisfies old value range knowledge. 
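Clearing a singular scalar id is safe because an id only carries information when at least two registers share it (for range propagation at branches and for state-equivalence id matching); a lone holder gains nothing from it and only makes later state comparison stricter. A quadratic but self-contained sketch of the same two-step idea (the kernel version above uses the idset scratch area and also masks off the BPF_ADD_CONST bits):

#include <stdint.h>

#define NR_REGS 11

struct scalar { uint32_t id; };

static void drop_singular_ids(struct scalar *regs, int n)
{
	int i, j;

	for (i = 0; i < n; i++) {
		int shared = 0;

		if (!regs[i].id)
			continue;
		for (j = 0; j < n; j++)
			shared += (regs[j].id == regs[i].id);
		if (shared == 1)
			regs[i].id = 0;	/* not linked to anything */
	}
}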
*/ - return range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off) && - check_scalar_ids(rold->id, rcur->id, idmap); + + /* ADD_CONST mismatch: different linking semantics */ + if ((rold->id & BPF_ADD_CONST) && !(rcur->id & BPF_ADD_CONST)) + return false; + + if (rold->id && !(rold->id & BPF_ADD_CONST) && (rcur->id & BPF_ADD_CONST)) + return false; + + /* Both have offset linkage: offsets must match */ + if ((rold->id & BPF_ADD_CONST) && rold->off != rcur->off) + return false; + + if (!check_scalar_ids(rold->id, rcur->id, idmap)) + return false; + + return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: @@ -19264,7 +19915,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; - if (exact != NOT_EXACT && + if (exact == EXACT && (i >= cur->allocated_stack || old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE])) @@ -19470,8 +20121,10 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat static void reset_idmap_scratch(struct bpf_verifier_env *env) { - env->idmap_scratch.tmp_id_gen = env->id_gen; - memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); + struct bpf_idmap *idmap = &env->idmap_scratch; + + idmap->tmp_id_gen = env->id_gen; + idmap->cnt = 0; } static bool states_equal(struct bpf_verifier_env *env, @@ -19835,8 +20488,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) } } if (bpf_calls_callback(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) + if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { + loop = true; goto hit; + } goto skip_inf_loop_check; } /* attempt to detect infinite loop to avoid unnecessary doomed work */ @@ -20041,6 +20696,8 @@ miss: if (env->bpf_capable) mark_all_scalars_imprecise(env, cur); + clear_singular_ids(env, cur); + /* add new state to the head of linked list */ new = &new_sl->state; err = copy_verifier_state(new, cur); @@ -20611,17 +21268,19 @@ static int do_check(struct bpf_verifier_env *env) * may skip a nospec patched-in after the jump. This can * currently never happen because nospec_result is only * used for the write-ops - * `*(size*)(dst_reg+off)=src_reg|imm32` which must - * never skip the following insn. Still, add a warning - * to document this in case nospec_result is used - * elsewhere in the future. + * `*(size*)(dst_reg+off)=src_reg|imm32` and helper + * calls. These must never skip the following insn + * (i.e., bpf_insn_successors()'s opcode_info.can_jump + * is false). Still, add a warning to document this in + * case nospec_result is used elsewhere in the future. * * All non-branch instructions have a single * fall-through edge. For these, nospec_result should * already work. */ - if (verifier_bug_if(BPF_CLASS(insn->code) == BPF_JMP || - BPF_CLASS(insn->code) == BPF_JMP32, env, + if (verifier_bug_if((BPF_CLASS(insn->code) == BPF_JMP || + BPF_CLASS(insn->code) == BPF_JMP32) && + BPF_OP(insn->code) != BPF_CALL, env, "speculation barrier after jump instruction may not have the desired effect")) return -EFAULT; process_bpf_exit: @@ -20660,12 +21319,7 @@ static int find_btf_percpu_datasec(struct btf *btf) * types to look at only module's own BTF types. 
*/ n = btf_nr_types(btf); - if (btf_is_module(btf)) - i = btf_nr_types(btf_vmlinux); - else - i = 1; - - for(; i < n; i++) { + for (i = btf_named_start_id(btf, true); i < n; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) continue; @@ -20890,20 +21544,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (btf_record_has_field(map->record, BPF_TIMER)) { - if (is_tracing_prog_type(prog_type)) { - verbose(env, "tracing progs cannot use bpf_timer yet\n"); - return -EINVAL; - } - } - - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) { - if (is_tracing_prog_type(prog_type)) { - verbose(env, "tracing progs cannot use bpf_wq yet\n"); - return -EINVAL; - } - } - if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); @@ -20935,6 +21575,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_ARENA: case BPF_MAP_TYPE_INSN_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: break; default: verbose(env, @@ -21141,11 +21782,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } else { u32 off = insn[1].imm; - if (off >= BPF_MAX_VAR_OFF) { - verbose(env, "direct value offset of %u is not allowed\n", off); - return -EINVAL; - } - if (!map->ops->map_direct_value_addr) { verbose(env, "no direct value access support for this map type\n"); return -EINVAL; @@ -22446,6 +23082,12 @@ static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { if (!env->insn_aux_data[insn_idx].non_sleepable) addr = (unsigned long)bpf_dynptr_from_file_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_free_pages_non_sleepable; } desc->addr = addr; return 0; @@ -22498,8 +23140,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (!bpf_jit_supports_far_kfunc_call()) insn->imm = BPF_CALL_IMM(desc->addr); - if (insn->off) - return 0; + if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; @@ -22565,6 +23206,36 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline the bpf_session_is_return() for fsession: + * bool bpf_session_is_return(void *ctx) + * { + * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT); + insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); + *cnt = 3; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline bpf_session_cookie() for 
fsession: + * __u64 *bpf_session_cookie(void *ctx) + * { + * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF; + * return &((u64 *)ctx)[-off]; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT); + insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1); + insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0); + *cnt = 6; } if (env->insn_aux_data[insn_idx].arg_prog) { @@ -23278,21 +23949,48 @@ patch_map_ops_generic: insn = new_prog->insnsi + i + delta; goto next_insn; } + + /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. */ + if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) && + verifier_inlines_helper_call(env, insn->imm)) { + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)¤t_task); + insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } #endif /* Implement bpf_get_func_arg inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg) { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); - insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); - insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); - insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); - insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); - insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); - insn_buf[7] = BPF_JMP_A(1); - insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); - cnt = 9; + if (eatype == BPF_TRACE_RAW_TP) { + int nr_args = btf_type_vlen(prog->aux->attach_func_proto); + + /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + cnt = 1; + } else { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + cnt = 2; + } + insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); + insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); + insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0); + insn_buf[cnt++] = BPF_JMP_A(1); + insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) @@ -23308,15 +24006,17 @@ patch_map_ops_generic: if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_TRACE_FSESSION || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); - insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); - insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); - insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); - insn_buf[5] = 
BPF_MOV64_IMM(BPF_REG_0, 0); - cnt = 6; + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); + insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); + cnt = 7; } else { insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP); cnt = 1; @@ -23335,13 +24035,24 @@ patch_map_ops_generic: /* Implement get_func_arg_cnt inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg_cnt) { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + if (eatype == BPF_TRACE_RAW_TP) { + int nr_args = btf_type_vlen(prog->aux->attach_func_proto); - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); + /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + cnt = 1; + } else { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + cnt = 2; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; + delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; @@ -24252,7 +24963,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog->type == BPF_PROG_TYPE_TRACING && prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || - tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance @@ -24267,7 +24979,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * beyond reasonable stack size. Hence extending fentry * is not allowed. 
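The inlined bpf_get_func_ret()/bpf_get_func_arg_cnt() sequences earlier in this hunk now mask ctx[-1] with 0xFF because, with fsession, the upper bits of that slot carry the is-return flag and cookie index (see BPF_TRAMP_IS_RETURN_SHIFT / BPF_TRAMP_COOKIE_INDEX_SHIFT above); only the low byte is the argument count. An illustrative C model of what the bpf_get_func_ret() insn_buf[] computes (not a kernel interface):

#include <stdint.h>

static int get_func_ret_model(const uint64_t *ctx, uint64_t *value)
{
	uint64_t nr_args = ctx[-1] & 0xff;	/* low byte = argument count */

	*value = ctx[nr_args];			/* return value follows the args */
	return 0;
}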
*/ - bpf_log(log, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit/fsession\n"); return -EINVAL; } } else { @@ -24351,6 +25063,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: + if (prog->expected_attach_type == BPF_TRACE_FSESSION && + !bpf_jit_supports_fsession()) { + bpf_log(log, "JIT does not support fsession\n"); + return -EOPNOTSUPP; + } if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -24517,6 +25235,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: + case BPF_TRACE_FSESSION: return true; default: return false; @@ -24598,9 +25317,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) tgt_info.tgt_name); return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_TRACE_FSESSION || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { - verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n", + verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", tgt_info.tgt_name); return -EINVAL; } @@ -24809,6 +25529,12 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env, case BPF_JMP32: switch (code) { case BPF_JA: + def = 0; + if (BPF_SRC(insn->code) == BPF_X) + use = dst; + else + use = 0; + break; case BPF_JCOND: def = 0; use = 0; @@ -25076,15 +25802,18 @@ dfs_continue: } /* * Assign SCC number only if component has two or more elements, - * or if component has a self reference. + * or if component has a self reference, or if instruction is a + * callback calling function (implicit loop). */ - assign_scc = stack[stack_sz - 1] != w; - for (j = 0; j < succ->cnt; ++j) { + assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */ + for (j = 0; j < succ->cnt; ++j) { /* self reference? */ if (succ->items[j] == w) { assign_scc = true; break; } } + if (bpf_calls_callback(env, w)) /* implicit loop? 
*/ + assign_scc = true; /* Pop component elements from stack */ do { t = stack[--stack_sz]; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 22051b4f1ccb..3bfe37693d68 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -52,7 +52,7 @@ struct cgroup_fs_context { bool cpuset_clone_children; bool none; /* User explicitly requested empty subsystem */ bool all_ss; /* Seen 'all' option */ - u16 subsys_mask; /* Selected subsystems */ + u32 subsys_mask; /* Selected subsystems */ char *name; /* Hierarchy name */ char *release_agent; /* Path for release notifications */ }; @@ -146,7 +146,7 @@ struct cgroup_mgctx { struct cgroup_taskset tset; /* subsystems affected by migration */ - u16 ss_mask; + u32 ss_mask; }; #define CGROUP_TASKSET_INIT(tset) \ @@ -235,8 +235,8 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup_favor_dynmods(struct cgroup_root *root, bool favor); void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_fs_context *ctx); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask); +int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask); int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index a9e029b570c8..724950c4b690 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -28,7 +28,7 @@ #define CGROUP_PIDLIST_DESTROY_DELAY HZ /* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; +static u32 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; @@ -1037,13 +1037,13 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) static int check_cgroupfs_options(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); - u16 mask = U16_MAX; - u16 enabled = 0; + u32 mask = U32_MAX; + u32 enabled = 0; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); + mask = ~((u32)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && @@ -1095,7 +1095,7 @@ int cgroup1_reconfigure(struct fs_context *fc) struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); int ret = 0; - u16 added_mask, removed_mask; + u32 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); @@ -1343,7 +1343,7 @@ static int __init cgroup_no_v1(char *str) continue; if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; + cgroup_no_v1_mask = U32_MAX; continue; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5f0d33b04910..8af4351536cf 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -203,13 +203,13 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ -static u16 cgrp_dfl_inhibit_ss_mask; +static u32 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ -static u16 cgrp_dfl_implicit_ss_mask; +static u32 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ -static u16 cgrp_dfl_threaded_ss_mask; +static u32 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy 
roots */ LIST_HEAD(cgroup_roots); @@ -231,10 +231,10 @@ static u64 css_serial_nr_next = 1; * These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. */ -static u16 have_fork_callback __read_mostly; -static u16 have_exit_callback __read_mostly; -static u16 have_release_callback __read_mostly; -static u16 have_canfork_callback __read_mostly; +static u32 have_fork_callback __read_mostly; +static u32 have_exit_callback __read_mostly; +static u32 have_release_callback __read_mostly; +static u32 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); @@ -472,13 +472,13 @@ static bool cgroup_is_valid_domain(struct cgroup *cgrp) } /* subsystems visibly enabled on a cgroup */ -static u16 cgroup_control(struct cgroup *cgrp) +static u32 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); - u16 root_ss_mask = cgrp->root->subsys_mask; + u32 root_ss_mask = cgrp->root->subsys_mask; if (parent) { - u16 ss_mask = parent->subtree_control; + u32 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) @@ -493,12 +493,12 @@ static u16 cgroup_control(struct cgroup *cgrp) } /* subsystems enabled on a cgroup */ -static u16 cgroup_ss_mask(struct cgroup *cgrp) +static u32 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { - u16 ss_mask = parent->subtree_ss_mask; + u32 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) @@ -1633,9 +1633,9 @@ static umode_t cgroup_file_mode(const struct cftype *cft) * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. 
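Widening the controller masks from u16 to u32 lifts the 16-controller ceiling (the BUILD_BUG_ON later in this diff is raised to 32 accordingly); the mask algorithms themselves are unchanged. For instance, cgroup_calc_subtree_ss_mask() is still a fixpoint closure over ->depends_on. An illustrative sketch of that closure over a 32-bit mask, with depends_on[] standing in for each subsystem's ->depends_on field:

#include <stdint.h>

#define MAX_SUBSYS 32

static uint32_t close_over_deps(uint32_t mask,
				const uint32_t depends_on[MAX_SUBSYS])
{
	for (;;) {
		uint32_t new_mask = mask;
		int ssid;

		/* pull in hard dependencies of every enabled controller */
		for (ssid = 0; ssid < MAX_SUBSYS; ssid++)
			if (mask & (1u << ssid))
				new_mask |= depends_on[ssid];

		if (new_mask == mask)
			return mask;	/* fixpoint reached */
		mask = new_mask;
	}
}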
*/ -static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) +static u32 cgroup_calc_subtree_ss_mask(u32 subtree_control, u32 this_ss_mask) { - u16 cur_ss_mask = subtree_control; + u32 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; @@ -1644,7 +1644,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { - u16 new_ss_mask = cur_ss_mask; + u32 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; @@ -1848,12 +1848,12 @@ err: return ret; } -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, ret; - u16 dfl_disable_ss_mask = 0; + u32 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -2149,7 +2149,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -3131,7 +3131,7 @@ void cgroup_procs_write_finish(struct task_struct *task, put_task_struct(task); } -static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, u32 ss_mask) { struct cgroup_subsys *ss; bool printed = false; @@ -3496,9 +3496,9 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) cgroup_apply_control_disable(cgrp); } -static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) +static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u32 enable) { - u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; + u32 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) @@ -3541,7 +3541,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - u16 enable = 0, disable = 0; + u32 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -4945,7 +4945,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css) rcu_read_lock(); css_for_each_child(child, css) { - if (child->flags & CSS_ONLINE) { + if (css_is_online(child)) { ret = true; break; } @@ -5750,7 +5750,7 @@ static void offline_css(struct cgroup_subsys_state *css) lockdep_assert_held(&cgroup_mutex); - if (!(css->flags & CSS_ONLINE)) + if (!css_is_online(css)) return; if (ss->css_offline) @@ -6347,7 +6347,7 @@ int __init cgroup_init(void) struct cgroup_subsys *ss; int ssid; - BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 01976c8e7d49..fd7d19842ded 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -9,6 +9,7 @@ #include <linux/cpuset.h> #include <linux/spinlock.h> #include <linux/union_find.h> +#include <linux/sched/isolation.h> /* See "Frequency meter" comments, below. 
*/ @@ -144,17 +145,12 @@ struct cpuset { */ nodemask_t old_mems_allowed; - struct fmeter fmeter; /* memory_pressure filter */ - /* * Tasks are being attached to this cpuset. Used to prevent * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). */ int attach_in_progress; - /* for custom sched domain */ - int relax_domain_level; - /* partition root state */ int partition_root_state; @@ -179,10 +175,19 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; +#ifdef CONFIG_CPUSETS_V1 + struct fmeter fmeter; /* memory_pressure filter */ + + /* for custom sched domain */ + int relax_domain_level; + /* Used to merge intersecting subsets for generate_sched_domains */ struct uf_node node; +#endif }; +extern struct cpuset top_cpuset; + static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cpuset, css) : NULL; @@ -240,6 +245,30 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +/* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? + */ +static inline int cpusets_overlap(struct cpuset *a, struct cpuset *b) +{ + return cpumask_intersects(a->effective_cpus, b->effective_cpus); +} + +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_cpuset_lock_held(); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; +} + /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child @@ -285,7 +314,6 @@ void cpuset_full_unlock(void); */ #ifdef CONFIG_CPUSETS_V1 extern struct cftype cpuset1_files[]; -void fmeter_init(struct fmeter *fmp); void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk); void cpuset1_update_tasks_flags(struct cpuset *cs); @@ -293,8 +321,13 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2); +void cpuset1_init(struct cpuset *cs); +void cpuset1_online_css(struct cgroup_subsys_state *css); +int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes); + #else -static inline void fmeter_init(struct fmeter *fmp) {} static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) {} static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {} @@ -303,6 +336,13 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, bool cpus_updated, bool mems_updated) {} static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } +static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, + struct cpuset *cs2) { return false; } +static inline void cpuset1_init(struct cpuset *cs) {} +static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} +static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) { return 0; }; + #endif /* CONFIG_CPUSETS_V1 */ #endif /* __CPUSET_INTERNAL_H */ diff --git 
a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 12e76774c75b..7a23b9e8778f 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -62,7 +62,7 @@ struct cpuset_remove_tasks_struct { #define FM_SCALE 1000 /* faux fixed point scale */ /* Initialize a frequency meter */ -void fmeter_init(struct fmeter *fmp) +static void fmeter_init(struct fmeter *fmp) { fmp->cnt = 0; fmp->val = 0; @@ -368,11 +368,44 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) if (par && !is_cpuset_subset(trial, par)) goto out; + /* + * Cpusets with tasks - existing or newly being attached - can't + * be changed to have empty cpus_allowed or mems_allowed. + */ + ret = -ENOSPC; + if (cpuset_is_populated(cur)) { + if (!cpumask_empty(cur->cpus_allowed) && + cpumask_empty(trial->cpus_allowed)) + goto out; + if (!nodes_empty(cur->mems_allowed) && + nodes_empty(trial->mems_allowed)) + goto out; + } + ret = 0; out: return ret; } +/* + * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts + * to legacy (v1) + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect. + */ +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return cpumask_intersects(cs1->cpus_allowed, + cs2->cpus_allowed); + + return false; +} + #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() @@ -499,6 +532,242 @@ out_unlock: return retval; } +void cpuset1_init(struct cpuset *cs) +{ + fmeter_init(&cs->fmeter); + cs->relax_domain_level = -1; +} + +void cpuset1_online_css(struct cgroup_subsys_state *css) +{ + struct cpuset *tmp_cs; + struct cgroup_subsys_state *pos_css; + struct cpuset *cs = css_cs(css); + struct cpuset *parent = parent_cs(cs); + + lockdep_assert_cpus_held(); + lockdep_assert_cpuset_lock_held(); + + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); + + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) + return; + + /* + * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is + * set. This flag handling is implemented in cgroup core for + * historical reasons - the flag may be specified during mount. + * + * Currently, if any sibling cpusets have exclusive cpus or mem, we + * refuse to clone the configuration - thereby refusing the task to + * be entered, and as a result refusing the sys_unshare() or + * clone() which initiated it. If this becomes a problem for some + * users who wish to allow that scenario, then this could be + * changed to grant parent->cpus_allowed-sibling_cpus_exclusive + * (and likewise for mems) to the new cgroup. 
+ */ + rcu_read_lock(); + cpuset_for_each_child(tmp_cs, pos_css, parent) { + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + cpuset_callback_lock_irq(); + cs->mems_allowed = parent->mems_allowed; + cs->effective_mems = parent->mems_allowed; + cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); + cpuset_callback_unlock_irq(); +} + +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; +} + +static void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) +{ + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + /* skip the whole subtree if @cp doesn't have any CPU */ + if (cpumask_empty(cp->cpus_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + } + rcu_read_unlock(); +} + +/* + * cpuset1_generate_sched_domains() + * + * Finding the best partition (set of domains): + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). + */ +int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) +{ + struct cpuset *cp; /* top-down scan of cpusets */ + struct cpuset **csa; /* array of all cpuset ptrs */ + int csn; /* how many cpuset ptrs in csa so far */ + int i, j; /* indices for partition finding loops */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ + int ndoms = 0; /* number of sched domains in result */ + int nslot; /* next empty doms[] struct cpumask slot */ + struct cgroup_subsys_state *pos_css; + int nslot_update; + + lockdep_assert_cpuset_lock_held(); + + doms = NULL; + dattr = NULL; + csa = NULL; + + /* Special case for the 99% of systems with one, full, sched domain */ + if (is_sched_load_balance(&top_cpuset)) { + ndoms = 1; + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr_tree(dattr, &top_cpuset); + } + cpumask_and(doms[0], top_cpuset.effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + + goto done; + } + + csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); + if (!csa) + goto done; + csn = 0; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (cp == &top_cpuset) + continue; + + /* + * Continue traversing beyond @cp iff @cp has some CPUs and + * isn't load balancing. The former is obvious. The + * latter: All child cpusets contain a subset of the + * parent's cpus, so just skip them, and then we call + * update_domain_attr_tree() to calc relax_domain_level of + * the corresponding sched domain. 
+ */ + if (!cpumask_empty(cp->cpus_allowed) && + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, + housekeeping_cpumask(HK_TYPE_DOMAIN)))) + continue; + + if (is_sched_load_balance(cp) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; + + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + } + rcu_read_unlock(); + + for (i = 0; i < csn; i++) + uf_node_init(&csa[i]->node); + + /* Merge overlapping cpusets */ + for (i = 0; i < csn; i++) { + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) + uf_union(&csa[i]->node, &csa[j]->node); + } + } + + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + + /* + * Now we know how many domains to create. + * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. + */ + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ + dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), + GFP_KERNEL); + + for (nslot = 0, i = 0; i < csn; i++) { + nslot_update = 0; + for (j = i; j < csn; j++) { + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; + + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); + if (dattr) + update_domain_attr_tree(dattr + nslot, csa[j]); + } + } + if (nslot_update) + nslot++; + } + BUG_ON(nslot != ndoms); + +done: + kfree(csa); + + /* + * Fallback to the default domain if kmalloc() failed. + * See comments in partition_sched_domains(). + */ + if (doms == NULL) + ndoms = 1; + + *domains = doms; + *attributes = dattr; + return ndoms; +} + /* * for the common functions, 'private' gives the type of file */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c06e2e96f79d..832179236529 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -26,7 +26,6 @@ #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/memory.h> -#include <linux/export.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/deadline.h> @@ -86,12 +85,6 @@ static cpumask_var_t isolated_cpus; static bool isolated_cpus_updating; /* - * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot - */ -static cpumask_var_t boot_hk_cpus; -static bool have_boot_isolcpus; - -/* * A flag to force sched domain rebuild at the end of an operation. * It can be set in * - update_partition_sd_lb() @@ -126,6 +119,17 @@ static bool force_sd_rebuild; * For simplicity, a local partition can be created under a local or remote * partition but a remote partition cannot have any partition root in its * ancestor chain except the cgroup root. + * + * A valid partition can be formed by setting exclusive_cpus or cpus_allowed + * if exclusive_cpus is not set. In the case of partition with empty + * exclusive_cpus, all the conflicting exclusive CPUs specified in the + * following cpumasks of sibling cpusets will be removed from its + * cpus_allowed in determining its effective_xcpus. + * - effective_xcpus + * - exclusive_cpus + * + * The "cpuset.cpus.exclusive" control file should be used for setting up + * partition if the users want to get as many CPUs as possible. 
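cpuset1_generate_sched_domains() keeps the legacy algorithm: every load-balanced cpuset is a node, nodes whose effective CPUs overlap are merged with union-find, and each resulting component becomes one sched domain. A toy, self-contained sketch of the counting step (the kernel uses <linux/union_find.h>, not this hand-rolled version):

/* Toy union-find with path halving. */
static int find(int *parent, int x)
{
	while (parent[x] != x)
		x = parent[x] = parent[parent[x]];
	return x;
}

/* One sched domain per connected component of the "effective cpus
 * overlap" relation between load-balanced cpusets.
 */
static int count_domains(int n, int (*overlaps)(int, int), int *parent)
{
	int i, j, ndoms = 0;

	for (i = 0; i < n; i++)
		parent[i] = i;
	for (i = 0; i < n; i++)
		for (j = i + 1; j < n; j++)
			if (overlaps(i, j))
				parent[find(parent, i)] = find(parent, j);
	for (i = 0; i < n; i++)
		if (find(parent, i) == i)
			ndoms++;
	return ndoms;
}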
*/ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -208,12 +212,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * If cpu_online_mask is used while a hotunplug operation is happening in * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ -static struct cpuset top_cpuset = { +struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, - .relax_domain_level = -1, - .remote_partition = false, }; /* @@ -268,6 +270,11 @@ void cpuset_unlock(void) mutex_unlock(&cpuset_mutex); } +void lockdep_assert_cpuset_lock_held(void) +{ + lockdep_assert_held(&cpuset_mutex); +} + /** * cpuset_full_lock - Acquire full protection for cpuset modification * @@ -286,6 +293,13 @@ void cpuset_full_unlock(void) cpus_read_unlock(); } +#ifdef CONFIG_LOCKDEP +bool lockdep_is_cpuset_held(void) +{ + return lockdep_is_held(&cpuset_mutex); +} +#endif + static DEFINE_SPINLOCK(callback_lock); void cpuset_callback_lock_irq(void) @@ -319,7 +333,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) */ static inline void dec_attach_in_progress_locked(struct cpuset *cs) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); cs->attach_in_progress--; if (!cs->attach_in_progress) @@ -353,15 +367,6 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } -static inline bool cpuset_is_populated(struct cpuset *cs) -{ - lockdep_assert_held(&cpuset_mutex); - - /* Cpusets in the process of attaching should be considered as populated */ - return cgroup_is_populated(cs->css.cgroup) || - cs->attach_in_progress; -} - /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked @@ -453,9 +458,8 @@ static void guarantee_active_cpus(struct task_struct *tsk, */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { - while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) + while (!nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY])) cs = parent_cs(cs); - nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } /** @@ -603,36 +607,32 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) /** * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts - * @cs1: first cpuset to check - * @cs2: second cpuset to check + * @trial: the trial cpuset to be checked + * @sibling: a sibling cpuset to be checked against + * @xcpus_changed: set if exclusive_cpus has been set * * Returns: true if CPU exclusivity conflict exists, false otherwise * * Conflict detection rules: - * 1. If either cpuset is CPU exclusive, they must be mutually exclusive - * 2. exclusive_cpus masks cannot intersect between cpusets - * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + * o cgroup v1 + * See cpuset1_cpus_excl_conflict() + * o cgroup v2 + * - The exclusive_cpus values cannot overlap. + * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed. 
*/ -static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling, + bool xcpus_changed) { - /* If either cpuset is exclusive, check if they are mutually exclusive */ - if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) - return !cpusets_are_exclusive(cs1, cs2); - - /* Exclusive_cpus cannot intersect */ - if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) - return true; - - /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ - if (!cpumask_empty(cs1->cpus_allowed) && - cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) - return true; + if (!cpuset_v2()) + return cpuset1_cpus_excl_conflict(trial, sibling); - if (!cpumask_empty(cs2->cpus_allowed) && - cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */ + if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) && + cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus)) return true; - return false; + /* Exclusive_cpus cannot intersect */ + return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus); } static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) @@ -666,6 +666,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; + bool xcpus_changed; int ret = 0; rcu_read_lock(); @@ -682,20 +683,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); /* - * Cpusets with tasks - existing or newly being attached - can't - * be changed to have empty cpus_allowed or mems_allowed. - */ - ret = -ENOSPC; - if (cpuset_is_populated(cur)) { - if (!cpumask_empty(cur->cpus_allowed) && - cpumask_empty(trial->cpus_allowed)) - goto out; - if (!nodes_empty(cur->mems_allowed) && - nodes_empty(trial->mems_allowed)) - goto out; - } - - /* * We can't shrink if we won't have enough room for SCHED_DEADLINE * tasks. This check is not done when scheduling is disabled as the * users should know what they are doing. @@ -722,10 +709,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; + xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus); cpuset_for_each_child(c, css, par) { if (c == cur) continue; - if (cpus_excl_conflict(trial, c)) + if (cpus_excl_conflict(trial, c, xcpus_changed)) goto out; if (mems_excl_conflict(trial, c)) goto out; @@ -738,49 +726,6 @@ out: } #ifdef CONFIG_SMP -/* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? 
- */ -static int cpusets_overlap(struct cpuset *a, struct cpuset *b) -{ - return cpumask_intersects(a->effective_cpus, b->effective_cpus); -} - -static void -update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) -{ - if (dattr->relax_domain_level < c->relax_domain_level) - dattr->relax_domain_level = c->relax_domain_level; - return; -} - -static void update_domain_attr_tree(struct sched_domain_attr *dattr, - struct cpuset *root_cs) -{ - struct cpuset *cp; - struct cgroup_subsys_state *pos_css; - - rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp doesn't have any CPU */ - if (cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - - if (is_sched_load_balance(cp)) - update_domain_attr(dattr, cp); - } - rcu_read_unlock(); -} - -/* Must be called with cpuset_mutex held. */ -static inline int nr_cpusets(void) -{ - /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key.key) + 1; -} /* * generate_sched_domains() @@ -820,103 +765,46 @@ static inline int nr_cpusets(void) * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) - * - * Finding the best partition (set of domains): - * The double nested loops below over i, j scan over the load - * balanced cpusets (using the array of cpuset pointers in csa[]) - * looking for pairs of cpusets that have overlapping cpus_allowed - * and merging them using a union-find algorithm. - * - * The union of the cpus_allowed masks from the set of all cpusets - * having the same root then form the one element of the partition - * (one sched domain) to be passed to partition_sched_domains(). - * */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ - int csn; /* how many cpuset ptrs in csa so far */ int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. 
sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; - bool root_load_balance = is_sched_load_balance(&top_cpuset); - bool cgrpv2 = cpuset_v2(); - int nslot_update; + + if (!cpuset_v2()) + return cpuset1_generate_sched_domains(domains, attributes); doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && cpumask_empty(subpartitions_cpus)) { -single_root_domain: + if (cpumask_empty(subpartitions_cpus)) { ndoms = 1; - doms = alloc_sched_domains(ndoms); - if (!doms) - goto done; - - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); - if (dattr) { - *dattr = SD_ATTR_INIT; - update_domain_attr_tree(dattr, &top_cpuset); - } - cpumask_and(doms[0], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_TYPE_DOMAIN)); - - goto done; + /* !csa will be checked and can be correctly handled */ + goto generate_doms; } csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); if (!csa) goto done; - csn = 0; + /* Find how many partitions and cache them to csa[] */ rcu_read_lock(); - if (root_load_balance) - csa[csn++] = &top_cpuset; cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { - if (cp == &top_cpuset) - continue; - - if (cgrpv2) - goto v2; - - /* - * v1: - * Continue traversing beyond @cp iff @cp has some CPUs and - * isn't load balancing. The former is obvious. The - * latter: All child cpusets contain a subset of the - * parent's cpus, so just skip them, and then we call - * update_domain_attr_tree() to calc relax_domain_level of - * the corresponding sched domain. - */ - if (!cpumask_empty(cp->cpus_allowed) && - !(is_sched_load_balance(cp) && - cpumask_intersects(cp->cpus_allowed, - housekeeping_cpumask(HK_TYPE_DOMAIN)))) - continue; - - if (is_sched_load_balance(cp) && - !cpumask_empty(cp->effective_cpus)) - csa[csn++] = cp; - - /* skip @cp's subtree */ - pos_css = css_rightmost_descendant(pos_css); - continue; - -v2: /* * Only valid partition roots that are not isolated and with - * non-empty effective_cpus will be saved into csn[]. + * non-empty effective_cpus will be saved into csa[]. */ if ((cp->partition_root_state == PRS_ROOT) && !cpumask_empty(cp->effective_cpus)) - csa[csn++] = cp; + csa[ndoms++] = cp; /* * Skip @cp's subtree if not a partition root and has no @@ -927,40 +815,18 @@ v2: } rcu_read_unlock(); - /* - * If there are only isolated partitions underneath the cgroup root, - * we can optimize out unneeded sched domains scanning. - */ - if (root_load_balance && (csn == 1)) - goto single_root_domain; - - for (i = 0; i < csn; i++) - uf_node_init(&csa[i]->node); - - /* Merge overlapping cpusets */ - for (i = 0; i < csn; i++) { - for (j = i + 1; j < csn; j++) { - if (cpusets_overlap(csa[i], csa[j])) { + for (i = 0; i < ndoms; i++) { + for (j = i + 1; j < ndoms; j++) { + if (cpusets_overlap(csa[i], csa[j])) /* * Cgroup v2 shouldn't pass down overlapping * partition root cpusets. */ - WARN_ON_ONCE(cgrpv2); - uf_union(&csa[i]->node, &csa[j]->node); - } + WARN_ON_ONCE(1); } } - /* Count the total number of domains */ - for (i = 0; i < csn; i++) { - if (uf_find(&csa[i]->node) == &csa[i]->node) - ndoms++; - } - - /* - * Now we know how many domains to create. - * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 
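On the default hierarchy, partition roots are guaranteed not to overlap, so the rewritten generate_sched_domains() needs no union-find: each valid, non-isolated partition root with non-empty effective_cpus becomes exactly one sched domain, and overlap is only checked as a WARN_ON_ONCE sanity test. A simplified, illustrative model of that v2-only path, with plain 64-bit masks standing in for cpumasks and hk_domain standing in for the HK_TYPE_DOMAIN housekeeping mask:

#include <stdint.h>

static int build_domains(const uint64_t *root_effective, int nroots,
			 uint64_t hk_domain, uint64_t *doms)
{
	int i, ndoms = 0;

	for (i = 0; i < nroots; i++) {
		if (!root_effective[i])
			continue;	/* skip empty partition roots */
		/* index 0 plays the role of top_cpuset: exclude
		 * boot-time isolated CPUs from its domain
		 */
		doms[ndoms++] = i == 0 ? root_effective[i] & hk_domain
				       : root_effective[i];
	}
	return ndoms;
}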
- */ +generate_doms: doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -977,45 +843,19 @@ v2: * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a * subset of HK_TYPE_DOMAIN housekeeping CPUs. */ - if (cgrpv2) { - for (i = 0; i < ndoms; i++) { - /* - * The top cpuset may contain some boot time isolated - * CPUs that need to be excluded from the sched domain. - */ - if (csa[i] == &top_cpuset) - cpumask_and(doms[i], csa[i]->effective_cpus, - housekeeping_cpumask(HK_TYPE_DOMAIN)); - else - cpumask_copy(doms[i], csa[i]->effective_cpus); - if (dattr) - dattr[i] = SD_ATTR_INIT; - } - goto done; - } - - for (nslot = 0, i = 0; i < csn; i++) { - nslot_update = 0; - for (j = i; j < csn; j++) { - if (uf_find(&csa[j]->node) == &csa[i]->node) { - struct cpumask *dp = doms[nslot]; - - if (i == j) { - nslot_update = 1; - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - } - cpumask_or(dp, dp, csa[j]->effective_cpus); - cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); - if (dattr) - update_domain_attr_tree(dattr + nslot, csa[j]); - } - } - if (nslot_update) - nslot++; + for (i = 0; i < ndoms; i++) { + /* + * The top cpuset may contain some boot time isolated + * CPUs that need to be excluded from the sched domain. + */ + if (!csa || csa[i] == &top_cpuset) + cpumask_and(doms[i], top_cpuset.effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + else + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; } - BUG_ON(nslot != ndoms); done: kfree(csa); @@ -1055,7 +895,7 @@ void dl_rebuild_rd_accounting(void) int cpu; u64 cookie = ++dl_cookie; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); @@ -1100,53 +940,33 @@ void dl_rebuild_rd_accounting(void) */ void rebuild_sched_domains_locked(void) { - struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; - struct cpuset *cs; int ndoms; + int i; lockdep_assert_cpus_held(); - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); force_sd_rebuild = false; - /* - * If we have raced with CPU hotplug, return early to avoid - * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_handle_hotplug() will rebuild sched domains. - * - * With no CPUs in any subpartitions, top_cpuset's effective CPUs - * should be the same as the active CPUs, so checking only top_cpuset - * is enough to detect racing CPU offlines. - */ - if (cpumask_empty(subpartitions_cpus) && - !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - return; + /* Generate domain masks and attrs */ + ndoms = generate_sched_domains(&doms, &attr); /* - * With subpartition CPUs, however, the effective CPUs of a partition - * root should be only a subset of the active CPUs. Since a CPU in any - * partition root could be offlined, all must be checked. - */ - if (!cpumask_empty(subpartitions_cpus)) { - rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (!is_partition_valid(cs)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - if (!cpumask_subset(cs->effective_cpus, - cpu_active_mask)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); + * cpuset_hotplug_workfn is invoked synchronously now, thus this + * function should not race with CPU hotplug. And the effective CPUs + * must not include any offline CPUs. 
Passing an offline CPU in the + * doms to partition_sched_domains() will trigger a kernel panic. + * + * We perform a final check here: if the doms contains any + * offline CPUs, a warning is emitted and we return directly to + * prevent the panic. + */ + for (i = 0; i < ndoms; ++i) { + if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) + return; } - /* Generate domain masks and attrs */ - ndoms = generate_sched_domains(&doms, &attr); - /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); } @@ -1205,11 +1025,10 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) if (top_cs) { /* + * PF_KTHREAD tasks are handled by housekeeping. * PF_NO_SETAFFINITY tasks are ignored. - * All per cpu kthreads should have PF_NO_SETAFFINITY - * flag set, see kthread_set_per_cpu(). */ - if (task->flags & PF_NO_SETAFFINITY) + if (task->flags & (PF_KTHREAD | PF_NO_SETAFFINITY)) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { @@ -1450,15 +1269,16 @@ static bool isolated_cpus_can_update(struct cpumask *add_cpus, * @new_cpus: cpu mask * Return: true if there is conflict, false otherwise * - * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * CPUs outside of HK_TYPE_DOMAIN_BOOT, if defined, can only be used in an * isolated partition. */ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) { - if (!have_boot_isolcpus) + if (!housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) return false; - if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) + if ((prstate != PRS_ISOLATED) && + !cpumask_subset(new_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT))) return true; return false; @@ -1477,29 +1297,13 @@ static void update_isolation_cpumasks(void) if (!isolated_cpus_updating) return; - lockdep_assert_cpus_held(); - - ret = workqueue_unbound_exclude_cpumask(isolated_cpus); - WARN_ON_ONCE(ret < 0); - - ret = tmigr_isolated_exclude_cpumask(isolated_cpus); + ret = housekeeping_update(isolated_cpus); WARN_ON_ONCE(ret < 0); isolated_cpus_updating = false; } /** - * cpuset_cpu_is_isolated - Check if the given CPU is isolated - * @cpu: the CPU number to be checked - * Return: true if CPU is used in an isolated partition, false otherwise - */ -bool cpuset_cpu_is_isolated(int cpu) -{ - return cpumask_test_cpu(cpu, isolated_cpus); -} -EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); - -/** * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets * @parent: Parent cpuset containing all siblings * @cs: Current cpuset (will be skipped) @@ -1517,23 +1321,29 @@ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, int retval = 0; if (cpumask_empty(excpus)) - return retval; + return 0; /* - * Exclude exclusive CPUs from siblings + * Remove exclusive CPUs from siblings */ rcu_read_lock(); cpuset_for_each_child(sibling, css, parent) { + struct cpumask *sibling_xcpus; + if (sibling == cs) continue; - if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { - cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); - retval++; - continue; - } - if (cpumask_intersects(excpus, sibling->effective_xcpus)) { - cpumask_andnot(excpus, excpus, sibling->effective_xcpus); + /* + * If exclusive_cpus is defined, effective_xcpus will always + * be a subset. Otherwise, effective_xcpus will only be set + * in a valid partition root. + */ + sibling_xcpus = cpumask_empty(sibling->exclusive_cpus) + ? 
sibling->effective_xcpus + : sibling->exclusive_cpus; + + if (cpumask_intersects(excpus, sibling_xcpus)) { + cpumask_andnot(excpus, excpus, sibling_xcpus); retval++; } } @@ -1822,7 +1632,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int parent_prs = parent->partition_root_state; bool nocpu; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ /* @@ -2331,17 +2141,13 @@ get_css: spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; - if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) - compute_excpus(cp, cp->effective_xcpus); - /* - * Make sure effective_xcpus is properly set for a valid - * partition root. + * Need to compute effective_xcpus if either exclusive_cpus + * is non-empty or it is a valid partition root. */ - if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) - cpumask_and(cp->effective_xcpus, - cp->cpus_allowed, parent->effective_xcpus); - else if (new_prs < 0) + if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus)) + compute_excpus(cp, cp->effective_xcpus); + if (new_prs <= 0) reset_partition_data(cp); spin_unlock_irq(&callback_lock); @@ -2394,7 +2200,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); /* * Check all its siblings and call update_cpumasks_hier() @@ -2403,27 +2209,20 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus - * directly. + * directly. It should not impact valid partition. * * The update_cpumasks_hier() function may sleep. So we have to * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { - if (sibling == cs) + if (sibling == cs || is_partition_valid(sibling)) continue; - if (!is_partition_valid(sibling)) { - compute_effective_cpumask(tmp->new_cpus, sibling, - parent); - if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) - continue; - } else if (is_remote_partition(sibling)) { - /* - * Change in a sibling cpuset won't affect a remote - * partition root. - */ + + compute_effective_cpumask(tmp->new_cpus, sibling, + parent); + if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; - } if (!css_tryget_online(&sibling->css)) continue; @@ -2479,43 +2278,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri return PERR_NONE; } -static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, - struct tmpmasks *tmp) -{ - int retval; - struct cpuset *parent = parent_cs(cs); - - retval = validate_change(cs, trialcs); - - if ((retval == -EINVAL) && cpuset_v2()) { - struct cgroup_subsys_state *css; - struct cpuset *cp; - - /* - * The -EINVAL error code indicates that partition sibling - * CPU exclusivity rule has been violated. We still allow - * the cpumask change to proceed while invalidating the - * partition. However, any conflicting sibling partitions - * have to be marked as invalid too. 
- */ - trialcs->prs_err = PERR_NOTEXCL; - rcu_read_lock(); - cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = user_xcpus(trialcs); - - if (is_partition_valid(cp) && - cpumask_intersects(xcpus, cp->effective_xcpus)) { - rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); - rcu_read_lock(); - } - } - rcu_read_unlock(); - retval = 0; - } - return retval; -} - /** * partition_cpus_change - Handle partition state changes due to CPU mask updates * @cs: The target cpuset being modified @@ -2575,15 +2337,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - if (alloc_tmpmasks(&tmp)) - return -ENOMEM; - compute_trialcs_excpus(trialcs, cs); trialcs->prs_err = PERR_NONE; - retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + retval = validate_change(cs, trialcs); if (retval < 0) - goto out_free; + return retval; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; /* * Check all the descendants in update_cpumasks_hier() if @@ -2606,7 +2368,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); -out_free: + free_tmpmasks(&tmp); return retval; } @@ -2859,13 +2621,13 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); - nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); + bool has_mems = nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (is_in_v2_mode() && nodes_empty(*new_mems)) + if (is_in_v2_mode() && !has_mems) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -3265,7 +3027,7 @@ static nodemask_t cpuset_attach_nodemask_to; static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); if (cs != &top_cpuset) guarantee_active_cpus(task, cpus_attach); @@ -3621,8 +3383,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - fmeter_init(&cs->fmeter); - cs->relax_domain_level = -1; + cpuset1_init(cs); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cpuset_v2()) @@ -3635,17 +3396,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); - struct cpuset *tmp_cs; - struct cgroup_subsys_state *pos_css; if (!parent) return 0; cpuset_full_lock(); - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); /* * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated */ @@ -3660,39 +3415,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; } spin_unlock_irq(&callback_lock); + cpuset1_online_css(css); - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) - goto out_unlock; - - /* - * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is - * set. This flag handling is implemented in cgroup core for - * historical reasons - the flag may be specified during mount. 
- * - * Currently, if any sibling cpusets have exclusive cpus or mem, we - * refuse to clone the configuration - thereby refusing the task to - * be entered, and as a result refusing the sys_unshare() or - * clone() which initiated it. If this becomes a problem for some - * users who wish to allow that scenario, then this could be - * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. - */ - rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_css, parent) { - if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { - rcu_read_unlock(); - goto out_unlock; - } - } - rcu_read_unlock(); - - spin_lock_irq(&callback_lock); - cs->mems_allowed = parent->mems_allowed; - cs->effective_mems = parent->mems_allowed; - cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); - cpumask_copy(cs->effective_cpus, parent->cpus_allowed); - spin_unlock_irq(&callback_lock); -out_unlock: cpuset_full_unlock(); return 0; } @@ -3892,16 +3616,13 @@ int __init cpuset_init(void) cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); - fmeter_init(&top_cpuset.fmeter); + cpuset1_init(&top_cpuset); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); - have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); - if (have_boot_isolcpus) { - BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); - cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); - cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); - } + if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) + cpumask_andnot(isolated_cpus, cpu_possible_mask, + housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)); return 0; } @@ -4229,7 +3950,7 @@ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask */ void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); __cpuset_cpus_allowed_locked(tsk, pmask); } diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 81ea38dd6f9d..a5490097fe52 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -230,7 +230,7 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) } static void cgroup_masks_read_one(struct seq_file *seq, const char *name, - u16 mask) + u32 mask) { struct cgroup_subsys *ss; int ssid; diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index e12b946278b6..1ea6afffa985 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -14,6 +14,7 @@ #include <linux/mutex.h> #include <linux/page_counter.h> #include <linux/parser.h> +#include <linux/refcount.h> #include <linux/rculist.h> #include <linux/slab.h> @@ -71,7 +72,9 @@ struct dmem_cgroup_pool_state { struct rcu_head rcu; struct page_counter cnt; + struct dmem_cgroup_pool_state *parent; + refcount_t ref; bool inited; }; @@ -88,6 +91,9 @@ struct dmem_cgroup_pool_state { static DEFINE_SPINLOCK(dmemcg_lock); static LIST_HEAD(dmem_cgroup_regions); +static void dmemcg_free_region(struct kref *ref); +static void dmemcg_pool_free_rcu(struct rcu_head *rcu); + static inline struct dmemcg_state * css_to_dmemcs(struct cgroup_subsys_state *css) { @@ -104,10 +110,38 @@ static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg) return cg->css.parent ? 
css_to_dmemcs(cg->css.parent) : NULL; } +static void dmemcg_pool_get(struct dmem_cgroup_pool_state *pool) +{ + refcount_inc(&pool->ref); +} + +static bool dmemcg_pool_tryget(struct dmem_cgroup_pool_state *pool) +{ + return refcount_inc_not_zero(&pool->ref); +} + +static void dmemcg_pool_put(struct dmem_cgroup_pool_state *pool) +{ + if (!refcount_dec_and_test(&pool->ref)) + return; + + call_rcu(&pool->rcu, dmemcg_pool_free_rcu); +} + +static void dmemcg_pool_free_rcu(struct rcu_head *rcu) +{ + struct dmem_cgroup_pool_state *pool = container_of(rcu, typeof(*pool), rcu); + + if (pool->parent) + dmemcg_pool_put(pool->parent); + kref_put(&region->ref, dmemcg_free_region); + kfree(pool); +} + static void free_cg_pool(struct dmem_cgroup_pool_state *pool) { list_del(&pool->region_node); - kfree(pool); + dmemcg_pool_put(pool); } static void @@ -342,6 +376,12 @@ alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region page_counter_init(&pool->cnt, ppool ? &ppool->cnt : NULL, true); reset_all_resource_limits(pool); + refcount_set(&pool->ref, 1); + kref_get(&region->ref); + if (ppool && !pool->parent) { + pool->parent = ppool; + dmemcg_pool_get(ppool); + } list_add_tail_rcu(&pool->css_node, &dmemcs->pools); list_add_tail(&pool->region_node, &region->pools); @@ -389,6 +429,10 @@ get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *regio /* Fix up parent links, mark as inited. */ pool->cnt.parent = &ppool->cnt; + if (ppool && !pool->parent) { + pool->parent = ppool; + dmemcg_pool_get(ppool); + } pool->inited = true; pool = ppool; @@ -423,7 +467,7 @@ static void dmemcg_free_region(struct kref *ref) */ void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) { - struct list_head *entry; + struct dmem_cgroup_pool_state *pool, *next; if (!region) return; @@ -433,11 +477,10 @@ void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) /* Remove from global region list */ list_del_rcu(&region->region_node); - list_for_each_rcu(entry, &region->pools) { - struct dmem_cgroup_pool_state *pool = - container_of(entry, typeof(*pool), region_node); - + list_for_each_entry_safe(pool, next, &region->pools, region_node) { list_del_rcu(&pool->css_node); + list_del(&pool->region_node); + dmemcg_pool_put(pool); } /* @@ -518,8 +561,10 @@ static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name) */ void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool) { - if (pool) + if (pool) { css_put(&pool->cs->css); + dmemcg_pool_put(pool); + } } EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put); @@ -533,6 +578,8 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) pool = find_cg_pool_locked(cg, region); if (pool && !READ_ONCE(pool->inited)) pool = NULL; + if (pool && !dmemcg_pool_tryget(pool)) + pool = NULL; rcu_read_unlock(); while (!pool) { @@ -541,6 +588,8 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) pool = get_cg_pool_locked(cg, region, &allocpool); else pool = ERR_PTR(-ENODEV); + if (!IS_ERR(pool)) + dmemcg_pool_get(pool); spin_unlock(&dmemcg_lock); if (pool == ERR_PTR(-ENOMEM)) { @@ -576,6 +625,7 @@ void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size) page_counter_uncharge(&pool->cnt, size); css_put(&pool->cs->css); + dmemcg_pool_put(pool); } EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge); @@ -627,7 +677,9 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, if (ret_limit_pool) { *ret_limit_pool = container_of(fail, struct
dmem_cgroup_pool_state, cnt); css_get(&(*ret_limit_pool)->cs->css); + dmemcg_pool_get(*ret_limit_pool); } + dmemcg_pool_put(pool); ret = -EAGAIN; goto err; } @@ -700,6 +752,9 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of, if (!region_name[0]) continue; + if (!options || !*options) + return -EINVAL; + rcu_read_lock(); region = dmemcg_get_region_by_name(region_name); rcu_read_unlock(); @@ -719,6 +774,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of, /* And commit */ apply(pool, new_limit); + dmemcg_pool_put(pool); out_put: kref_put(&region->ref, dmemcg_free_region); diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 9f6ab7dabf67..774702591d26 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -84,7 +84,7 @@ CONFIG_SLUB_DEBUG_ON=y # Debug Oops, Lockups and Hangs # CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0 -# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0 CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DETECT_HUNG_TASK=y CONFIG_PANIC_ON_OOPS=y diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index fb5be6e9b423..a743e7ffa6c0 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -54,24 +54,6 @@ static __always_inline void rcu_task_enter(void) #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } -/* Turn on heavyweight RCU tasks trace readers on kernel exit. */ -static __always_inline void rcu_task_trace_heavyweight_enter(void) -{ -#ifdef CONFIG_TASKS_TRACE_RCU - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - current->trc_reader_special.b.need_mb = true; -#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ -} - -/* Turn off heavyweight RCU tasks trace readers on kernel entry. */ -static __always_inline void rcu_task_trace_heavyweight_exit(void) -{ -#ifdef CONFIG_TASKS_TRACE_RCU - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - current->trc_reader_special.b.need_mb = false; -#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ -} - /* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state, that is, @@ -85,7 +67,6 @@ static noinstr void ct_kernel_exit_state(int offset) * critical sections, and we also must force ordering with the * next idle sojourn. */ - rcu_task_trace_heavyweight_enter(); // Before CT state update! // RCU is still watching. Better not be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu()); (void)ct_state_inc(offset); @@ -108,7 +89,6 @@ static noinstr void ct_kernel_enter_state(int offset) */ seq = ct_state_inc(offset); // RCU is now watching. Better not be in an extended quiescent state! - rcu_task_trace_heavyweight_exit(); // After CT state update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING)); } diff --git a/kernel/cpu.c b/kernel/cpu.c index 8df2d773fe3b..01968a5c4a16 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -534,6 +534,11 @@ int lockdep_is_cpus_held(void) { return percpu_rwsem_is_held(&cpu_hotplug_lock); } + +int lockdep_is_cpus_write_held(void) +{ + return percpu_rwsem_is_write_held(&cpu_hotplug_lock); +} #endif static void lockdep_acquire_cpus_lock(void) @@ -1410,6 +1415,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, cpus_write_lock(); + /* + * Keep at least one housekeeping cpu onlined to avoid generating + * an empty sched_domain span.
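For reference, cpumask_any_and() returns a CPU number >= nr_cpu_ids when the two masks share no CPU, so the check added below reads: refuse the offline request with -EBUSY if no online HK_TYPE_DOMAIN housekeeping CPU is found, rather than let the scheduler end up with an empty sched_domain span. A minimal model of that idiom (illustrative user-space code, plain bitmask instead of struct cpumask):

unsigned int first_common_bit(unsigned long a, unsigned long b, unsigned int nbits)
{
	for (unsigned int i = 0; i < nbits; i++)
		if ((a & b) & (1UL << i))
			return i;
	return nbits;	/* "none found" is reported as an out-of-range index */
}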
+ */ + if (cpumask_any_and(cpu_online_mask, + housekeeping_cpumask(HK_TYPE_DOMAIN)) >= nr_cpu_ids) { + ret = -EBUSY; + goto out; + } + cpuhp_tasks_frozen = tasks_frozen; prev_state = cpuhp_set_state(cpu, st, target); @@ -1456,22 +1471,8 @@ out: return ret; } -struct cpu_down_work { - unsigned int cpu; - enum cpuhp_state target; -}; - -static long __cpu_down_maps_locked(void *arg) -{ - struct cpu_down_work *work = arg; - - return _cpu_down(work->cpu, 0, work->target); -} - static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) { - struct cpu_down_work work = { .cpu = cpu, .target = target, }; - /* * If the platform does not support hotplug, report it explicitly to * differentiate it from a transient offlining failure. @@ -1480,18 +1481,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) return -EOPNOTSUPP; if (cpu_hotplug_disabled) return -EBUSY; - - /* - * Ensure that the control task does not run on the to be offlined - * CPU to prevent a deadlock against cfs_b->period_timer. - * Also keep at least one housekeeping cpu onlined to avoid generating - * an empty sched_domain span. - */ - for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) { - if (cpu != work.cpu) - return work_on_cpu(cpu, __cpu_down_maps_locked, &work); - } - return -EBUSY; + return _cpu_down(cpu, 0, target); } static int cpu_down(unsigned int cpu, enum cpuhp_state target) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 99dac1aa972a..3952b3e102e0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -44,9 +44,15 @@ note_buf_t __percpu *crash_notes; int kimage_crash_copy_vmcoreinfo(struct kimage *image) { - struct page *vmcoreinfo_page; + struct page *vmcoreinfo_base; + struct page *vmcoreinfo_pages[DIV_ROUND_UP(VMCOREINFO_BYTES, PAGE_SIZE)]; + unsigned int order, nr_pages; + int i; void *safecopy; + nr_pages = DIV_ROUND_UP(VMCOREINFO_BYTES, PAGE_SIZE); + order = get_order(VMCOREINFO_BYTES); + if (!IS_ENABLED(CONFIG_CRASH_DUMP)) return 0; if (image->type != KEXEC_TYPE_CRASH) @@ -61,12 +67,15 @@ int kimage_crash_copy_vmcoreinfo(struct kimage *image) * happens to generate vmcoreinfo note, hereby we rely on * vmap for this purpose. 
*/ - vmcoreinfo_page = kimage_alloc_control_pages(image, 0); - if (!vmcoreinfo_page) { + vmcoreinfo_base = kimage_alloc_control_pages(image, order); + if (!vmcoreinfo_base) { pr_warn("Could not allocate vmcoreinfo buffer\n"); return -ENOMEM; } - safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + for (i = 0; i < nr_pages; i++) + vmcoreinfo_pages[i] = vmcoreinfo_base + i; + + safecopy = vmap(vmcoreinfo_pages, nr_pages, VM_MAP, PAGE_KERNEL); if (!safecopy) { pr_warn("Could not vmap vmcoreinfo buffer\n"); return -ENOMEM; diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c index 401423ba477d..37129243054d 100644 --- a/kernel/crash_dump_dm_crypt.c +++ b/kernel/crash_dump_dm_crypt.c @@ -143,6 +143,7 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key) { const struct user_key_payload *ukp; struct key *key; + int ret = 0; kexec_dprintk("Requesting logon key %s", dm_key->key_desc); key = request_key(&key_type_logon, dm_key->key_desc, NULL); @@ -152,20 +153,28 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key) return PTR_ERR(key); } + down_read(&key->sem); ukp = user_key_payload_locked(key); - if (!ukp) - return -EKEYREVOKED; + if (!ukp) { + ret = -EKEYREVOKED; + goto out; + } if (ukp->datalen > KEY_SIZE_MAX) { pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX); - return -EINVAL; + ret = -EINVAL; + goto out; } memcpy(dm_key->data, ukp->data, ukp->datalen); dm_key->key_size = ukp->datalen; kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size, dm_key->key_desc, dm_key->data); - return 0; + +out: + up_read(&key->sem); + key_put(key); + return ret; } struct config_key { @@ -223,7 +232,7 @@ static void config_key_release(struct config_item *item) key_count--; } -static struct configfs_item_operations config_key_item_ops = { +static const struct configfs_item_operations config_key_item_ops = { .release = config_key_release, }; @@ -298,7 +307,7 @@ static struct configfs_attribute *config_keys_attrs[] = { * Note that, since no extra work is required on ->drop_item(), * no ->drop_item() is provided. */ -static struct configfs_group_operations config_keys_group_ops = { +static const struct configfs_group_operations config_keys_group_ops = { .make_item = config_keys_make_item, }; diff --git a/kernel/cred.c b/kernel/cred.c index a6f686b30da1..12a7b1ce5131 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -621,29 +621,6 @@ int set_security_override(struct cred *new, u32 secid) EXPORT_SYMBOL(set_security_override); /** - * set_security_override_from_ctx - Set the security ID in a set of credentials - * @new: The credentials to alter - * @secctx: The LSM security context to generate the security ID from. - * - * Set the LSM security ID in a set of credentials so that the subjective - * security is overridden when an alternative set of credentials is used. The - * security ID is specified in string form as a security context to be - * interpreted by the LSM. 
- */ -int set_security_override_from_ctx(struct cred *new, const char *secctx) -{ - u32 secid; - int ret; - - ret = security_secctx_to_secid(secctx, strlen(secctx), &secid); - if (ret < 0) - return ret; - - return set_security_override(new, secid); -} -EXPORT_SYMBOL(set_security_override_from_ctx); - -/** * set_create_files_as - Set the LSM file create context in a set of credentials * @new: The credentials to alter * @inode: The inode to take the context from diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 22fe969c5d2e..f586afd76c80 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -27,6 +27,7 @@ #include <linux/kernel.h> #include <linux/sched/signal.h> +#include <linux/hex.h> #include <linux/kgdb.h> #include <linux/kdb.h> #include <linux/serial_core.h> diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 30e7912ebb0d..2e55c493c98b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -18,6 +18,8 @@ do { \ d->type##_delay_max = tsk->delays->type##_delay_max; \ d->type##_delay_min = tsk->delays->type##_delay_min; \ + d->type##_delay_max_ts.tv_sec = tsk->delays->type##_delay_max_ts.tv_sec; \ + d->type##_delay_max_ts.tv_nsec = tsk->delays->type##_delay_max_ts.tv_nsec; \ tmp = d->type##_delay_total + tsk->delays->type##_delay; \ d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \ d->type##_count += tsk->delays->type##_count; \ @@ -104,7 +106,8 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumulator (@total) and @count */ -static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min) +static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, + u64 *max, u64 *min, struct timespec64 *ts) { s64 ns = local_clock() - *start; unsigned long flags; @@ -113,8 +116,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou raw_spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - if (ns > *max) + if (ns > *max) { *max = ns; + ktime_get_real_ts64(ts); + } if (*min == 0 || ns < *min) *min = ns; raw_spin_unlock_irqrestore(lock, flags); @@ -137,7 +142,8 @@ void __delayacct_blkio_end(struct task_struct *p) &p->delays->blkio_delay, &p->delays->blkio_count, &p->delays->blkio_delay_max, - &p->delays->blkio_delay_min); + &p->delays->blkio_delay_min, + &p->delays->blkio_delay_max_ts); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -170,6 +176,8 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_delay_max = tsk->sched_info.max_run_delay; d->cpu_delay_min = tsk->sched_info.min_run_delay; + d->cpu_delay_max_ts.tv_sec = tsk->sched_info.max_run_delay_ts.tv_sec; + d->cpu_delay_max_ts.tv_nsec = tsk->sched_info.max_run_delay_ts.tv_nsec; tmp = (s64)d->cpu_delay_total + t2; d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 
0 : tmp; tmp = (s64)d->cpu_run_virtual_total + t3; @@ -217,7 +225,8 @@ void __delayacct_freepages_end(void) &current->delays->freepages_delay, &current->delays->freepages_count, &current->delays->freepages_delay_max, - &current->delays->freepages_delay_min); + &current->delays->freepages_delay_min, + &current->delays->freepages_delay_max_ts); } void __delayacct_thrashing_start(bool *in_thrashing) @@ -241,7 +250,8 @@ void __delayacct_thrashing_end(bool *in_thrashing) &current->delays->thrashing_delay, &current->delays->thrashing_count, &current->delays->thrashing_delay_max, - &current->delays->thrashing_delay_min); + &current->delays->thrashing_delay_min, + &current->delays->thrashing_delay_max_ts); } void __delayacct_swapin_start(void) @@ -256,7 +266,8 @@ void __delayacct_swapin_end(void) &current->delays->swapin_delay, &current->delays->swapin_count, &current->delays->swapin_delay_max, - &current->delays->swapin_delay_min); + &current->delays->swapin_delay_min, + &current->delays->swapin_delay_max_ts); } void __delayacct_compact_start(void) @@ -271,7 +282,8 @@ void __delayacct_compact_end(void) &current->delays->compact_delay, &current->delays->compact_count, &current->delays->compact_delay_max, - &current->delays->compact_delay_min); + &current->delays->compact_delay_min, + &current->delays->compact_delay_max_ts); } void __delayacct_wpcopy_start(void) @@ -286,7 +298,8 @@ void __delayacct_wpcopy_end(void) &current->delays->wpcopy_delay, &current->delays->wpcopy_count, &current->delays->wpcopy_delay_max, - &current->delays->wpcopy_delay_min); + &current->delays->wpcopy_delay_min, + &current->delays->wpcopy_delay_max_ts); } void __delayacct_irq(struct task_struct *task, u32 delta) @@ -296,8 +309,10 @@ void __delayacct_irq(struct task_struct *task, u32 delta) raw_spin_lock_irqsave(&task->delays->lock, flags); task->delays->irq_delay += delta; task->delays->irq_count++; - if (delta > task->delays->irq_delay_max) + if (delta > task->delays->irq_delay_max) { task->delays->irq_delay_max = delta; + ktime_get_real_ts64(&task->delays->irq_delay_max_ts); + } if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min)) task->delays->irq_delay_min = delta; raw_spin_unlock_irqrestore(&task->delays->lock, flags); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index d8fd6f779f79..c56004d314dc 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -91,6 +91,16 @@ static int __init early_cma(char *p) } early_param("cma", early_cma); +/* + * cma_skip_dt_default_reserved_mem - This is called from the + * reserved_mem framework to detect if the default cma region is being + * set by the "cma=" kernel parameter.
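For illustration: the helper added below only exports the fact that a "cma=" area was requested on the command line (size_cmdline != -1); the decision to then skip a devicetree default CMA node moves into the reserved_mem framework, and the old bypass inside rmem_cma_setup() is deleted further down in this patch. A hypothetical call-site sketch (the real caller is outside this diff; is_default_cma_node is a made-up flag):

	if (is_default_cma_node && cma_skip_dt_default_reserved_mem()) {
		/* "cma=" on the command line overrides the DT default region */
		return;
	}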
+ */ +bool __init cma_skip_dt_default_reserved_mem(void) +{ + return size_cmdline != -1; +} + #ifdef CONFIG_DMA_NUMA_CMA static struct cma *dma_contiguous_numa_area[MAX_NUMNODES]; @@ -247,10 +257,12 @@ void __init dma_contiguous_reserve(phys_addr_t limit) pr_debug("%s: reserving %ld MiB for global area\n", __func__, (unsigned long)selected_size / SZ_1M); - dma_contiguous_reserve_area(selected_size, selected_base, - selected_limit, - &dma_contiguous_default_area, - fixed); + ret = dma_contiguous_reserve_area(selected_size, selected_base, + selected_limit, + &dma_contiguous_default_area, + fixed); + if (ret) + return; ret = dma_heap_cma_register_heap(dma_contiguous_default_area); if (ret) @@ -470,12 +482,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) struct cma *cma; int err; - if (size_cmdline != -1 && default_cma) { - pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n", - rmem->name); - return -EBUSY; - } - if (!of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 37163eb49f9f..ee29c47781e3 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -638,7 +638,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (dma_alloc_direct(dev, ops)) { + if (dma_alloc_direct(dev, ops) || arch_dma_alloc_direct(dev)) { cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); } else if (use_dma_iommu(dev)) { cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs); @@ -679,7 +679,7 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); - if (dma_alloc_direct(dev, ops)) + if (dma_alloc_direct(dev, ops) || arch_dma_free_direct(dev, dma_handle)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); else if (use_dma_iommu(dev)) iommu_dma_free(dev, size, cpu_addr, dma_handle, attrs); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 26392badc36b..2b2fbb709242 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -184,6 +184,12 @@ static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size, return pool; } +#ifdef CONFIG_ZONE_DMA32 +#define has_managed_dma32 has_managed_zone(ZONE_DMA32) +#else +#define has_managed_dma32 false +#endif + static int __init dma_atomic_pool_init(void) { int ret = 0; @@ -199,17 +205,20 @@ static int __init dma_atomic_pool_init(void) } INIT_WORK(&atomic_pool_work, atomic_pool_work_fn); - atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, + /* All memory might be in the DMA zone(s) to begin with */ + if (has_managed_zone(ZONE_NORMAL)) { + atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL); - if (!atomic_pool_kernel) - ret = -ENOMEM; + if (!atomic_pool_kernel) + ret = -ENOMEM; + } if (has_managed_dma()) { atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA); if (!atomic_pool_dma) ret = -ENOMEM; } - if (IS_ENABLED(CONFIG_ZONE_DMA32)) { + if (has_managed_dma32) { atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA32); if (!atomic_pool_dma32) @@ -224,11 +233,11 @@ postcore_initcall(dma_atomic_pool_init); static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) { if (prev == NULL) { - if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) - 
return atomic_pool_dma32; - if (atomic_pool_dma && (gfp & GFP_DMA)) - return atomic_pool_dma; - return atomic_pool_kernel; + if (gfp & GFP_DMA) + return atomic_pool_dma ?: atomic_pool_dma32 ?: atomic_pool_kernel; + if (gfp & GFP_DMA32) + return atomic_pool_dma32 ?: atomic_pool_dma ?: atomic_pool_kernel; + return atomic_pool_kernel ?: atomic_pool_dma32 ?: atomic_pool_dma; } if (prev == atomic_pool_kernel) return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma; @@ -268,15 +277,20 @@ struct page *dma_alloc_from_pool(struct device *dev, size_t size, { struct gen_pool *pool = NULL; struct page *page; + bool pool_found = false; while ((pool = dma_guess_pool(pool, gfp))) { + pool_found = true; page = __dma_alloc_from_pool(dev, size, pool, cpu_addr, phys_addr_ok); if (page) return page; } - WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev)); + if (pool_found) + WARN(!(gfp & __GFP_NOWARN), "DMA pool exhausted for %s\n", dev_name(dev)); + else + WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev)); return NULL; } diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 5c792b30c58a..9ef63e414791 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -17,6 +17,27 @@ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } #define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) #endif +/* TIF bits, which prevent a time slice extension. */ +#ifdef CONFIG_PREEMPT_RT +/* + * Since rseq slice ext has a direct correlation to the worst case + * scheduling latency (schedule is delayed after all), only have it affect + * LAZY reschedules on PREEMPT_RT for now. + * + * However, since this delay is only applicable to userspace, a value + * for rseq_slice_extension_nsec that is strictly less than the worst case + * kernel space preempt_disable() region, should mean the scheduling latency + * is not affected, even for !LAZY. + * + * However, since this value depends on the hardware at hand, it cannot be + * pre-determined in any sensible way. Hence punt on this problem for now. 
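A rough model of the mask plumbing introduced below (bit values are invented for the example; the actual decision is taken inside rseq_grant_slice_extension(), which is outside this diff): a slice extension is only worth considering when the pending exit work consists of nothing but reschedule bits, i.e. when ti_work & TIF_SLICE_EXT_DENY is zero.

#define W_RESCHED	0x01	/* illustrative values only */
#define W_RESCHED_LAZY	0x02
#define W_SIGPENDING	0x04
#define W_NOTIFY_RESUME	0x08
#define W_EXIT_WORK	(W_RESCHED | W_RESCHED_LAZY | W_SIGPENDING | W_NOTIFY_RESUME)

#define SLICE_SCHED	(W_RESCHED | W_RESCHED_LAZY)	/* models the !PREEMPT_RT definition */
#define SLICE_DENY	(W_EXIT_WORK & ~SLICE_SCHED)	/* signals, notify-resume, ... */

/* only a pure reschedule request can be converted into a slice extension */
static int slice_extension_possible(unsigned long ti_work)
{
	return (ti_work & SLICE_DENY) == 0;
}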
+ */ +# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED_LAZY) +#else +# define TIF_SLICE_EXT_SCHED (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) +#endif +#define TIF_SLICE_EXT_DENY (EXIT_TO_USER_MODE_WORK & ~TIF_SLICE_EXT_SCHED) + static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) { @@ -28,8 +49,10 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re local_irq_enable_exit_to_user(ti_work); - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) - schedule(); + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) { + if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY)) + schedule(); + } if (ti_work & _TIF_UPROBE) uprobe_notify_resume(regs); diff --git a/kernel/entry/common.h b/kernel/entry/common.h deleted file mode 100644 index f6e6d02f07fe..000000000000 --- a/kernel/entry/common.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _COMMON_H -#define _COMMON_H - -bool syscall_user_dispatch(struct pt_regs *regs); - -#endif diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c index 940a597ded40..cd4967a9c53e 100644 --- a/kernel/entry/syscall-common.c +++ b/kernel/entry/syscall-common.c @@ -1,104 +1,23 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/audit.h> #include <linux/entry-common.h> -#include "common.h" #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> -static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) -{ - if (unlikely(audit_context())) { - unsigned long args[6]; - - syscall_get_arguments(current, regs, args); - audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); - } -} +/* Out of line to prevent tracepoint code duplication */ -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work) +long trace_syscall_enter(struct pt_regs *regs, long syscall) { - long ret = 0; - + trace_sys_enter(regs, syscall); /* - * Handle Syscall User Dispatch. This must comes first, since - * the ABI here can be something that doesn't make sense for - * other syscall_work features. + * Probes or BPF hooks in the tracepoint may have changed the + * system call number. Reread it. */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (syscall_user_dispatch(regs)) - return -1L; - } - - /* Handle ptrace */ - if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = ptrace_report_syscall_entry(regs); - if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) - return -1L; - } - - /* Do seccomp after ptrace, to catch any tracer changes. */ - if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(); - if (ret == -1L) - return ret; - } - - /* Either of the above might have changed the syscall number */ - syscall = syscall_get_nr(current, regs); - - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { - trace_sys_enter(regs, syscall); - /* - * Probes or BPF hooks in the tracepoint may have changed the - * system call number as well. - */ - syscall = syscall_get_nr(current, regs); - } - - syscall_enter_audit(regs, syscall); - - return ret ? : syscall; + return syscall_get_nr(current, regs); } -/* - * If SYSCALL_EMU is set, then the only reason to report is when - * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall - * instruction has been already reported in syscall_enter_from_user_mode(). 
- */ -static inline bool report_single_step(unsigned long work) +void trace_syscall_exit(struct pt_regs *regs, long ret) { - if (work & SYSCALL_WORK_SYSCALL_EMU) - return false; - - return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; -} - -void syscall_exit_work(struct pt_regs *regs, unsigned long work) -{ - bool step; - - /* - * If the syscall was rolled back due to syscall user dispatching, - * then the tracers below are not invoked for the same reason as - * the entry side was not invoked in syscall_trace_enter(): The ABI - * of these syscalls is unknown. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (unlikely(current->syscall_dispatch.on_dispatch)) { - current->syscall_dispatch.on_dispatch = false; - return; - } - } - - audit_syscall_exit(regs); - - if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, syscall_get_return_value(current, regs)); - - step = report_single_step(work); - if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - ptrace_report_syscall_exit(regs, step); + trace_sys_exit(regs, ret); } diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index a9055eccb27e..d89dffcc2d64 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -2,6 +2,8 @@ /* * Copyright (C) 2020 Collabora Ltd. */ + +#include <linux/entry-common.h> #include <linux/sched.h> #include <linux/prctl.h> #include <linux/ptrace.h> @@ -15,8 +17,6 @@ #include <asm/syscall.h> -#include "common.h" - static void trigger_sigsys(struct pt_regs *regs) { struct kernel_siginfo info; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 1f6589578703..9d24b6e0c91f 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -246,7 +246,7 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, if (user && !crosstask) { if (!user_mode(regs)) { - if (current->flags & (PF_KTHREAD | PF_USER_WORKER)) + if (!is_user_task(current)) goto exit_put; regs = task_pt_regs(current); } diff --git a/kernel/events/core.c b/kernel/events/core.c index 5b5cb620499e..e18119f30c29 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -57,6 +57,7 @@ #include <linux/task_work.h> #include <linux/percpu-rwsem.h> #include <linux/unwind_deferred.h> +#include <linux/kvm_types.h> #include "internal.h" @@ -166,6 +167,18 @@ enum event_type_t { EVENT_CPU = 0x10, EVENT_CGROUP = 0x20, + /* + * EVENT_GUEST is set when scheduling in/out events between the host + * and a guest with a mediated vPMU. Among other things, EVENT_GUEST + * is used: + * + * - In for_each_epc() to skip PMUs that don't support events in a + * MEDIATED_VPMU guest, i.e. don't need to be context switched. + * - To indicate the start/end point of the events in a guest. Guest + * running time is deducted for host-only (exclude_guest) events. 
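A worked example of that deduction (numbers are illustrative): if a context has accumulated time = 10ms while a mediated-vPMU guest ran for timeguest = 3ms of that window, an exclude_guest event reads 10ms - 3ms = 7ms through __perf_event_time_ctx() below, whereas an event without exclude_guest still sees the full 10ms.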
+ */ + EVENT_GUEST = 0x40, + EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, /* compound helpers */ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, @@ -458,6 +471,20 @@ static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static DEFINE_PER_CPU(bool, guest_ctx_loaded); + +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return __this_cpu_read(guest_ctx_loaded); +} +#else +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return false; +} +#endif + /* * perf event paranoia level: * -1 - not paranoid at all @@ -779,33 +806,97 @@ do { \ ___p; \ }) -#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ +static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) +{ + if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) + return true; + if ((event_type & EVENT_GUEST) && + !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU)) + return true; + return false; +} + +#define for_each_epc(_epc, _ctx, _pmu, _event_type) \ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ - if (_cgroup && !_epc->nr_cgroups) \ + if (perf_skip_pmu_ctx(_epc, _event_type)) \ continue; \ else if (_pmu && _epc->pmu != _pmu) \ continue; \ else -static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_disable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, NULL, event_type) perf_pmu_disable(pmu_ctx->pmu); } -static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_enable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, NULL, event_type) perf_pmu_enable(pmu_ctx->pmu); } static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); +static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) +{ + if (adv) + time->time += now - time->stamp; + time->stamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. 
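The rearrangement above is the whole trick: once the writer publishes the single value offset = time - stamp, any reader can reconstruct the current time as now + offset with one load and no lock, which is what the WRITE_ONCE()/READ_ONCE() pair below relies on. A minimal user-space model (names and types are illustrative, not kernel code):

#include <stdint.h>

struct time_ctx {
	uint64_t time;		/* accumulated time */
	uint64_t stamp;		/* timestamp of the last update */
	uint64_t offset;	/* published as: time - stamp */
};

/* writer side, under the context lock */
static void advance(struct time_ctx *t, uint64_t now)
{
	t->time += now - t->stamp;
	t->stamp = now;
	t->offset = t->time - t->stamp;		/* WRITE_ONCE() in the kernel */
}

/* lock-free reader side (e.g. NMI context): one load, one add */
static uint64_t time_now(const struct time_ctx *t, uint64_t now)
{
	return now + t->offset;			/* READ_ONCE() in the kernel */
}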
+ */ + WRITE_ONCE(time->offset, time->time - time->stamp); +} + +static_assert(offsetof(struct perf_event_context, timeguest) - + offsetof(struct perf_event_context, time) == + sizeof(struct perf_time_ctx)); + +#define T_TOTAL 0 +#define T_GUEST 1 + +static inline u64 __perf_event_time_ctx(struct perf_event *event, + struct perf_time_ctx *times) +{ + u64 time = times[T_TOTAL].time; + + if (event->attr.exclude_guest) + time -= times[T_GUEST].time; + + return time; +} + +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, + struct perf_time_ctx *times, + u64 now) +{ + if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) { + /* + * (now + times[total].offset) - (now + times[guest].offset) := + * times[total].offset - times[guest].offset + */ + return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset); + } + + return now + READ_ONCE(times[T_TOTAL].offset); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -842,12 +933,16 @@ static inline int is_cgroup_event(struct perf_event *event) return event->cgrp != NULL; } +static_assert(offsetof(struct perf_cgroup_info, timeguest) - + offsetof(struct perf_cgroup_info, time) == + sizeof(struct perf_time_ctx)); + static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time; + return __perf_event_time_ctx(event, &t->time); } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) @@ -856,20 +951,21 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) - return t->time; - now += READ_ONCE(t->timeoffset); - return now; + return __perf_event_time_ctx(event, &t->time); + + return __perf_event_time_ctx_now(event, &t->time, now); } -static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) { - if (adv) - info->time += now - info->timestamp; - info->timestamp = now; - /* - * see update_context_time() - */ - WRITE_ONCE(info->timeoffset, info->time - info->timestamp); + update_perf_time_ctx(&info->timeguest, now, adv); +} + +static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) +{ + update_perf_time_ctx(&info->time, now, true); + if (is_guest_mediated_pmu_loaded()) + __update_cgrp_guest_time(info, now, true); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) @@ -885,7 +981,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, now, true); + update_cgrp_time(info, now); if (final) __store_release(&info->active, 0); } @@ -908,11 +1004,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) * Do not update time when cgroup is not active */ if (info->active) - __update_cgrp_time(info, perf_clock(), true); + update_cgrp_time(info, perf_clock()); } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; @@ -932,8 +1028,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, 
struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, ctx->timestamp, false); - __store_release(&info->active, 1); + if (guest) { + __update_cgrp_guest_time(info, ctx->time.stamp, false); + } else { + update_perf_time_ctx(&info->time, ctx->time.stamp, false); + __store_release(&info->active, 1); + } } } @@ -964,8 +1064,7 @@ static void perf_cgroup_switch(struct task_struct *task) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - - perf_ctx_disable(&cpuctx->ctx, true); + perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* @@ -981,7 +1080,7 @@ static void perf_cgroup_switch(struct task_struct *task) */ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); - perf_ctx_enable(&cpuctx->ctx, true); + perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -1138,7 +1237,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { } @@ -1550,29 +1649,24 @@ static void perf_unpin_context(struct perf_event_context *ctx) */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { - u64 now = perf_clock(); - lockdep_assert_held(&ctx->lock); - if (adv) - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; + update_perf_time_ctx(&ctx->time, perf_clock(), adv); +} - /* - * The above: time' = time + (now - timestamp), can be re-arranged - * into: time` = now + (time - timestamp), which gives a single value - * offset to compute future time without locks on. - * - * See perf_event_time_now(), which can be used from NMI context where - * it's (obviously) not possible to acquire ctx->lock in order to read - * both the above values in a consistent manner. 
- */ - WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) +{ + lockdep_assert_held(&ctx->lock); + + /* must be called after __update_context_time(); */ + update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); + if (is_guest_mediated_pmu_loaded()) + __update_context_guest_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) @@ -1585,7 +1679,7 @@ static u64 perf_event_time(struct perf_event *event) if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx->time; + return __perf_event_time_ctx(event, &ctx->time); } static u64 perf_event_time_now(struct perf_event *event, u64 now) @@ -1599,10 +1693,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) - return ctx->time; + return __perf_event_time_ctx(event, &ctx->time); - now += READ_ONCE(ctx->timeoffset); - return now; + return __perf_event_time_ctx_now(event, &ctx->time, now); } static enum event_type_t get_event_type(struct perf_event *event) @@ -2422,20 +2515,23 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) } static inline void -__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, + bool final, enum event_type_t event_type) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; + update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, final); + /* vPMU should not stop time */ + update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final); } } static inline void ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - __ctx_time_update(cpuctx, ctx, false); + __ctx_time_update(cpuctx, ctx, false, 0); } /* @@ -2861,14 +2957,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_PINNED); - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type); } /* @@ -2902,11 +2999,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_disable(epc->pmu); if (task_ctx) { - for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_disable(epc->pmu); task_ctx_sched_out(task_ctx, pmu, event_type); @@ -2924,13 +3021,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, pmu); + perf_event_sched_in(cpuctx, task_ctx, pmu, 0); - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_enable(epc->pmu); if (task_ctx) { - 
for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_enable(epc->pmu); } } @@ -3479,11 +3576,10 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3507,14 +3603,14 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t * * would only update time for the pinned events. */ - __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); - ctx->is_active &= ~event_type; + ctx->is_active &= ~active_type; if (!(ctx->is_active & EVENT_ALL)) { /* @@ -3533,9 +3629,20 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t cpuctx->task_ctx = NULL; } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule out all exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. + */ + is_active = EVENT_ALL; + __update_context_guest_time(ctx, false); + perf_cgroup_set_timestamp(cpuctx, true); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } - for_each_epc(pmu_ctx, ctx, pmu, cgroup) + for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_out(pmu_ctx, is_active); } @@ -3691,7 +3798,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ if (local_read(&ctx->nr_no_switch_fast) || @@ -3715,7 +3822,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_sched_task_cb(ctx, task, false); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); /* * RCU_INIT_POINTER here is safe because we've not @@ -3739,13 +3846,13 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); inside_switch: perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); raw_spin_unlock(&ctx->lock); } } @@ -3992,10 +4099,15 @@ static inline void group_update_userpage(struct perf_event *group_event) event_update_userpage(event); } +struct merge_sched_data { + int can_add_hw; + enum event_type_t event_type; +}; + static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; - int *can_add_hw = data; + struct merge_sched_data *msd = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -4003,13 +4115,22 @@ static int merge_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, *can_add_hw)) { + /* + * Don't schedule in any host events from PMU with + * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running. 
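The hunk above gates host-side events: while a guest's mediated PMU is loaded, merge_sched_in() refuses to schedule any event whose PMU advertises PERF_PMU_CAP_MEDIATED_VPMU unless the scheduling pass carries EVENT_GUEST. A minimal user-space sketch of that decision follows; skip_while_guest_loaded(), struct fake_event and the flag values are illustrative stand-ins, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

#define CAP_MEDIATED_VPMU (1u << 0)   /* models PERF_PMU_CAP_MEDIATED_VPMU */
#define EVENT_GUEST       (1u << 1)   /* models the EVENT_GUEST pass flag  */

struct fake_event {
        unsigned int pmu_caps;         /* capabilities of the owning PMU */
};

/*
 * Mirrors the early return added to merge_sched_in(): host events of a
 * mediated-vPMU-capable PMU are skipped while the guest context is loaded,
 * except during the dedicated EVENT_GUEST scheduling pass.
 */
static bool skip_while_guest_loaded(const struct fake_event *ev,
                                    bool guest_loaded, unsigned int pass)
{
        return guest_loaded &&
               (ev->pmu_caps & CAP_MEDIATED_VPMU) &&
               !(pass & EVENT_GUEST);
}

int main(void)
{
        struct fake_event ev = { .pmu_caps = CAP_MEDIATED_VPMU };

        printf("host pass, guest loaded : skip=%d\n",
               skip_while_guest_loaded(&ev, true, 0));
        printf("guest pass, guest loaded: skip=%d\n",
               skip_while_guest_loaded(&ev, true, EVENT_GUEST));
        printf("host pass, no guest     : skip=%d\n",
               skip_while_guest_loaded(&ev, false, 0));
        return 0;
}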
+ */ + if (is_guest_mediated_pmu_loaded() && + event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU && + !(msd->event_type & EVENT_GUEST)) + return 0; + + if (group_can_go_on(event, msd->can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { - *can_add_hw = 0; + msd->can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); @@ -4032,11 +4153,15 @@ static int merge_sched_in(struct perf_event *event, void *data) static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - int can_add_hw = 1; + struct merge_sched_data msd = { + .can_add_hw = 1, + .event_type = event_type, + }; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); + merge_sched_in, &msd); } static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, @@ -4045,20 +4170,18 @@ static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, struct perf_event_context *ctx = pmu_ctx->ctx; if (event_type & EVENT_PINNED) - pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type); if (event_type & EVENT_FLEXIBLE) - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type); } static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -4066,9 +4189,11 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t return; if (!(is_active & EVENT_TIME)) { + /* EVENT_TIME should be active while the guest runs */ + WARN_ON_ONCE(event_type & EVENT_GUEST); /* start ctx time */ __update_context_time(ctx, false); - perf_cgroup_set_timestamp(cpuctx); + perf_cgroup_set_timestamp(cpuctx, false); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() @@ -4076,7 +4201,7 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t barrier(); } - ctx->is_active |= (event_type | EVENT_TIME); + ctx->is_active |= active_type | EVENT_TIME; if (ctx->task) { if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; @@ -4084,21 +4209,37 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule in the required exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. + */ + is_active = event_type & EVENT_ALL; + + /* + * Update ctx time to set the new start time for + * the exclude_guest events. + */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, false); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. 
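As the comment above says, ctx_sched_in() still walks pinned groups before flexible ones, and merge_sched_in() clears can_add_hw after the first group that fails to go on so later hardware groups are not attempted in that pass. A rough user-space model of that ordering follows; group_fits(), the counter budget and the group names are invented for illustration and do not correspond to the real group_can_go_on() logic.

#include <stdbool.h>
#include <stdio.h>

struct fake_group { const char *name; int counters_needed; bool pinned; };

static bool group_fits(int free_counters, const struct fake_group *g)
{
        return g->counters_needed <= free_counters;
}

static void sched_in_pass(struct fake_group *groups, int n, bool pinned,
                          int *free_counters, int *can_add_hw)
{
        for (int i = 0; i < n; i++) {
                if (groups[i].pinned != pinned)
                        continue;
                if (*can_add_hw && group_fits(*free_counters, &groups[i])) {
                        *free_counters -= groups[i].counters_needed;
                        printf("scheduled %s\n", groups[i].name);
                } else {
                        /* like merge_sched_in(): the first failure stops
                         * further hardware groups in this pass */
                        *can_add_hw = 0;
                        printf("skipped   %s\n", groups[i].name);
                }
        }
}

int main(void)
{
        struct fake_group groups[] = {
                { "pinned-a", 2, true  },
                { "flex-b",   3, false },
                { "flex-c",   1, false },
        };
        int free_counters = 4, can_add_hw = 1;

        /* pinned groups first, then flexible, as in perf_event_sched_in() */
        sched_in_pass(groups, 3, true,  &free_counters, &can_add_hw);
        sched_in_pass(groups, 3, false, &free_counters, &can_add_hw);
        return 0;
}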
*/ if (is_active & EVENT_PINNED) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) - __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + for_each_epc(pmu_ctx, ctx, pmu, event_type) + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST)); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) - __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + for_each_epc(pmu_ctx, ctx, pmu, event_type) + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST)); } } @@ -4114,11 +4255,11 @@ static void perf_event_context_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); perf_ctx_sched_task_cb(ctx, task, true); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } @@ -4131,7 +4272,7 @@ static void perf_event_context_sched_in(struct task_struct *task) if (!ctx->nr_events) goto unlock; - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -4141,18 +4282,18 @@ static void perf_event_context_sched_in(struct task_struct *task) * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { - perf_ctx_disable(&cpuctx->ctx, false); + perf_ctx_disable(&cpuctx->ctx, 0); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } - perf_event_sched_in(cpuctx, ctx, NULL); + perf_event_sched_in(cpuctx, ctx, NULL, 0); perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - perf_ctx_enable(&cpuctx->ctx, false); + perf_ctx_enable(&cpuctx->ctx, 0); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -5280,9 +5421,20 @@ attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, return -ENOMEM; for (;;) { - if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) { + if (try_cmpxchg(&task->perf_ctx_data, &old, cd)) { if (old) perf_free_ctx_data_rcu(old); + /* + * Above try_cmpxchg() pairs with try_cmpxchg() from + * detach_task_ctx_data() such that + * if we race with perf_event_exit_task(), we must + * observe PF_EXITING. 
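The comment above describes how attach_task_ctx_data() pairs its try_cmpxchg() with the one in detach_task_ctx_data(): after publishing the new data it re-checks PF_EXITING and, if set, tries to take its own pointer back out and free it. Below is a compressed user-space model of that hand-off using C11 atomics; slot and exiting stand in for task->perf_ctx_data and PF_EXITING, and the model is run single-threaded purely to illustrate the ordering, not as a faithful concurrency proof.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static _Atomic(void *) slot;            /* models task->perf_ctx_data */
static atomic_bool     exiting;         /* models PF_EXITING          */

static int attach(void *cd)
{
        void *old = NULL;

        if (!atomic_compare_exchange_strong(&slot, &old, cd))
                return -1;              /* somebody else installed data */

        /*
         * Pairs with the detach side: if the task started exiting after
         * we published cd, detach may have missed it, so take it back
         * out ourselves (only if it is still ours) and free it.
         */
        if (atomic_load(&exiting)) {
                void *mine = cd;
                if (atomic_compare_exchange_strong(&slot, &mine, NULL))
                        free(cd);
        }
        return 0;
}

static void detach_on_exit(void)
{
        atomic_store(&exiting, true);
        free(atomic_exchange(&slot, NULL));     /* free(NULL) is a no-op */
}

int main(void)
{
        attach(malloc(64));
        detach_on_exit();
        printf("slot=%p exiting=%d\n", atomic_load(&slot),
               (int)atomic_load(&exiting));
        return 0;
}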
+ */ + if (task->flags & PF_EXITING) { + /* detach_task_ctx_data() may free it already */ + if (try_cmpxchg(&task->perf_ctx_data, &cd, NULL)) + perf_free_ctx_data_rcu(cd); + } return 0; } @@ -5328,6 +5480,8 @@ again: /* Allocate everything */ scoped_guard (rcu) { for_each_process_thread(g, p) { + if (p->flags & PF_EXITING) + continue; cd = rcu_dereference(p->perf_ctx_data); if (cd && !cd->global) { cd->global = 1; @@ -5594,6 +5748,8 @@ static void __free_event(struct perf_event *event) { struct pmu *pmu = event->pmu; + security_perf_event_free(event); + if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); @@ -5647,6 +5803,8 @@ static void __free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } +static void mediated_pmu_unaccount_event(struct perf_event *event); + DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) /* vs perf_event_alloc() success */ @@ -5656,8 +5814,7 @@ static void _free_event(struct perf_event *event) irq_work_sync(&event->pending_disable_irq); unaccount_event(event); - - security_perf_event_free(event); + mediated_pmu_unaccount_event(event); if (event->rb) { /* @@ -6180,6 +6337,138 @@ u64 perf_event_pause(struct perf_event *event, bool reset) } EXPORT_SYMBOL_GPL(perf_event_pause); +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static atomic_t nr_include_guest_events __read_mostly; + +static atomic_t nr_mediated_pmu_vms __read_mostly; +static DEFINE_MUTEX(perf_mediated_pmu_mutex); + +/* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */ +static inline bool is_include_guest_event(struct perf_event *event) +{ + if ((event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) && + !event->attr.exclude_guest) + return true; + + return false; +} + +static int mediated_pmu_account_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return 0; + + if (atomic_inc_not_zero(&nr_include_guest_events)) + return 0; + + guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_read(&nr_mediated_pmu_vms)) + return -EOPNOTSUPP; + + atomic_inc(&nr_include_guest_events); + return 0; +} + +static void mediated_pmu_unaccount_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return; + + if (WARN_ON_ONCE(!atomic_read(&nr_include_guest_events))) + return; + + atomic_dec(&nr_include_guest_events); +} + +/* + * Currently invoked at VM creation to + * - Check whether there are existing !exclude_guest events of PMU with + * PERF_PMU_CAP_MEDIATED_VPMU + * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on + * PMUs with PERF_PMU_CAP_MEDIATED_VPMU + * + * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf + * still owns all the PMU resources. + */ +int perf_create_mediated_pmu(void) +{ + if (atomic_inc_not_zero(&nr_mediated_pmu_vms)) + return 0; + + guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_read(&nr_include_guest_events)) + return -EBUSY; + + atomic_inc(&nr_mediated_pmu_vms); + return 0; +} +EXPORT_SYMBOL_FOR_KVM(perf_create_mediated_pmu); + +void perf_release_mediated_pmu(void) +{ + if (WARN_ON_ONCE(!atomic_read(&nr_mediated_pmu_vms))) + return; + + atomic_dec(&nr_mediated_pmu_vms); +} +EXPORT_SYMBOL_FOR_KVM(perf_release_mediated_pmu); + +/* When loading a guest's mediated PMU, schedule out all exclude_guest events. 
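perf_create_mediated_pmu() and mediated_pmu_account_event() above form a two-counter exclusion scheme: once either nr_include_guest_events or nr_mediated_pmu_vms is non-zero, the other side can only fail, and the mutex is taken only on the 0 -> 1 transition. A small user-space model with C11 atomics and a pthread mutex is shown below (compile with -pthread); the names nr_guest_events, nr_vms, account_guest_event() and create_mediated_vm() are illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int nr_guest_events;   /* models nr_include_guest_events */
static atomic_int nr_vms;            /* models nr_mediated_pmu_vms     */
static pthread_mutex_t excl = PTHREAD_MUTEX_INITIALIZER;

/* fast path: if our side is already non-zero, the other side is locked out */
static int inc_not_zero(atomic_int *v)
{
        int cur = atomic_load(v);

        while (cur && !atomic_compare_exchange_weak(v, &cur, cur + 1))
                ;
        return cur != 0;
}

static int account_guest_event(void)
{
        if (inc_not_zero(&nr_guest_events))
                return 0;
        pthread_mutex_lock(&excl);
        int ret = atomic_load(&nr_vms) ? -1
                  : (atomic_fetch_add(&nr_guest_events, 1), 0);
        pthread_mutex_unlock(&excl);
        return ret;
}

static int create_mediated_vm(void)
{
        if (inc_not_zero(&nr_vms))
                return 0;
        pthread_mutex_lock(&excl);
        int ret = atomic_load(&nr_guest_events) ? -1
                  : (atomic_fetch_add(&nr_vms, 1), 0);
        pthread_mutex_unlock(&excl);
        return ret;
}

int main(void)
{
        printf("event: %d\n", account_guest_event());   /* 0: succeeds        */
        printf("vm   : %d\n", create_mediated_vm());    /* -1: mutual exclusion */
        return 0;
}

Dropping either counter back to zero (the unaccount/release paths above) re-opens the other side.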
*/ +void perf_load_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lockdep_assert_irqs_disabled(); + + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + + if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded))) + return; + + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST); + if (cpuctx->task_ctx) { + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST); + } + + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); + if (cpuctx->task_ctx) + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); + + __this_cpu_write(guest_ctx_loaded, true); +} +EXPORT_SYMBOL_GPL(perf_load_guest_context); + +void perf_put_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lockdep_assert_irqs_disabled(); + + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + + if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded))) + return; + + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); + if (cpuctx->task_ctx) + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); + + perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST); + + if (cpuctx->task_ctx) + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); + + __this_cpu_write(guest_ctx_loaded, false); +} +EXPORT_SYMBOL_GPL(perf_put_guest_context); +#else +static int mediated_pmu_account_event(struct perf_event *event) { return 0; } +static void mediated_pmu_unaccount_event(struct perf_event *event) {} +#endif + /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block @@ -6548,22 +6837,22 @@ void perf_event_update_userpage(struct perf_event *event) goto unlock; /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. + * Disable preemption to guarantee consistent time stamps are stored to + * the user page. + */ + preempt_disable(); + + /* + * Compute total_time_enabled, total_time_running based on snapshot + * values taken when the event was last scheduled in. * - * we cannot simply called update_context_time() - * because of locking issue as we can be called in - * NMI context + * We cannot simply call update_context_time() because doing so would + * lead to deadlock when called from NMI context. */ calc_timer_values(event, &now, &enabled, &running); userpg = rb->user_page; - /* - * Disable preemption to guarantee consistent time stamps are stored to - * the user page. - */ - preempt_disable(); + ++userpg->lock; barrier(); userpg->index = perf_event_index(event); @@ -6997,6 +7286,15 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (data_page_nr(event->rb) != nr_pages) return -EINVAL; + /* + * If this event doesn't have mmap_count, we're attempting to + * create an alias of another event's mmap(); this would mean + * both events will end up scribbling the same user_page; + * which makes no sense. 
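perf_event_update_userpage() above now disables preemption before the time values are computed, so the ++userpg->lock / barrier() bracket publishes a consistent snapshot to the mmap()ed page. That bracket is a seqcount-style protocol; the sketch below is a simplified user-space illustration of the idea (struct snapshot and its fields are invented, and real cross-CPU use needs proper memory barriers around the payload, which this sketch glosses over).

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct snapshot {
        atomic_uint seq;          /* plays the role of userpg->lock */
        uint64_t time_enabled;
        uint64_t time_running;
};

/* writer: odd seq while updating, even seq once the payload is stable */
static void writer_update(struct snapshot *s, uint64_t en, uint64_t run)
{
        atomic_fetch_add(&s->seq, 1);
        s->time_enabled = en;
        s->time_running = run;
        atomic_fetch_add(&s->seq, 1);
}

/* reader: retry until the same even value is seen before and after */
static void reader_read(struct snapshot *s, uint64_t *en, uint64_t *run)
{
        unsigned int a, b;

        do {
                a = atomic_load(&s->seq);
                *en = s->time_enabled;
                *run = s->time_running;
                b = atomic_load(&s->seq);
        } while (a != b || (a & 1));
}

int main(void)
{
        struct snapshot s = { .seq = 0 };
        uint64_t en, run;

        writer_update(&s, 1000, 900);
        reader_read(&s, &en, &run);
        printf("enabled=%llu running=%llu seq=%u\n",
               (unsigned long long)en, (unsigned long long)run,
               atomic_load(&s.seq));
        return 0;
}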
+ */ + if (!refcount_read(&event->mmap_count)) + return -EBUSY; + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* * Success -- managed to mmap() the same buffer @@ -7374,6 +7672,7 @@ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { @@ -7388,6 +7687,10 @@ void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); + + if (cbs->handle_mediated_pmi) + static_call_update(__perf_guest_handle_mediated_pmi, + cbs->handle_mediated_pmi); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); @@ -7399,8 +7702,8 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); - static_call_update(__perf_guest_handle_intel_pt_intr, - (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_mediated_pmi, (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); @@ -7451,7 +7754,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + } else if (is_user_task(current)) { perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; @@ -7860,13 +8163,11 @@ static void perf_output_read(struct perf_output_handle *handle, u64 read_format = event->attr.read_format; /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. + * Compute total_time_enabled, total_time_running based on snapshot + * values taken when the event was last scheduled in. * - * we cannot simply called update_context_time() - * because of locking issue as we are called in - * NMI context + * We cannot simply call update_context_time() because doing so would + * lead to deadlock when called from NMI context. */ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); @@ -8091,7 +8392,7 @@ static u64 perf_virt_to_phys(u64 virt) * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ - if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + if (is_user_task(current)) { struct page *p; pagefault_disable(); @@ -8206,7 +8507,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; bool user = !event->attr.exclude_callchain_user && - !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); + is_user_task(current); /* Disallow cross-task user callchains. 
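Earlier in this hunk the guest callbacks grow a handle_mediated_pmi entry, and like the existing ones it is wired to a return-0 stub while no provider is registered, so callers never need a NULL check. The user-space approximation below uses plain function pointers; struct guest_cbs here is a toy, not the kernel's perf_guest_info_callbacks, and real static calls are patched call sites rather than pointer loads.

#include <stdio.h>

struct guest_cbs {
        unsigned int  (*state)(void);
        unsigned long (*get_ip)(void);
        unsigned int  (*handle_mediated_pmi)(void);
};

static unsigned int  ret0_u32(void)   { return 0; }
static unsigned long ret0_ulong(void) { return 0; }

/* every slot points at a return-0 stub until a provider registers */
static struct guest_cbs active = {
        .state               = ret0_u32,
        .get_ip              = ret0_ulong,
        .handle_mediated_pmi = ret0_u32,
};

static void register_cbs(const struct guest_cbs *cbs)
{
        if (cbs->state)
                active.state = cbs->state;
        if (cbs->get_ip)
                active.get_ip = cbs->get_ip;
        if (cbs->handle_mediated_pmi)
                active.handle_mediated_pmi = cbs->handle_mediated_pmi;
}

static void unregister_cbs(void)
{
        active = (struct guest_cbs){ ret0_u32, ret0_ulong, ret0_u32 };
}

static unsigned int kvm_state(void) { return 3; }   /* pretend provider */

int main(void)
{
        printf("before: state=%u\n", active.state());
        register_cbs(&(struct guest_cbs){ .state = kvm_state });
        printf("after : state=%u\n", active.state());
        unregister_cbs();
        printf("unreg : state=%u\n", active.state());
        return 0;
}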
*/ bool crosstask = event->ctx->task && event->ctx->task != current; bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && @@ -12034,7 +12335,7 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { event->hw.state = 0; - local64_set(&event->hw.prev_count, event->ctx->time); + local64_set(&event->hw.prev_count, event->ctx->time.time); perf_swevent_start_hrtimer(event); } @@ -12043,7 +12344,7 @@ static void task_clock_event_stop(struct perf_event *event, int flags) event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) - task_clock_event_update(event, event->ctx->time); + task_clock_event_update(event, event->ctx->time.time); } static int task_clock_event_add(struct perf_event *event, int flags) @@ -12063,8 +12364,8 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - u64 time = event->ctx->time + delta; + u64 delta = now - event->ctx->time.stamp; + u64 time = event->ctx->time.time + delta; task_clock_event_update(event, time); } @@ -13146,6 +13447,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) return ERR_PTR(err); + err = mediated_pmu_account_event(event); + if (err) + return ERR_PTR(err); + /* symmetric to unaccount_event() in _free_event() */ account_event(event); @@ -14285,8 +14590,11 @@ void perf_event_exit_task(struct task_struct *task) /* * Detach the perf_ctx_data for the system-wide event. + * + * Done without holding global_ctx_data_rwsem; typically + * attach_global_ctx_data() will skip over this task, but otherwise + * attach_task_ctx_data() will observe PF_EXITING. */ - guard(percpu_read)(&global_ctx_data_rwsem); detach_task_ctx_data(task); } @@ -14789,7 +15097,8 @@ static void perf_event_exit_cpu_context(int cpu) ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + if (ctx->nr_events) + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d546d32390a8..424ef2235b07 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -179,16 +179,16 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) { - void *kaddr = kmap_atomic(page); + void *kaddr = kmap_local_page(page); memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, @@ -323,7 +323,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) return ret == 0 ? 
-EBUSY : ret; } - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); ptr = kaddr + (vaddr & ~PAGE_MASK); if (unlikely(*ptr + d < 0)) { @@ -336,7 +336,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) *ptr += d; ret = 0; out: - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_page(page); return ret; } @@ -1138,7 +1138,7 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) bool ret = false; down_read(&uprobe->consumer_rwsem); - list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + list_for_each_entry(uc, &uprobe->consumers, cons_node) { ret = consumer_filter(uc, mm); if (ret) break; @@ -1694,6 +1694,12 @@ static const struct vm_special_mapping xol_mapping = { .mremap = xol_mremap, }; +unsigned long __weak arch_uprobe_get_xol_area(void) +{ + /* Try to map as high as possible, this is only a hint. */ + return get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); +} + /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { @@ -1709,9 +1715,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } if (!area->vaddr) { - /* Try to map as high as possible, this is only a hint. */ - area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, - PAGE_SIZE, 0, 0); + area->vaddr = arch_uprobe_get_xol_area(); if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; diff --git a/kernel/fork.c b/kernel/fork.c index b1f3915d5f8e..e832da9d15a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,6 +97,7 @@ #include <linux/kasan.h> #include <linux/scs.h> #include <linux/io_uring.h> +#include <linux/io_uring_types.h> #include <linux/bpf.h> #include <linux/stackprotector.h> #include <linux/user_events.h> @@ -1356,7 +1357,7 @@ struct file *get_task_exe_file(struct task_struct *task) * @task: The task. * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning - * this kernel workthread has transiently adopted a user mm with use_mm, + * this kernel workthread has transiently adopted a user mm with kthread_use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() * after use. Typically used by /proc and ptrace. @@ -1828,9 +1829,6 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; - p->trc_reader_special.s = 0; - INIT_LIST_HEAD(&p->trc_holdout_list); - INIT_LIST_HEAD(&p->trc_blkd_node); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } @@ -2071,7 +2069,7 @@ __latent_entropy struct task_struct *copy_process( p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* - * Clear TID on mm_release()? + * TID is cleared in mm_release() when the task exits */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? 
args->child_tid : NULL; @@ -2129,6 +2127,10 @@ __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_IO_URING p->io_uring = NULL; + retval = io_uring_fork(p); + if (unlikely(retval)) + goto bad_fork_cleanup_delayacct; + retval = -EAGAIN; #endif p->default_timer_slack_ns = current->timer_slack_ns; @@ -2525,6 +2527,7 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); #endif bad_fork_cleanup_delayacct: + io_uring_free(p); delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1b4254d19a73..05cba4e16dad 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -92,9 +92,6 @@ config GENERIC_MSI_IRQ config IRQ_MSI_IOMMU bool -config IRQ_TIMINGS - bool - config GENERIC_IRQ_MATRIX_ALLOCATOR bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 6ab3a4055667..86a2e5ae08f9 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,10 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o kexec.o -obj-$(CONFIG_IRQ_TIMINGS) += timings.o -ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y) - CFLAGS_timings.o += -DDEBUG -endif obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 678f094d261a..6147a07d0127 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -974,7 +974,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_state_set_disabled(desc); if (is_chained) { desc->action = NULL; - WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc))); + irq_chip_pm_put(irq_desc_get_irq_data(desc)); } desc->depth = 1; } @@ -1122,7 +1122,7 @@ void irq_cpu_offline(void) } #endif -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY #ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS /** @@ -1194,6 +1194,15 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); #endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */ +#ifdef CONFIG_SMP +void irq_chip_pre_redirect_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_pre_redirect(data); +} +EXPORT_SYMBOL_GPL(irq_chip_pre_redirect_parent); +#endif + /** * irq_chip_set_parent_state - set the state of a parent interrupt. * @@ -1476,6 +1485,19 @@ void irq_chip_release_resources_parent(struct irq_data *data) data->chip->irq_release_resources(data); } EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); +#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ + +#ifdef CONFIG_SMP +int irq_chip_redirect_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force) +{ + struct irq_redirect *redir = &irq_data_to_desc(data)->redirect; + + WRITE_ONCE(redir->target_cpu, cpumask_first(dest)); + irq_data_update_effective_affinity(data, dest); + + return IRQ_SET_MASK_OK_DONE; +} +EXPORT_SYMBOL_GPL(irq_chip_redirect_set_affinity); #endif /** @@ -1530,20 +1552,20 @@ int irq_chip_pm_get(struct irq_data *data) } /** - * irq_chip_pm_put - Disable power for an IRQ chip + * irq_chip_pm_put - Drop a PM reference on an IRQ chip * @data: Pointer to interrupt specific data * - * Disable the power to the IRQ chip referenced by the interrupt data - * structure, belongs. Note that power will only be disabled, once this - * function has been called for all IRQs that have called irq_chip_pm_get(). 
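The rewritten irq_chip_pm_put() documentation in this hunk keeps the same reference-counting contract: power can only go away once every irq_chip_pm_get() has been balanced, and it may not go away at all. Below is a toy model of that contract; struct pm_dev and the printfs are illustrative, the real code defers to pm_runtime_get_sync()/pm_runtime_put().

#include <stdio.h>

struct pm_dev {
        const char *name;
        int usage;              /* models the runtime-PM usage count */
};

static void pm_get(struct pm_dev *d)
{
        if (d->usage++ == 0)
                printf("%s: powered on\n", d->name);
}

static void pm_put(struct pm_dev *d)
{
        if (d->usage > 0 && --d->usage == 0)
                printf("%s: powered off\n", d->name);
}

int main(void)
{
        struct pm_dev chip = { "irqchip", 0 };

        pm_get(&chip);          /* first IRQ requests the chip  */
        pm_get(&chip);          /* second IRQ shares it         */
        pm_put(&chip);          /* still one user: stays on     */
        pm_put(&chip);          /* last user gone: powered off  */
        return 0;
}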
+ * Drop a power management reference, acquired via irq_chip_pm_get(), on the IRQ + * chip represented by the interrupt data structure. + * + * Note that this will not disable power to the IRQ chip until this function + * has been called for all IRQs that have called irq_chip_pm_get() and it may + * not disable power at all (if user space prevents that, for example). */ -int irq_chip_pm_put(struct irq_data *data) +void irq_chip_pm_put(struct irq_data *data) { struct device *dev = irq_get_pm_device(data); - int retval = 0; - - if (IS_ENABLED(CONFIG_PM) && dev) - retval = pm_runtime_put(dev); - return (retval < 0) ? retval : 0; + if (dev) + pm_runtime_put(dev); } diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 755346ea9819..cd5689e383b0 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -177,9 +177,11 @@ void irq_migrate_all_off_this_cpu(void) bool affinity_broken; desc = irq_to_desc(irq); - scoped_guard(raw_spinlock, &desc->lock) + scoped_guard(raw_spinlock, &desc->lock) { affinity_broken = migrate_one_irq(desc); - + if (affinity_broken && desc->affinity_notify) + irq_affinity_schedule_notify_work(desc); + } if (affinity_broken) { pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n", irq, smp_processor_id()); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 786f5570a640..b7d52821837b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -188,8 +188,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) unsigned int irq = desc->irq_data.irq; struct irqaction *action; - record_irq_time(desc); - for_each_action_of_desc(desc, action) { irqreturn_t res; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 0164ca48da59..9412e57056f5 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -135,6 +135,7 @@ extern bool irq_can_set_affinity_usr(unsigned int irq); extern int irq_do_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force); +extern void irq_affinity_schedule_notify_work(struct irq_desc *desc); #ifdef CONFIG_SMP extern int irq_setup_affinity(struct irq_desc *desc); @@ -142,7 +143,6 @@ extern int irq_setup_affinity(struct irq_desc *desc); static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; } #endif - #define for_each_action_of_desc(desc, act) \ for (act = desc->action; act; act = act->next) @@ -288,116 +288,6 @@ static inline void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } #endif -#ifdef CONFIG_IRQ_TIMINGS - -#define IRQ_TIMINGS_SHIFT 5 -#define IRQ_TIMINGS_SIZE (1 << IRQ_TIMINGS_SHIFT) -#define IRQ_TIMINGS_MASK (IRQ_TIMINGS_SIZE - 1) - -/** - * struct irq_timings - irq timings storing structure - * @values: a circular buffer of u64 encoded <timestamp,irq> values - * @count: the number of elements in the array - */ -struct irq_timings { - u64 values[IRQ_TIMINGS_SIZE]; - int count; -}; - -DECLARE_PER_CPU(struct irq_timings, irq_timings); - -extern void irq_timings_free(int irq); -extern int irq_timings_alloc(int irq); - -static inline void irq_remove_timings(struct irq_desc *desc) -{ - desc->istate &= ~IRQS_TIMINGS; - - irq_timings_free(irq_desc_get_irq(desc)); -} - -static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act) -{ - int irq = irq_desc_get_irq(desc); - int ret; - - /* - * We don't need the measurement because the idle code already - * knows the next expiry event. 
- */ - if (act->flags & __IRQF_TIMER) - return; - - /* - * In case the timing allocation fails, we just want to warn, - * not fail, so letting the system boot anyway. - */ - ret = irq_timings_alloc(irq); - if (ret) { - pr_warn("Failed to allocate irq timing stats for irq%d (%d)", - irq, ret); - return; - } - - desc->istate |= IRQS_TIMINGS; -} - -extern void irq_timings_enable(void); -extern void irq_timings_disable(void); - -DECLARE_STATIC_KEY_FALSE(irq_timing_enabled); - -/* - * The interrupt number and the timestamp are encoded into a single - * u64 variable to optimize the size. - * 48 bit time stamp and 16 bit IRQ number is way sufficient. - * Who cares an IRQ after 78 hours of idle time? - */ -static inline u64 irq_timing_encode(u64 timestamp, int irq) -{ - return (timestamp << 16) | irq; -} - -static inline int irq_timing_decode(u64 value, u64 *timestamp) -{ - *timestamp = value >> 16; - return value & U16_MAX; -} - -static __always_inline void irq_timings_push(u64 ts, int irq) -{ - struct irq_timings *timings = this_cpu_ptr(&irq_timings); - - timings->values[timings->count & IRQ_TIMINGS_MASK] = - irq_timing_encode(ts, irq); - - timings->count++; -} - -/* - * The function record_irq_time is only called in one place in the - * interrupts handler. We want this function always inline so the code - * inside is embedded in the function and the static key branching - * code can act at the higher level. Without the explicit - * __always_inline we can end up with a function call and a small - * overhead in the hotpath for nothing. - */ -static __always_inline void record_irq_time(struct irq_desc *desc) -{ - if (!static_branch_likely(&irq_timing_enabled)) - return; - - if (desc->istate & IRQS_TIMINGS) - irq_timings_push(local_clock(), irq_desc_get_irq(desc)); -} -#else -static inline void irq_remove_timings(struct irq_desc *desc) {} -static inline void irq_setup_timings(struct irq_desc *desc, - struct irqaction *act) {}; -static inline void record_irq_time(struct irq_desc *desc) {} -#endif /* CONFIG_IRQ_TIMINGS */ - - #ifdef CONFIG_GENERIC_IRQ_CHIP void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index f8e4e13dbe33..022b3741dd7a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -78,8 +78,12 @@ static int alloc_masks(struct irq_desc *desc, int node) return 0; } -static void desc_smp_init(struct irq_desc *desc, int node, - const struct cpumask *affinity) +static void irq_redirect_work(struct irq_work *work) +{ + handle_irq_desc(container_of(work, struct irq_desc, redirect.work)); +} + +static void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { if (!affinity) affinity = irq_default_affinity; @@ -91,6 +95,7 @@ static void desc_smp_init(struct irq_desc *desc, int node, #ifdef CONFIG_NUMA desc->irq_common_data.node = node; #endif + desc->redirect.work = IRQ_WORK_INIT_HARD(irq_redirect_work); } static void free_masks(struct irq_desc *desc) @@ -115,8 +120,6 @@ static inline void free_masks(struct irq_desc *desc) { } static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, const struct cpumask *affinity, struct module *owner) { - int cpu; - desc->irq_common_data.handler_data = NULL; desc->irq_common_data.msi_desc = NULL; @@ -134,8 +137,6 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->tot_count = 0; desc->name = NULL; desc->owner = owner; - for_each_possible_cpu(cpu) - 
*per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { }; desc_smp_init(desc, node, affinity); } @@ -621,9 +622,14 @@ EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); + int cpu; scoped_guard(raw_spinlock_irqsave, &desc->lock) desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); + + for_each_possible_cpu(cpu) + *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { }; + delete_irq_desc(irq); } @@ -766,6 +772,83 @@ int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq) WARN_ON_ONCE(!in_nmi()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } + +#ifdef CONFIG_SMP +static bool demux_redirect_remote(struct irq_desc *desc) +{ + guard(raw_spinlock)(&desc->lock); + const struct cpumask *m = irq_data_get_effective_affinity_mask(&desc->irq_data); + unsigned int target_cpu = READ_ONCE(desc->redirect.target_cpu); + + if (desc->irq_data.chip->irq_pre_redirect) + desc->irq_data.chip->irq_pre_redirect(&desc->irq_data); + + /* + * If the interrupt handler is already running on a CPU that's included + * in the interrupt's affinity mask, redirection is not necessary. + */ + if (cpumask_test_cpu(smp_processor_id(), m)) + return false; + + /* + * The desc->action check protects against IRQ shutdown: __free_irq() sets + * desc->action to NULL while holding desc->lock, which we also hold. + * + * Calling irq_work_queue_on() here is safe w.r.t. CPU unplugging: + * - takedown_cpu() schedules multi_cpu_stop() on all active CPUs, + * including the one that's taken down. + * - multi_cpu_stop() acts like a barrier, which means all active + * CPUs go through MULTI_STOP_DISABLE_IRQ and disable hard IRQs + * *before* the dying CPU runs take_cpu_down() in MULTI_STOP_RUN. + * - Hard IRQs are re-enabled at the end of multi_cpu_stop(), *after* + * the dying CPU has run take_cpu_down() in MULTI_STOP_RUN. + * - Since we run in hard IRQ context, we run either before or after + * take_cpu_down() but never concurrently. + * - If we run before take_cpu_down(), the dying CPU hasn't been marked + * offline yet (it's marked via take_cpu_down() -> __cpu_disable()), + * so the WARN in irq_work_queue_on() can't occur. + * - Furthermore, the work item we queue will be flushed later via + * take_cpu_down() -> cpuhp_invoke_callback_range_nofail() -> + * smpcfd_dying_cpu() -> irq_work_run(). + * - If we run after take_cpu_down(), target_cpu has been already + * updated via take_cpu_down() -> __cpu_disable(), which eventually + * calls irq_do_set_affinity() during IRQ migration. So, target_cpu + * no longer points to the dying CPU in this case. + */ + if (desc->action) + irq_work_queue_on(&desc->redirect.work, target_cpu); + + return true; +} +#else /* CONFIG_SMP */ +static bool demux_redirect_remote(struct irq_desc *desc) +{ + return false; +} +#endif + +/** + * generic_handle_demux_domain_irq - Invoke the handler for a hardware interrupt + * of a demultiplexing domain. 
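demux_redirect_remote() above boils down to one decision: if the CPU taking the demultiplexed interrupt is already inside the effective affinity mask, handle it locally; otherwise, as long as the IRQ still has an action, bounce it to the chosen target CPU via irq_work. A toy user-space model of that decision follows; struct toy_desc, queue_work_on() and the bitmask are illustrative stand-ins.

#include <stdbool.h>
#include <stdio.h>

struct toy_desc {
        unsigned long affinity_mask;    /* bit n set => CPU n allowed   */
        int target_cpu;                 /* models redirect.target_cpu   */
        bool has_action;                /* models desc->action != NULL  */
};

static void queue_work_on(int cpu) { printf("redirected to CPU%d\n", cpu); }
static void handle_locally(void)   { printf("handled locally\n"); }

static bool redirect_if_remote(struct toy_desc *d, int this_cpu)
{
        if (d->affinity_mask & (1UL << this_cpu))
                return false;           /* no redirection necessary */
        if (d->has_action)              /* skip if the IRQ was freed */
                queue_work_on(d->target_cpu);
        return true;
}

int main(void)
{
        struct toy_desc d = { .affinity_mask = 1UL << 2, .target_cpu = 2,
                              .has_action = true };

        if (!redirect_if_remote(&d, 5))   /* CPU5 not in mask */
                handle_locally();
        if (!redirect_if_remote(&d, 2))   /* CPU2 is in mask  */
                handle_locally();
        return 0;
}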
+ * @domain: The domain where to perform the lookup + * @hwirq: The hardware interrupt number to convert to a logical one + * + * Returns: True on success, or false if lookup has failed + */ +bool generic_handle_demux_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq) +{ + struct irq_desc *desc = irq_resolve_mapping(domain, hwirq); + + if (unlikely(!desc)) + return false; + + if (demux_redirect_remote(desc)) + return true; + + return !handle_irq_desc(desc); +} +EXPORT_SYMBOL_GPL(generic_handle_demux_domain_irq); + #endif /* Dynamic interrupt handling */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 2652c4cfd877..c2258b133939 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -33,6 +33,7 @@ static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq struct irqchip_fwid { struct fwnode_handle fwnode; + struct fwnode_handle *parent; unsigned int type; char *name; phys_addr_t *pa; @@ -53,8 +54,16 @@ static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode) return fwid->name; } +static struct fwnode_handle *irqchip_fwnode_get_parent(const struct fwnode_handle *fwnode) +{ + struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + + return fwid->parent; +} + const struct fwnode_operations irqchip_fwnode_ops = { .get_name = irqchip_fwnode_get_name, + .get_parent = irqchip_fwnode_get_parent, }; EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); @@ -65,6 +74,7 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); * @id: Optional user provided id if name != NULL * @name: Optional user provided domain name * @pa: Optional user-provided physical address + * @parent: Optional parent fwnode_handle * * Allocate a struct irqchip_fwid, and return a pointer to the embedded * fwnode_handle (or NULL on failure). 
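The irqdomain change here simply lets a software fwnode report an optional parent through a new get_parent op, next to the existing get_name. A minimal stand-alone model of that ops-table shape is below; all toy_* names are invented.

#include <stdio.h>

struct toy_fwnode;

struct toy_fwnode_ops {
        const char *(*get_name)(const struct toy_fwnode *);
        const struct toy_fwnode *(*get_parent)(const struct toy_fwnode *);
};

struct toy_fwnode {
        const struct toy_fwnode_ops *ops;
        const char *name;
        const struct toy_fwnode *parent;
};

static const char *toy_get_name(const struct toy_fwnode *n) { return n->name; }
static const struct toy_fwnode *toy_get_parent(const struct toy_fwnode *n)
{
        return n->parent;
}

static const struct toy_fwnode_ops toy_ops = {
        .get_name   = toy_get_name,
        .get_parent = toy_get_parent,
};

int main(void)
{
        struct toy_fwnode root  = { &toy_ops, "root-irqchip",  NULL  };
        struct toy_fwnode child = { &toy_ops, "child-irqchip", &root };
        const struct toy_fwnode *p = child.ops->get_parent(&child);

        printf("%s -> parent %s\n", child.ops->get_name(&child),
               p ? p->ops->get_name(p) : "(none)");
        return 0;
}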
@@ -76,7 +86,8 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); */ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, const char *name, - phys_addr_t *pa) + phys_addr_t *pa, + struct fwnode_handle *parent) { struct irqchip_fwid *fwid; char *n; @@ -104,6 +115,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, fwid->type = type; fwid->name = n; fwid->pa = pa; + fwid->parent = parent; fwnode_init(&fwid->fwnode, &irqchip_fwnode_ops); return &fwid->fwnode; } @@ -187,7 +199,7 @@ static int irq_domain_set_name(struct irq_domain *domain, const struct irq_domai const struct fwnode_handle *fwnode = info->fwnode; if (is_fwnode_irqchip(fwnode)) { - struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + const struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); /* * The name_suffix is only intended to be used to avoid a name @@ -1901,6 +1913,7 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) irq_domain_free_irq_data(virq, nr_irqs); irq_free_descs(virq, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs); static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 349ae7979da0..cded3d960eb7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -35,6 +35,16 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif +#ifdef CONFIG_SMP +static inline void synchronize_irqwork(struct irq_desc *desc) +{ + /* Synchronize pending or on the fly redirect work */ + irq_work_sync(&desc->redirect.work); +} +#else +static inline void synchronize_irqwork(struct irq_desc *desc) { } +#endif + static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) @@ -107,7 +117,9 @@ EXPORT_SYMBOL(synchronize_hardirq); static void __synchronize_irq(struct irq_desc *desc) { + synchronize_irqwork(desc); __synchronize_hardirq(desc, true); + /* * We made sure that no hardirq handler is running. Now verify that no * threaded handlers are active. 
@@ -217,8 +229,7 @@ static inline void irq_validate_effective_affinity(struct irq_data *data) { } static DEFINE_PER_CPU(struct cpumask, __tmp_mask); -int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) +int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask); struct irq_desc *desc = irq_data_to_desc(data); @@ -347,6 +358,21 @@ static bool irq_set_affinity_deactivated(struct irq_data *data, return true; } +/** + * irq_affinity_schedule_notify_work - Schedule work to notify about affinity change + * @desc: Interrupt descriptor whose affinity changed + */ +void irq_affinity_schedule_notify_work(struct irq_desc *desc) +{ + lockdep_assert_held(&desc->lock); + + kref_get(&desc->affinity_notify->kref); + if (!schedule_work(&desc->affinity_notify->work)) { + /* Work was already scheduled, drop our extra ref */ + kref_put(&desc->affinity_notify->kref, desc->affinity_notify->release); + } +} + int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -367,14 +393,9 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, irq_copy_pending(desc, mask); } - if (desc->affinity_notify) { - kref_get(&desc->affinity_notify->kref); - if (!schedule_work(&desc->affinity_notify->work)) { - /* Work was already scheduled, drop our extra ref */ - kref_put(&desc->affinity_notify->kref, - desc->affinity_notify->release); - } - } + if (desc->affinity_notify) + irq_affinity_schedule_notify_work(desc); + irqd_set(data, IRQD_AFFINITY_SET); return ret; @@ -1474,6 +1495,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->flags |= irqd_get_trigger_type(&desc->irq_data); /* + * IRQF_ONESHOT means the interrupt source in the IRQ chip will be + * masked until the threaded handled is done. If there is no thread + * handler then it makes no sense to have IRQF_ONESHOT. + */ + WARN_ON_ONCE(new->flags & IRQF_ONESHOT && !new->thread_fn); + + /* * Check whether the interrupt nests into another interrupt * thread. */ @@ -1778,8 +1806,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); - irq_setup_timings(desc, new); - wake_up_and_wait_for_irq_thread_ready(desc, new); wake_up_and_wait_for_irq_thread_ready(desc, new->secondary); @@ -1950,7 +1976,6 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) irq_release_resources(desc); chip_bus_sync_unlock(desc); - irq_remove_timings(desc); } mutex_unlock(&desc->request_mutex); @@ -2451,36 +2476,6 @@ void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) kfree(__free_percpu_irq(irq, dev_id)); } -/** - * setup_percpu_irq - setup a per-cpu interrupt - * @irq: Interrupt line to setup - * @act: irqaction for the interrupt - * - * Used to statically setup per-cpu interrupts in the early boot process. 
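irq_affinity_schedule_notify_work(), factored out earlier in this hunk, takes a reference for the notification work and drops it again immediately when schedule_work() reports the work was already pending, so exactly one reference is held per queued execution. The toy model below shows that get/schedule/put-on-failure pattern; struct notify and the helpers are illustrative, not kref or the workqueue API.

#include <stdbool.h>
#include <stdio.h>

struct notify {
        int  refs;
        bool queued;
};

static void get_ref(struct notify *n) { n->refs++; }
static void put_ref(struct notify *n)
{
        if (--n->refs == 0)
                printf("released\n");
}

static bool schedule_notify_work(struct notify *n)
{
        if (n->queued)
                return false;   /* already pending, like schedule_work() == 0 */
        n->queued = true;
        return true;
}

static void schedule_affinity_notify(struct notify *n)
{
        get_ref(n);
        if (!schedule_notify_work(n))
                put_ref(n);     /* work was already scheduled: drop extra ref */
}

int main(void)
{
        struct notify n = { .refs = 1, .queued = false };

        schedule_affinity_notify(&n);   /* queues, keeps a reference   */
        schedule_affinity_notify(&n);   /* already queued, ref dropped */
        printf("refs=%d queued=%d\n", n.refs, (int)n.queued);
        return 0;
}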
- */ -int setup_percpu_irq(unsigned int irq, struct irqaction *act) -{ - struct irq_desc *desc = irq_to_desc(irq); - int retval; - - if (!desc || !irq_settings_is_per_cpu_devid(desc)) - return -EINVAL; - - retval = irq_chip_pm_get(&desc->irq_data); - if (retval < 0) - return retval; - - if (!act->affinity) - act->affinity = cpu_online_mask; - - retval = __setup_irq(irq, desc, act); - - if (retval) - irq_chip_pm_put(&desc->irq_data); - - return retval; -} - static struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long flags, const char *devname, const cpumask_t *affinity, @@ -2513,10 +2508,9 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f } /** - * __request_percpu_irq - allocate a percpu interrupt line + * request_percpu_irq_affinity - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. - * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device * @affinity: A cpumask describing the target CPUs for this interrupt * @dev_id: A percpu cookie passed back to the handler function @@ -2529,9 +2523,8 @@ struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long f * the handler gets called with the interrupted CPU's instance of * that variable. */ -int __request_percpu_irq(unsigned int irq, irq_handler_t handler, - unsigned long flags, const char *devname, - const cpumask_t *affinity, void __percpu *dev_id) +int request_percpu_irq_affinity(unsigned int irq, irq_handler_t handler, const char *devname, + const cpumask_t *affinity, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -2545,10 +2538,7 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; - if (flags && flags != IRQF_TIMER) - return -EINVAL; - - action = create_percpu_irqaction(handler, flags, devname, affinity, dev_id); + action = create_percpu_irqaction(handler, 0, devname, affinity, dev_id); if (!action) return -ENOMEM; @@ -2567,7 +2557,7 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } -EXPORT_SYMBOL_GPL(__request_percpu_irq); +EXPORT_SYMBOL_GPL(request_percpu_irq_affinity); /** * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 77258eafbf63..b0999a4f1f68 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -12,6 +12,7 @@ #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/mutex.h> +#include <linux/string.h> #include "internals.h" @@ -317,7 +318,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) if (!desc->dir || action->dir || !action->name || !name_unique(irq, action)) return; - snprintf(name, MAX_NAMELEN, "%s", action->name); + strscpy(name, action->name); /* create /proc/irq/1234/handler/ */ action->dir = proc_mkdir(name, desc->dir); diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c deleted file mode 100644 index 4b7315e99bd6..000000000000 --- a/kernel/irq/timings.c +++ /dev/null @@ -1,959 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> -#define pr_fmt(fmt) "irq_timings: " fmt - -#include <linux/kernel.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/static_key.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/idr.h> -#include <linux/irq.h> -#include 
<linux/math64.h> -#include <linux/log2.h> - -#include <trace/events/irq.h> - -#include "internals.h" - -DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); - -DEFINE_PER_CPU(struct irq_timings, irq_timings); - -static DEFINE_IDR(irqt_stats); - -void irq_timings_enable(void) -{ - static_branch_enable(&irq_timing_enabled); -} - -void irq_timings_disable(void) -{ - static_branch_disable(&irq_timing_enabled); -} - -/* - * The main goal of this algorithm is to predict the next interrupt - * occurrence on the current CPU. - * - * Currently, the interrupt timings are stored in a circular array - * buffer every time there is an interrupt, as a tuple: the interrupt - * number and the associated timestamp when the event occurred <irq, - * timestamp>. - * - * For every interrupt occurring in a short period of time, we can - * measure the elapsed time between the occurrences for the same - * interrupt and we end up with a suite of intervals. The experience - * showed the interrupts are often coming following a periodic - * pattern. - * - * The objective of the algorithm is to find out this periodic pattern - * in a fastest way and use its period to predict the next irq event. - * - * When the next interrupt event is requested, we are in the situation - * where the interrupts are disabled and the circular buffer - * containing the timings is filled with the events which happened - * after the previous next-interrupt-event request. - * - * At this point, we read the circular buffer and we fill the irq - * related statistics structure. After this step, the circular array - * containing the timings is empty because all the values are - * dispatched in their corresponding buffers. - * - * Now for each interrupt, we can predict the next event by using the - * suffix array, log interval and exponential moving average - * - * 1. Suffix array - * - * Suffix array is an array of all the suffixes of a string. It is - * widely used as a data structure for compression, text search, ... - * For instance for the word 'banana', the suffixes will be: 'banana' - * 'anana' 'nana' 'ana' 'na' 'a' - * - * Usually, the suffix array is sorted but for our purpose it is - * not necessary and won't provide any improvement in the context of - * the solved problem where we clearly define the boundaries of the - * search by a max period and min period. - * - * The suffix array will build a suite of intervals of different - * length and will look for the repetition of each suite. If the suite - * is repeating then we have the period because it is the length of - * the suite whatever its position in the buffer. - * - * 2. Log interval - * - * We saw the irq timings allow to compute the interval of the - * occurrences for a specific interrupt. We can reasonably assume the - * longer is the interval, the higher is the error for the next event - * and we can consider storing those interval values into an array - * where each slot in the array correspond to an interval at the power - * of 2 of the index. For example, index 12 will contain values - * between 2^11 and 2^12. - * - * At the end we have an array of values where at each index defines a - * [2^index - 1, 2 ^ index] interval values allowing to store a large - * number of values inside a small array. - * - * For example, if we have the value 1123, then we store it at - * ilog2(1123) = 10 index value. - * - * Storing those value at the specific index is done by computing an - * exponential moving average for this specific slot. For instance, - * for values 1800, 1123, 1453, ... 
fall under the same slot (10) and - * the exponential moving average is computed every time a new value - * is stored at this slot. - * - * 3. Exponential Moving Average - * - * The EMA is largely used to track a signal for stocks or as a low - * pass filter. The magic of the formula, is it is very simple and the - * reactivity of the average can be tuned with the factors called - * alpha. - * - * The higher the alphas are, the faster the average respond to the - * signal change. In our case, if a slot in the array is a big - * interval, we can have numbers with a big difference between - * them. The impact of those differences in the average computation - * can be tuned by changing the alpha value. - * - * - * -- The algorithm -- - * - * We saw the different processing above, now let's see how they are - * used together. - * - * For each interrupt: - * For each interval: - * Compute the index = ilog2(interval) - * Compute a new_ema(buffer[index], interval) - * Store the index in a circular buffer - * - * Compute the suffix array of the indexes - * - * For each suffix: - * If the suffix is reverse-found 3 times - * Return suffix - * - * Return Not found - * - * However we can not have endless suffix array to be build, it won't - * make sense and it will add an extra overhead, so we can restrict - * this to a maximum suffix length of 5 and a minimum suffix length of - * 2. The experience showed 5 is the majority of the maximum pattern - * period found for different devices. - * - * The result is a pattern finding less than 1us for an interrupt. - * - * Example based on real values: - * - * Example 1 : MMC write/read interrupt interval: - * - * 223947, 1240, 1384, 1386, 1386, - * 217416, 1236, 1384, 1386, 1387, - * 214719, 1241, 1386, 1387, 1384, - * 213696, 1234, 1384, 1386, 1388, - * 219904, 1240, 1385, 1389, 1385, - * 212240, 1240, 1386, 1386, 1386, - * 214415, 1236, 1384, 1386, 1387, - * 214276, 1234, 1384, 1388, ? - * - * For each element, apply ilog2(value) - * - * 15, 8, 8, 8, 8, - * 15, 8, 8, 8, 8, - * 15, 8, 8, 8, 8, - * 15, 8, 8, 8, 8, - * 15, 8, 8, 8, 8, - * 15, 8, 8, 8, 8, - * 15, 8, 8, 8, 8, - * 15, 8, 8, 8, ? - * - * Max period of 5, we take the last (max_period * 3) 15 elements as - * we can be confident if the pattern repeats itself three times it is - * a repeating pattern. - * - * 8, - * 15, 8, 8, 8, 8, - * 15, 8, 8, 8, 8, - * 15, 8, 8, 8, ? - * - * Suffixes are: - * - * 1) 8, 15, 8, 8, 8 <- max period - * 2) 8, 15, 8, 8 - * 3) 8, 15, 8 - * 4) 8, 15 <- min period - * - * From there we search the repeating pattern for each suffix. - * - * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 - * | | | | | | | | | | | | | | | - * 8, 15, 8, 8, 8 | | | | | | | | | | - * 8, 15, 8, 8, 8 | | | | | - * 8, 15, 8, 8, 8 - * - * When moving the suffix, we found exactly 3 matches. - * - * The first suffix with period 5 is repeating. 
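The bucketing described above is just the position of the interval's highest set bit: slot n collects intervals in [2^n, 2^(n+1)), so 1800, 1123 and 1453 all land in slot 10. A self-contained check of that, ignoring the microsecond conversion and PREDICTION_FACTOR scaling the removed code applied first:

#include <stdint.h>
#include <stdio.h>

/* slot index = ilog2(interval): index of the highest set bit */
static int ilog2_u64(uint64_t v)
{
        int log = -1;

        while (v) {
                v >>= 1;
                log++;
        }
        return log;
}

int main(void)
{
        uint64_t samples[] = { 1800, 1123, 1453 };

        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("interval %4llu -> slot %d\n",
                       (unsigned long long)samples[i],
                       ilog2_u64(samples[i]));
        return 0;
}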
- * - * The next event is (3 * max_period) % suffix_period - * - * In this example, the result 0, so the next event is suffix[0] => 8 - * - * However, 8 is the index in the array of exponential moving average - * which was calculated on the fly when storing the values, so the - * interval is ema[8] = 1366 - * - * - * Example 2: - * - * 4, 3, 5, 100, - * 3, 3, 5, 117, - * 4, 4, 5, 112, - * 4, 3, 4, 110, - * 3, 5, 3, 117, - * 4, 4, 5, 112, - * 4, 3, 4, 110, - * 3, 4, 5, 112, - * 4, 3, 4, 110 - * - * ilog2 - * - * 0, 0, 0, 4, - * 0, 0, 0, 4, - * 0, 0, 0, 4, - * 0, 0, 0, 4, - * 0, 0, 0, 4, - * 0, 0, 0, 4, - * 0, 0, 0, 4, - * 0, 0, 0, 4, - * 0, 0, 0, 4 - * - * Max period 5: - * 0, 0, 4, - * 0, 0, 0, 4, - * 0, 0, 0, 4, - * 0, 0, 0, 4 - * - * Suffixes: - * - * 1) 0, 0, 4, 0, 0 - * 2) 0, 0, 4, 0 - * 3) 0, 0, 4 - * 4) 0, 0 - * - * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 - * | | | | | | X - * 0, 0, 4, 0, 0, | X - * 0, 0 - * - * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 - * | | | | | | | | | | | | | | | - * 0, 0, 4, 0, | | | | | | | | | | | - * 0, 0, 4, 0, | | | | | | | - * 0, 0, 4, 0, | | | - * 0 0 4 - * - * Pattern is found 3 times, the remaining is 1 which results from - * (max_period * 3) % suffix_period. This value is the index in the - * suffix arrays. The suffix array for a period 4 has the value 4 - * at index 1. - */ -#define EMA_ALPHA_VAL 64 -#define EMA_ALPHA_SHIFT 7 - -#define PREDICTION_PERIOD_MIN 3 -#define PREDICTION_PERIOD_MAX 5 -#define PREDICTION_FACTOR 4 -#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ -#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ - -/* - * Number of elements in the circular buffer: If it happens it was - * flushed before, then the number of elements could be smaller than - * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is - * used as we wrapped. The index begins from zero when we did not - * wrap. That could be done in a nicer way with the proper circular - * array structure type but with the cost of extra computation in the - * interrupt handler hot path. We choose efficiency. - */ -#define for_each_irqts(i, irqts) \ - for (i = irqts->count < IRQ_TIMINGS_SIZE ? \ - 0 : irqts->count & IRQ_TIMINGS_MASK, \ - irqts->count = min(IRQ_TIMINGS_SIZE, \ - irqts->count); \ - irqts->count > 0; irqts->count--, \ - i = (i + 1) & IRQ_TIMINGS_MASK) - -struct irqt_stat { - u64 last_ts; - u64 ema_time[PREDICTION_BUFFER_SIZE]; - int timings[IRQ_TIMINGS_SIZE]; - int circ_timings[IRQ_TIMINGS_SIZE]; - int count; -}; - -/* - * Exponential moving average computation - */ -static u64 irq_timings_ema_new(u64 value, u64 ema_old) -{ - s64 diff; - - if (unlikely(!ema_old)) - return value; - - diff = (value - ema_old) * EMA_ALPHA_VAL; - /* - * We can use a s64 type variable to be added with the u64 - * ema_old variable as this one will never have its topmost - * bit set, it will be always smaller than 2^63 nanosec - * interrupt interval (292 years). - */ - return ema_old + (diff >> EMA_ALPHA_SHIFT); -} - -static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) -{ - int period; - - /* - * Move the beginning pointer to the end minus the max period x 3. - * We are at the point we can begin searching the pattern - */ - buffer = &buffer[len - (period_max * 3)]; - - /* Adjust the length to the maximum allowed period x 3 */ - len = period_max * 3; - - /* - * The buffer contains the suite of intervals, in a ilog2 - * basis, we are looking for a repetition. 
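The fixed-point average used per slot is the one shown in the removed irq_timings_ema_new(): new = old + alpha * (value - old), with alpha = EMA_ALPHA_VAL / 2^EMA_ALPHA_SHIFT = 64/128 = 0.5, and the first sample seeding the average. A stand-alone version that can be run against the MMC-style intervals from the example:

#include <stdint.h>
#include <stdio.h>

#define EMA_ALPHA_VAL   64
#define EMA_ALPHA_SHIFT 7

/* mirrors the removed helper: fixed-point EMA with alpha = 64/128 */
static uint64_t ema_new(uint64_t value, uint64_t ema_old)
{
        int64_t diff;

        if (!ema_old)
                return value;           /* first sample seeds the average */

        diff = (int64_t)(value - ema_old) * EMA_ALPHA_VAL;
        return ema_old + (diff >> EMA_ALPHA_SHIFT);
}

int main(void)
{
        uint64_t intervals[] = { 1240, 1384, 1386, 1386, 1236, 1384 };
        uint64_t ema = 0;

        for (unsigned int i = 0; i < sizeof(intervals) / sizeof(intervals[0]); i++) {
                ema = ema_new(intervals[i], ema);
                printf("sample %4llu -> ema %4llu\n",
                       (unsigned long long)intervals[i],
                       (unsigned long long)ema);
        }
        return 0;
}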
We point the - * beginning of the search three times the length of the - * period beginning at the end of the buffer. We do that for - * each suffix. - */ - for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) { - - /* - * The first comparison always succeed because the - * suffix is deduced from the first n-period bytes of - * the buffer and we compare the initial suffix with - * itself, so we can skip the first iteration. - */ - int idx = period; - size_t size = period; - - /* - * We look if the suite with period 'i' repeat - * itself. If it is truncated at the end, as it - * repeats we can use the period to find out the next - * element with the modulo. - */ - while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) { - - /* - * Move the index in a period basis - */ - idx += size; - - /* - * If this condition is reached, all previous - * memcmp were successful, so the period is - * found. - */ - if (idx == len) - return buffer[len % period]; - - /* - * If the remaining elements to compare are - * smaller than the period, readjust the size - * of the comparison for the last iteration. - */ - if (len - idx < period) - size = len - idx; - } - } - - return -1; -} - -static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) -{ - int index, i, period_max, count, start, min = INT_MAX; - - if ((now - irqs->last_ts) >= NSEC_PER_SEC) { - irqs->count = irqs->last_ts = 0; - return U64_MAX; - } - - /* - * As we want to find three times the repetition, we need a - * number of intervals greater or equal to three times the - * maximum period, otherwise we truncate the max period. - */ - period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ? - PREDICTION_PERIOD_MAX : irqs->count / 3; - - /* - * If we don't have enough irq timings for this prediction, - * just bail out. - */ - if (period_max <= PREDICTION_PERIOD_MIN) - return U64_MAX; - - /* - * 'count' will depends if the circular buffer wrapped or not - */ - count = irqs->count < IRQ_TIMINGS_SIZE ? - irqs->count : IRQ_TIMINGS_SIZE; - - start = irqs->count < IRQ_TIMINGS_SIZE ? - 0 : (irqs->count & IRQ_TIMINGS_MASK); - - /* - * Copy the content of the circular buffer into another buffer - * in order to linearize the buffer instead of dealing with - * wrapping indexes and shifted array which will be prone to - * error and extremely difficult to debug. - */ - for (i = 0; i < count; i++) { - int index = (start + i) & IRQ_TIMINGS_MASK; - - irqs->timings[i] = irqs->circ_timings[index]; - min = min_t(int, irqs->timings[i], min); - } - - index = irq_timings_next_event_index(irqs->timings, count, period_max); - if (index < 0) - return irqs->last_ts + irqs->ema_time[min]; - - return irqs->last_ts + irqs->ema_time[index]; -} - -static __always_inline int irq_timings_interval_index(u64 interval) -{ - /* - * The PREDICTION_FACTOR increase the interval size for the - * array of exponential average. - */ - u64 interval_us = (interval >> 10) / PREDICTION_FACTOR; - - return likely(interval_us) ? ilog2(interval_us) : 0; -} - -static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs, - u64 interval) -{ - int index; - - /* - * Get the index in the ema table for this interrupt. - */ - index = irq_timings_interval_index(interval); - - if (index > PREDICTION_BUFFER_SIZE - 1) { - irqs->count = 0; - return; - } - - /* - * Store the index as an element of the pattern in another - * circular array. 
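To see the suffix search end to end, here is a self-contained userspace sketch of irq_timings_next_event_index() above, fed with the ilog2 sequence from Example 1 (PREDICTION_PERIOD_MIN = 3, period_max = 5):

#include <stdio.h>
#include <string.h>

#define PREDICTION_PERIOD_MIN 3

/* Same suffix search as irq_timings_next_event_index() above. */
static int next_event_index(int *buffer, size_t len, int period_max)
{
	int period;

	/* Only look at the last period_max * 3 entries. */
	buffer = &buffer[len - (period_max * 3)];
	len = period_max * 3;

	for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
		size_t idx = period, size = period;

		while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
			idx += size;
			if (idx == len)
				return buffer[len % period];
			if (len - idx < (size_t)period)
				size = len - idx;
		}
	}
	return -1;
}

int main(void)
{
	/* ilog2 sequence of Example 1: period-5 pattern 8, 15, 8, 8, 8 */
	int buf[] = { 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 };

	/* Prints 8: the EMA slot whose value predicts the next interval. */
	printf("next index: %d\n", next_event_index(buf, 15, 5));
	return 0;
}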
- */ - irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; - - irqs->ema_time[index] = irq_timings_ema_new(interval, - irqs->ema_time[index]); - - irqs->count++; -} - -static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) -{ - u64 old_ts = irqs->last_ts; - u64 interval; - - /* - * The timestamps are absolute time values, we need to compute - * the timing interval between two interrupts. - */ - irqs->last_ts = ts; - - /* - * The interval type is u64 in order to deal with the same - * type in our computation, that prevent mindfuck issues with - * overflow, sign and division. - */ - interval = ts - old_ts; - - /* - * The interrupt triggered more than one second apart, that - * ends the sequence as predictable for our purpose. In this - * case, assume we have the beginning of a sequence and the - * timestamp is the first value. As it is impossible to - * predict anything at this point, return. - * - * Note the first timestamp of the sequence will always fall - * in this test because the old_ts is zero. That is what we - * want as we need another timestamp to compute an interval. - */ - if (interval >= NSEC_PER_SEC) { - irqs->count = 0; - return; - } - - __irq_timings_store(irq, irqs, interval); -} - -/** - * irq_timings_next_event - Return when the next event is supposed to arrive - * @now: current time - * - * During the last busy cycle, the number of interrupts is incremented - * and stored in the irq_timings structure. This information is - * necessary to: - * - * - know if the index in the table wrapped up: - * - * If more than the array size interrupts happened during the - * last busy/idle cycle, the index wrapped up and we have to - * begin with the next element in the array which is the last one - * in the sequence, otherwise it is at the index 0. - * - * - have an indication of the interrupts activity on this CPU - * (eg. irq/sec) - * - * The values are 'consumed' after inserting in the statistical model, - * thus the count is reinitialized. - * - * The array of values **must** be browsed in the time direction, the - * timestamp must increase between an element and the next one. - * - * Returns a nanosec time based estimation of the earliest interrupt, - * U64_MAX otherwise. - */ -u64 irq_timings_next_event(u64 now) -{ - struct irq_timings *irqts = this_cpu_ptr(&irq_timings); - struct irqt_stat *irqs; - struct irqt_stat __percpu *s; - u64 ts, next_evt = U64_MAX; - int i, irq = 0; - - /* - * This function must be called with the local irq disabled in - * order to prevent the timings circular buffer to be updated - * while we are reading it. - */ - lockdep_assert_irqs_disabled(); - - if (!irqts->count) - return next_evt; - - /* - * Number of elements in the circular buffer: If it happens it - * was flushed before, then the number of elements could be - * smaller than IRQ_TIMINGS_SIZE, so the count is used, - * otherwise the array size is used as we wrapped. The index - * begins from zero when we did not wrap. That could be done - * in a nicer way with the proper circular array structure - * type but with the cost of extra computation in the - * interrupt handler hot path. We choose efficiency. - * - * Inject measured irq/timestamp to the pattern prediction - * model while decrementing the counter because we consume the - * data from our circular buffer. 
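Worked example of the slot computation done by irq_timings_interval_index() on the store path above: for the 1386000 ns interval from Example 1, interval_us = (1386000 >> 10) / PREDICTION_FACTOR = 1353 / 4 = 338, and ilog2(338) = 8 (since 256 <= 338 < 512). The interval is therefore accumulated into ema_time[8], the same slot index the suffix search later returns as the prediction.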
- */ - for_each_irqts(i, irqts) { - irq = irq_timing_decode(irqts->values[i], &ts); - s = idr_find(&irqt_stats, irq); - if (s) - irq_timings_store(irq, this_cpu_ptr(s), ts); - } - - /* - * Look in the list of interrupts' statistics, the earliest - * next event. - */ - idr_for_each_entry(&irqt_stats, s, i) { - - irqs = this_cpu_ptr(s); - - ts = __irq_timings_next_event(irqs, i, now); - if (ts <= now) - return now; - - if (ts < next_evt) - next_evt = ts; - } - - return next_evt; -} - -void irq_timings_free(int irq) -{ - struct irqt_stat __percpu *s; - - s = idr_find(&irqt_stats, irq); - if (s) { - free_percpu(s); - idr_remove(&irqt_stats, irq); - } -} - -int irq_timings_alloc(int irq) -{ - struct irqt_stat __percpu *s; - int id; - - /* - * Some platforms can have the same private interrupt per cpu, - * so this function may be called several times with the - * same interrupt number. Just bail out in case the per cpu - * stat structure is already allocated. - */ - s = idr_find(&irqt_stats, irq); - if (s) - return 0; - - s = alloc_percpu(*s); - if (!s) - return -ENOMEM; - - idr_preload(GFP_KERNEL); - id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT); - idr_preload_end(); - - if (id < 0) { - free_percpu(s); - return id; - } - - return 0; -} - -#ifdef CONFIG_TEST_IRQ_TIMINGS -struct timings_intervals { - u64 *intervals; - size_t count; -}; - -/* - * Intervals are given in nanosecond base - */ -static u64 intervals0[] __initdata = { - 10000, 50000, 200000, 500000, - 10000, 50000, 200000, 500000, - 10000, 50000, 200000, 500000, - 10000, 50000, 200000, 500000, - 10000, 50000, 200000, 500000, - 10000, 50000, 200000, 500000, - 10000, 50000, 200000, 500000, - 10000, 50000, 200000, 500000, - 10000, 50000, 200000, -}; - -static u64 intervals1[] __initdata = { - 223947000, 1240000, 1384000, 1386000, 1386000, - 217416000, 1236000, 1384000, 1386000, 1387000, - 214719000, 1241000, 1386000, 1387000, 1384000, - 213696000, 1234000, 1384000, 1386000, 1388000, - 219904000, 1240000, 1385000, 1389000, 1385000, - 212240000, 1240000, 1386000, 1386000, 1386000, - 214415000, 1236000, 1384000, 1386000, 1387000, - 214276000, 1234000, -}; - -static u64 intervals2[] __initdata = { - 4000, 3000, 5000, 100000, - 3000, 3000, 5000, 117000, - 4000, 4000, 5000, 112000, - 4000, 3000, 4000, 110000, - 3000, 5000, 3000, 117000, - 4000, 4000, 5000, 112000, - 4000, 3000, 4000, 110000, - 3000, 4000, 5000, 112000, - 4000, -}; - -static u64 intervals3[] __initdata = { - 1385000, 212240000, 1240000, - 1386000, 214415000, 1236000, - 1384000, 214276000, 1234000, - 1386000, 214415000, 1236000, - 1385000, 212240000, 1240000, - 1386000, 214415000, 1236000, - 1384000, 214276000, 1234000, - 1386000, 214415000, 1236000, - 1385000, 212240000, 1240000, -}; - -static u64 intervals4[] __initdata = { - 10000, 50000, 10000, 50000, - 10000, 50000, 10000, 50000, - 10000, 50000, 10000, 50000, - 10000, 50000, 10000, 50000, - 10000, 50000, 10000, 50000, - 10000, 50000, 10000, 50000, - 10000, 50000, 10000, 50000, - 10000, 50000, 10000, 50000, - 10000, -}; - -static struct timings_intervals tis[] __initdata = { - { intervals0, ARRAY_SIZE(intervals0) }, - { intervals1, ARRAY_SIZE(intervals1) }, - { intervals2, ARRAY_SIZE(intervals2) }, - { intervals3, ARRAY_SIZE(intervals3) }, - { intervals4, ARRAY_SIZE(intervals4) }, -}; - -static int __init irq_timings_test_next_index(struct timings_intervals *ti) -{ - int _buffer[IRQ_TIMINGS_SIZE]; - int buffer[IRQ_TIMINGS_SIZE]; - int index, start, i, count, period_max; - - count = ti->count - 1; - - period_max = 
count > (3 * PREDICTION_PERIOD_MAX) ? - PREDICTION_PERIOD_MAX : count / 3; - - /* - * Inject all values except the last one which will be used - * to compare with the next index result. - */ - pr_debug("index suite: "); - - for (i = 0; i < count; i++) { - index = irq_timings_interval_index(ti->intervals[i]); - _buffer[i & IRQ_TIMINGS_MASK] = index; - pr_cont("%d ", index); - } - - start = count < IRQ_TIMINGS_SIZE ? 0 : - count & IRQ_TIMINGS_MASK; - - count = min_t(int, count, IRQ_TIMINGS_SIZE); - - for (i = 0; i < count; i++) { - int index = (start + i) & IRQ_TIMINGS_MASK; - buffer[i] = _buffer[index]; - } - - index = irq_timings_next_event_index(buffer, count, period_max); - i = irq_timings_interval_index(ti->intervals[ti->count - 1]); - - if (index != i) { - pr_err("Expected (%d) and computed (%d) next indexes differ\n", - i, index); - return -EINVAL; - } - - return 0; -} - -static int __init irq_timings_next_index_selftest(void) -{ - int i, ret; - - for (i = 0; i < ARRAY_SIZE(tis); i++) { - - pr_info("---> Injecting intervals number #%d (count=%zd)\n", - i, tis[i].count); - - ret = irq_timings_test_next_index(&tis[i]); - if (ret) - break; - } - - return ret; -} - -static int __init irq_timings_test_irqs(struct timings_intervals *ti) -{ - struct irqt_stat __percpu *s; - struct irqt_stat *irqs; - int i, index, ret, irq = 0xACE5; - - ret = irq_timings_alloc(irq); - if (ret) { - pr_err("Failed to allocate irq timings\n"); - return ret; - } - - s = idr_find(&irqt_stats, irq); - if (!s) { - ret = -EIDRM; - goto out; - } - - irqs = this_cpu_ptr(s); - - for (i = 0; i < ti->count; i++) { - - index = irq_timings_interval_index(ti->intervals[i]); - pr_debug("%d: interval=%llu ema_index=%d\n", - i, ti->intervals[i], index); - - __irq_timings_store(irq, irqs, ti->intervals[i]); - if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) { - ret = -EBADSLT; - pr_err("Failed to store in the circular buffer\n"); - goto out; - } - } - - if (irqs->count != ti->count) { - ret = -ERANGE; - pr_err("Count differs\n"); - goto out; - } - - ret = 0; -out: - irq_timings_free(irq); - - return ret; -} - -static int __init irq_timings_irqs_selftest(void) -{ - int i, ret; - - for (i = 0; i < ARRAY_SIZE(tis); i++) { - pr_info("---> Injecting intervals number #%d (count=%zd)\n", - i, tis[i].count); - ret = irq_timings_test_irqs(&tis[i]); - if (ret) - break; - } - - return ret; -} - -static int __init irq_timings_test_irqts(struct irq_timings *irqts, - unsigned count) -{ - int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0; - int i, irq, oirq = 0xBEEF; - u64 ots = 0xDEAD, ts; - - /* - * Fill the circular buffer by using the dedicated function. - */ - for (i = 0; i < count; i++) { - pr_debug("%d: index=%d, ts=%llX irq=%X\n", - i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i); - - irq_timings_push(ots + i, oirq + i); - } - - /* - * Compute the first elements values after the index wrapped - * up or not. - */ - ots += start; - oirq += start; - - /* - * Test the circular buffer count is correct. - */ - pr_debug("---> Checking timings array count (%d) is right\n", count); - if (WARN_ON(irqts->count != count)) - return -EINVAL; - - /* - * Test the macro allowing to browse all the irqts. 
- */ - pr_debug("---> Checking the for_each_irqts() macro\n"); - for_each_irqts(i, irqts) { - - irq = irq_timing_decode(irqts->values[i], &ts); - - pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n", - i, ts, ots, irq, oirq); - - if (WARN_ON(ts != ots || irq != oirq)) - return -EINVAL; - - ots++; oirq++; - } - - /* - * The circular buffer should have be flushed when browsed - * with for_each_irqts - */ - pr_debug("---> Checking timings array is empty after browsing it\n"); - if (WARN_ON(irqts->count)) - return -EINVAL; - - return 0; -} - -static int __init irq_timings_irqts_selftest(void) -{ - struct irq_timings *irqts = this_cpu_ptr(&irq_timings); - int i, ret; - - /* - * Test the circular buffer with different number of - * elements. The purpose is to test at the limits (empty, half - * full, full, wrapped with the cursor at the boundaries, - * wrapped several times, etc ... - */ - int count[] = { 0, - IRQ_TIMINGS_SIZE >> 1, - IRQ_TIMINGS_SIZE, - IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1), - 2 * IRQ_TIMINGS_SIZE, - (2 * IRQ_TIMINGS_SIZE) + 3, - }; - - for (i = 0; i < ARRAY_SIZE(count); i++) { - - pr_info("---> Checking the timings with %d/%d values\n", - count[i], IRQ_TIMINGS_SIZE); - - ret = irq_timings_test_irqts(irqts, count[i]); - if (ret) - break; - } - - return ret; -} - -static int __init irq_timings_selftest(void) -{ - int ret; - - pr_info("------------------- selftest start -----------------\n"); - - /* - * At this point, we don't except any subsystem to use the irq - * timings but us, so it should not be enabled. - */ - if (static_branch_unlikely(&irq_timing_enabled)) { - pr_warn("irq timings already initialized, skipping selftest\n"); - return 0; - } - - ret = irq_timings_irqts_selftest(); - if (ret) - goto out; - - ret = irq_timings_irqs_selftest(); - if (ret) - goto out; - - ret = irq_timings_next_index_selftest(); -out: - pr_info("---------- selftest end with %s -----------\n", - ret ? "failure" : "success"); - - return ret; -} -early_initcall(irq_timings_selftest); -#endif diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 049e296f586c..aec2f06858af 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -151,8 +151,10 @@ static unsigned int get_symbol_offset(unsigned long pos) unsigned long kallsyms_sym_address(int idx) { - /* values are unsigned offsets */ - return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; + /* non-relocatable 32-bit kernels just embed the value directly */ + if (!IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_RELOCATABLE)) + return (u32)kallsyms_offsets[idx]; + return (unsigned long)offset_to_ptr(kallsyms_offsets + idx); } static unsigned int get_symbol_seq(int index) @@ -345,7 +347,7 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, return 1; } return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) || - !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); + !!bpf_address_lookup(addr, symbolsize, offset, namebuf); } static int kallsyms_lookup_buildid(unsigned long addr, @@ -355,8 +357,21 @@ static int kallsyms_lookup_buildid(unsigned long addr, { int ret; - namebuf[KSYM_NAME_LEN - 1] = 0; + /* + * kallsyms_lookus() returns pointer to namebuf on success and + * NULL on error. But some callers ignore the return value. + * Instead they expect @namebuf filled either with valid + * or empty string. + */ namebuf[0] = 0; + /* + * Initialize the module-related return values. They are not set + * when the symbol is in vmlinux or it is a bpf address. 
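For context on the kallsyms_sym_address() change above: offset_to_ptr() decodes a position-relative (self-relative) 32-bit offset, which is why kallsyms_relative_base is no longer needed. A minimal userspace sketch of the same idea; the struct layout and names are illustrative only:

#include <stdint.h>
#include <stdio.h>

/*
 * Self-relative reference: the target address is the address of the offset
 * field itself plus the signed 32-bit value stored in it. This mirrors the
 * kernel's offset_to_ptr() helper.
 */
static void *offset_to_ptr(const int32_t *off)
{
	return (void *)((uintptr_t)off + *off);
}

static struct {
	int32_t off;		/* self-relative offset, filled in at runtime here;
				 * the kernel emits it at build time instead */
	char	name[16];	/* stands in for data elsewhere in the image */
} entry = { 0, "vfs_read" };

int main(void)
{
	entry.off = (int32_t)((uintptr_t)entry.name - (uintptr_t)&entry.off);
	printf("%s\n", (const char *)offset_to_ptr(&entry.off));
	return 0;
}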
+ */ + if (modname) + *modname = NULL; + if (modbuildid) + *modbuildid = NULL; if (is_ksym_addr(addr)) { unsigned long pos; @@ -365,10 +380,6 @@ static int kallsyms_lookup_buildid(unsigned long addr, /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), namebuf, KSYM_NAME_LEN); - if (modname) - *modname = NULL; - if (modbuildid) - *modbuildid = NULL; return strlen(namebuf); } @@ -377,12 +388,11 @@ static int kallsyms_lookup_buildid(unsigned long addr, ret = module_address_lookup(addr, symbolsize, offset, modname, modbuildid, namebuf); if (!ret) - ret = bpf_address_lookup(addr, symbolsize, - offset, modname, namebuf); + ret = bpf_address_lookup(addr, symbolsize, offset, namebuf); if (!ret) - ret = ftrace_mod_address_lookup(addr, symbolsize, - offset, modname, namebuf); + ret = ftrace_mod_address_lookup(addr, symbolsize, offset, + modname, modbuildid, namebuf); return ret; } @@ -426,6 +436,37 @@ int lookup_symbol_name(unsigned long addr, char *symname) return lookup_module_symbol_name(addr, symname); } +#ifdef CONFIG_STACKTRACE_BUILD_ID + +static int append_buildid(char *buffer, const char *modname, + const unsigned char *buildid) +{ + if (!modname) + return 0; + + if (!buildid) { + pr_warn_once("Undefined buildid for the module %s\n", modname); + return 0; + } + + /* build ID should match length of sprintf */ +#ifdef CONFIG_MODULES + static_assert(sizeof(typeof_member(struct module, build_id)) == 20); +#endif + + return sprintf(buffer, " %20phN", buildid); +} + +#else /* CONFIG_STACKTRACE_BUILD_ID */ + +static int append_buildid(char *buffer, const char *modname, + const unsigned char *buildid) +{ + return 0; +} + +#endif /* CONFIG_STACKTRACE_BUILD_ID */ + /* Look up a kernel symbol and return it in a text buffer. */ static int __sprint_symbol(char *buffer, unsigned long address, int symbol_offset, int add_offset, int add_buildid) @@ -435,6 +476,9 @@ static int __sprint_symbol(char *buffer, unsigned long address, unsigned long offset, size; int len; + /* Prevent module removal until modname and modbuildid are printed */ + guard(rcu)(); + address += symbol_offset; len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, buffer); @@ -448,15 +492,8 @@ static int __sprint_symbol(char *buffer, unsigned long address, if (modname) { len += sprintf(buffer + len, " [%s", modname); -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) - if (add_buildid && buildid) { - /* build ID should match length of sprintf */ -#if IS_ENABLED(CONFIG_MODULES) - static_assert(sizeof(typeof_member(struct module, build_id)) == 20); -#endif - len += sprintf(buffer + len, " %20phN", buildid); - } -#endif + if (add_buildid) + len += append_buildid(buffer + len, modname, buildid); len += sprintf(buffer + len, "]"); } diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h index 9633782f8250..81a867dbe57d 100644 --- a/kernel/kallsyms_internal.h +++ b/kernel/kallsyms_internal.h @@ -8,7 +8,6 @@ extern const int kallsyms_offsets[]; extern const u8 kallsyms_names[]; extern const unsigned int kallsyms_num_syms; -extern const unsigned long kallsyms_relative_base; extern const char kallsyms_token_table[]; extern const u16 kallsyms_token_index[]; diff --git a/kernel/kcov.c b/kernel/kcov.c index 6563141f5de9..5397d0c14127 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -55,13 +55,13 @@ struct kcov { refcount_t refcount; /* The lock protects mode, size, area and t. */ spinlock_t lock; - enum kcov_mode mode; + enum kcov_mode mode __guarded_by(&lock); /* Size of arena (in long's). 
*/ - unsigned int size; + unsigned int size __guarded_by(&lock); /* Coverage buffer shared with user space. */ - void *area; + void *area __guarded_by(&lock); /* Task for which we collect coverage, or NULL. */ - struct task_struct *t; + struct task_struct *t __guarded_by(&lock); /* Collecting coverage from remote (background) threads. */ bool remote; /* Size of remote area (in long's). */ @@ -391,6 +391,7 @@ void kcov_task_init(struct task_struct *t) } static void kcov_reset(struct kcov *kcov) + __must_hold(&kcov->lock) { kcov->t = NULL; kcov->mode = KCOV_MODE_INIT; @@ -400,6 +401,7 @@ static void kcov_reset(struct kcov *kcov) } static void kcov_remote_reset(struct kcov *kcov) + __must_hold(&kcov->lock) { int bkt; struct kcov_remote *remote; @@ -419,6 +421,7 @@ static void kcov_remote_reset(struct kcov *kcov) } static void kcov_disable(struct task_struct *t, struct kcov *kcov) + __must_hold(&kcov->lock) { kcov_task_reset(t); if (kcov->remote) @@ -435,8 +438,11 @@ static void kcov_get(struct kcov *kcov) static void kcov_put(struct kcov *kcov) { if (refcount_dec_and_test(&kcov->refcount)) { - kcov_remote_reset(kcov); - vfree(kcov->area); + /* Context-safety: no references left, object being destroyed. */ + context_unsafe( + kcov_remote_reset(kcov); + vfree(kcov->area); + ); kfree(kcov); } } @@ -491,6 +497,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) unsigned long size, off; struct page *page; unsigned long flags; + void *area; spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); @@ -499,10 +506,11 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) res = -EINVAL; goto exit; } + area = kcov->area; spin_unlock_irqrestore(&kcov->lock, flags); vm_flags_set(vma, VM_DONTEXPAND); for (off = 0; off < size; off += PAGE_SIZE) { - page = vmalloc_to_page(kcov->area + off); + page = vmalloc_to_page(area + off); res = vm_insert_page(vma, vma->vm_start + off, page); if (res) { pr_warn_once("kcov: vm_insert_page() failed\n"); @@ -522,10 +530,10 @@ static int kcov_open(struct inode *inode, struct file *filep) kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); if (!kcov) return -ENOMEM; + guard(spinlock_init)(&kcov->lock); kcov->mode = KCOV_MODE_DISABLED; kcov->sequence = 1; refcount_set(&kcov->refcount, 1); - spin_lock_init(&kcov->lock); filep->private_data = kcov; return nonseekable_open(inode, filep); } @@ -556,6 +564,7 @@ static int kcov_get_mode(unsigned long arg) * vmalloc fault handling path is instrumented. */ static void kcov_fault_in_area(struct kcov *kcov) + __must_hold(&kcov->lock) { unsigned long stride = PAGE_SIZE / sizeof(unsigned long); unsigned long *area = kcov->area; @@ -584,6 +593,7 @@ static inline bool kcov_check_handle(u64 handle, bool common_valid, static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, unsigned long arg) + __must_hold(&kcov->lock) { struct task_struct *t; unsigned long flags, unused; @@ -814,6 +824,7 @@ static inline bool kcov_mode_enabled(unsigned int mode) } static void kcov_remote_softirq_start(struct task_struct *t) + __must_hold(&kcov_percpu_data.lock) { struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); unsigned int mode; @@ -831,6 +842,7 @@ static void kcov_remote_softirq_start(struct task_struct *t) } static void kcov_remote_softirq_stop(struct task_struct *t) + __must_hold(&kcov_percpu_data.lock) { struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data); @@ -896,10 +908,12 @@ void kcov_remote_start(u64 handle) /* Put in kcov_remote_stop(). 
*/ kcov_get(kcov); /* - * Read kcov fields before unlock to prevent races with - * KCOV_DISABLE / kcov_remote_reset(). + * Read kcov fields before unlocking kcov_remote_lock to prevent races + * with KCOV_DISABLE and kcov_remote_reset(); cannot acquire kcov->lock + * here, because it might lead to deadlock given kcov_remote_lock is + * acquired _after_ kcov->lock elsewhere. */ - mode = kcov->mode; + mode = context_unsafe(kcov->mode); sequence = kcov->sequence; if (in_task()) { size = kcov->remote_size; diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index a45f3dfc8d14..824f30c93252 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +CONTEXT_ANALYSIS := y + KCSAN_SANITIZE := n KCOV_INSTRUMENT := n UBSAN_SANITIZE := n diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 219d22857c98..8ef8167be745 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -176,7 +176,7 @@ static bool __report_matches(const struct expect_report *r) /* Title */ cur = expect[0]; - end = &expect[0][sizeof(expect[0]) - 1]; + end = ARRAY_END(expect[0]); cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ", is_assert ? "assert: race" : "data-race"); if (r->access[1].fn) { @@ -200,7 +200,7 @@ static bool __report_matches(const struct expect_report *r) /* Access 1 */ cur = expect[1]; - end = &expect[1][sizeof(expect[1]) - 1]; + end = ARRAY_END(expect[1]); if (!r->access[1].fn) cur += scnprintf(cur, end - cur, "race at unknown origin, with "); diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index e95ce7d7a76e..11a48b78f8d1 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -116,6 +116,7 @@ static DEFINE_RAW_SPINLOCK(report_lock); * been reported since (now - KCSAN_REPORT_ONCE_IN_MS). */ static bool rate_limit_report(unsigned long frame1, unsigned long frame2) + __must_hold(&report_lock) { struct report_time *use_entry = &report_times[0]; unsigned long invalid_before; @@ -366,6 +367,7 @@ static int sym_strcmp(void *addr1, void *addr2) static void print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long reordered_to) + __must_hold(&report_lock) { stack_trace_print(stack_entries, num_entries, 0); if (reordered_to) @@ -373,6 +375,7 @@ print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long } static void print_verbose_info(struct task_struct *task) + __must_hold(&report_lock) { if (!task) return; @@ -389,6 +392,7 @@ static void print_report(enum kcsan_value_change value_change, const struct access_info *ai, struct other_info *other_info, u64 old, u64 new, u64 mask) + __must_hold(&report_lock) { unsigned long reordered_to = 0; unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; @@ -496,6 +500,7 @@ static void print_report(enum kcsan_value_change value_change, } static void release_report(unsigned long *flags, struct other_info *other_info) + __releases(&report_lock) { /* * Use size to denote valid/invalid, since KCSAN entirely ignores @@ -507,13 +512,11 @@ static void release_report(unsigned long *flags, struct other_info *other_info) /* * Sets @other_info->task and awaits consumption of @other_info. - * - * Precondition: report_lock is held. - * Postcondition: report_lock is held. 
*/ static void set_other_info_task_blocking(unsigned long *flags, const struct access_info *ai, struct other_info *other_info) + __must_hold(&report_lock) { /* * We may be instrumenting a code-path where current->state is already @@ -572,6 +575,7 @@ static void set_other_info_task_blocking(unsigned long *flags, static void prepare_report_producer(unsigned long *flags, const struct access_info *ai, struct other_info *other_info) + __must_not_hold(&report_lock) { raw_spin_lock_irqsave(&report_lock, *flags); @@ -603,6 +607,7 @@ static void prepare_report_producer(unsigned long *flags, static bool prepare_report_consumer(unsigned long *flags, const struct access_info *ai, struct other_info *other_info) + __cond_acquires(true, &report_lock) { raw_spin_lock_irqsave(&report_lock, *flags); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index eb62a9794242..2bfbb2d144e6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -883,6 +883,60 @@ out_free_sha_regions: #ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY /* + * kexec_purgatory_find_symbol - find a symbol in the purgatory + * @pi: Purgatory to search in. + * @name: Name of the symbol. + * + * Return: pointer to symbol in read-only symtab on success, NULL on error. + */ +static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + const Elf_Shdr *sechdrs; + const Elf_Ehdr *ehdr; + const Elf_Sym *syms; + const char *strtab; + int i, k; + + if (!pi->ehdr) + return NULL; + + ehdr = pi->ehdr; + sechdrs = (void *)ehdr + ehdr->e_shoff; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (void *)ehdr + sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} +/* * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory. * @pi: Purgatory to be loaded. * @kbuf: Buffer to setup. 
@@ -960,6 +1014,10 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, unsigned long offset; size_t sechdrs_size; Elf_Shdr *sechdrs; + const Elf_Sym *entry_sym; + u16 entry_shndx = 0; + unsigned long entry_off = 0; + bool start_fixed = false; int i; /* @@ -977,6 +1035,12 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, bss_addr = kbuf->mem + kbuf->bufsz; kbuf->image->start = pi->ehdr->e_entry; + entry_sym = kexec_purgatory_find_symbol(pi, "purgatory_start"); + if (entry_sym) { + entry_shndx = entry_sym->st_shndx; + entry_off = entry_sym->st_value; + } + for (i = 0; i < pi->ehdr->e_shnum; i++) { unsigned long align; void *src, *dst; @@ -994,6 +1058,13 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, offset = ALIGN(offset, align); + if (!start_fixed && entry_sym && i == entry_shndx && + (sechdrs[i].sh_flags & SHF_EXECINSTR) && + entry_off < sechdrs[i].sh_size) { + kbuf->image->start = kbuf->mem + offset + entry_off; + start_fixed = true; + } + /* * Check if the segment contains the entry point, if so, * calculate the value of image->start based on it. @@ -1004,13 +1075,14 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, * is not set to the initial value, and warn the user so they * have a chance to fix their purgatory's linker script. */ - if (sechdrs[i].sh_flags & SHF_EXECINSTR && + if (!start_fixed && sechdrs[i].sh_flags & SHF_EXECINSTR && pi->ehdr->e_entry >= sechdrs[i].sh_addr && pi->ehdr->e_entry < (sechdrs[i].sh_addr + sechdrs[i].sh_size) && - !WARN_ON(kbuf->image->start != pi->ehdr->e_entry)) { + kbuf->image->start == pi->ehdr->e_entry) { kbuf->image->start -= sechdrs[i].sh_addr; kbuf->image->start += kbuf->mem + offset; + start_fixed = true; } src = (void *)pi->ehdr + sechdrs[i].sh_offset; @@ -1128,61 +1200,6 @@ out_free_kbuf: return ret; } -/* - * kexec_purgatory_find_symbol - find a symbol in the purgatory - * @pi: Purgatory to search in. - * @name: Name of the symbol. - * - * Return: pointer to symbol in read-only symtab on success, NULL on error. 
- */ -static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, - const char *name) -{ - const Elf_Shdr *sechdrs; - const Elf_Ehdr *ehdr; - const Elf_Sym *syms; - const char *strtab; - int i, k; - - if (!pi->ehdr) - return NULL; - - ehdr = pi->ehdr; - sechdrs = (void *)ehdr + ehdr->e_shoff; - - for (i = 0; i < ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_SYMTAB) - continue; - - if (sechdrs[i].sh_link >= ehdr->e_shnum) - /* Invalid strtab section number */ - continue; - strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (void *)ehdr + sechdrs[i].sh_offset; - - /* Go through symbols for a match */ - for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { - if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) - continue; - - if (strcmp(strtab + syms[k].st_name, name) != 0) - continue; - - if (syms[k].st_shndx == SHN_UNDEF || - syms[k].st_shndx >= ehdr->e_shnum) { - pr_debug("Symbol: %s has bad section index %d.\n", - name, syms[k].st_shndx); - return NULL; - } - - /* Found the symbol we are looking for */ - return &syms[k]; - } - } - - return NULL; -} - void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) { struct purgatory_info *pi = &image->purgatory_info; diff --git a/kernel/kthread.c b/kernel/kthread.c index 39511dd2abc9..c9507689e181 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -35,8 +35,8 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; -static LIST_HEAD(kthreads_hotplug); -static DEFINE_MUTEX(kthreads_hotplug_lock); +static LIST_HEAD(kthread_affinity_list); +static DEFINE_MUTEX(kthread_affinity_lock); struct kthread_create_info { @@ -69,7 +69,7 @@ struct kthread { /* To store the full name if task comm is truncated. 
*/ char *full_name; struct task_struct *task; - struct list_head hotplug_node; + struct list_head affinity_node; struct cpumask *preferred_affinity; }; @@ -128,7 +128,7 @@ bool set_kthread_struct(struct task_struct *p) init_completion(&kthread->exited); init_completion(&kthread->parked); - INIT_LIST_HEAD(&kthread->hotplug_node); + INIT_LIST_HEAD(&kthread->affinity_node); p->vfork_done = &kthread->exited; kthread->task = p; @@ -323,10 +323,10 @@ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; - if (!list_empty(&kthread->hotplug_node)) { - mutex_lock(&kthreads_hotplug_lock); - list_del(&kthread->hotplug_node); - mutex_unlock(&kthreads_hotplug_lock); + if (!list_empty(&kthread->affinity_node)) { + mutex_lock(&kthread_affinity_lock); + list_del(&kthread->affinity_node); + mutex_unlock(&kthread_affinity_lock); if (kthread->preferred_affinity) { kfree(kthread->preferred_affinity); @@ -362,17 +362,20 @@ static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpum { const struct cpumask *pref; + guard(rcu)(); + if (kthread->preferred_affinity) { pref = kthread->preferred_affinity; } else { - if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) - return; - pref = cpumask_of_node(kthread->node); + if (kthread->node == NUMA_NO_NODE) + pref = housekeeping_cpumask(HK_TYPE_DOMAIN); + else + pref = cpumask_of_node(kthread->node); } - cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); + cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (cpumask_empty(cpumask)) - cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); + cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); } static void kthread_affine_node(void) @@ -380,32 +383,29 @@ static void kthread_affine_node(void) struct kthread *kthread = to_kthread(current); cpumask_var_t affinity; - WARN_ON_ONCE(kthread_is_per_cpu(current)); + if (WARN_ON_ONCE(kthread_is_per_cpu(current))) + return; - if (kthread->node == NUMA_NO_NODE) { - housekeeping_affine(current, HK_TYPE_KTHREAD); - } else { - if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { - WARN_ON_ONCE(1); - return; - } + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { + WARN_ON_ONCE(1); + return; + } - mutex_lock(&kthreads_hotplug_lock); - WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); - list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); - /* - * The node cpumask is racy when read from kthread() but: - * - a racing CPU going down will either fail on the subsequent - * call to set_cpus_allowed_ptr() or be migrated to housekeepers - * afterwards by the scheduler. - * - a racing CPU going up will be handled by kthreads_online_cpu() - */ - kthread_fetch_affinity(kthread, affinity); - set_cpus_allowed_ptr(current, affinity); - mutex_unlock(&kthreads_hotplug_lock); + mutex_lock(&kthread_affinity_lock); + WARN_ON_ONCE(!list_empty(&kthread->affinity_node)); + list_add_tail(&kthread->affinity_node, &kthread_affinity_list); + /* + * The node cpumask is racy when read from kthread() but: + * - a racing CPU going down will either fail on the subsequent + * call to set_cpus_allowed_ptr() or be migrated to housekeepers + * afterwards by the scheduler. 
+ * - a racing CPU going up will be handled by kthreads_online_cpu() + */ + kthread_fetch_affinity(kthread, affinity); + set_cpus_allowed_ptr(current, affinity); + mutex_unlock(&kthread_affinity_lock); - free_cpumask_var(affinity); - } + free_cpumask_var(affinity); } static int kthread(void *_create) @@ -453,6 +453,10 @@ static int kthread(void *_create) self->started = 1; + /* + * Apply default node affinity if no call to kthread_bind[_mask]() nor + * kthread_affine_preferred() was issued before the first wake-up. + */ if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) kthread_affine_node(); @@ -820,12 +824,13 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, comm); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); + kthread_affine_node(); + for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) @@ -851,6 +856,18 @@ int kthreadd(void *unused) return 0; } +/** + * kthread_affine_preferred - Define a kthread's preferred affinity + * @p: thread created by kthread_create(). + * @mask: preferred mask of CPUs (might not be online, must be possible) for @p + * to run on. + * + * Similar to kthread_bind_mask() except that the affinity is not a requirement + * but rather a preference that can be constrained by CPU isolation or CPU hotplug. + * Must be called before the first wakeup of the kthread. + * + * Returns 0 if the affinity has been applied. + */ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); @@ -873,16 +890,16 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) goto out; } - mutex_lock(&kthreads_hotplug_lock); + mutex_lock(&kthread_affinity_lock); cpumask_copy(kthread->preferred_affinity, mask); - WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); - list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); + WARN_ON_ONCE(!list_empty(&kthread->affinity_node)); + list_add_tail(&kthread->affinity_node, &kthread_affinity_list); kthread_fetch_affinity(kthread, affinity); scoped_guard (raw_spinlock_irqsave, &p->pi_lock) set_cpus_allowed_force(p, affinity); - mutex_unlock(&kthreads_hotplug_lock); + mutex_unlock(&kthread_affinity_lock); out: free_cpumask_var(affinity); @@ -890,22 +907,15 @@ out: } EXPORT_SYMBOL_GPL(kthread_affine_preferred); -/* - * Re-affine kthreads according to their preferences - * and the newly online CPU. The CPU down part is handled - * by select_fallback_rq() which default re-affines to - * housekeepers from other nodes in case the preferred - * affinity doesn't apply anymore. 
- */ -static int kthreads_online_cpu(unsigned int cpu) +static int kthreads_update_affinity(bool force) { cpumask_var_t affinity; struct kthread *k; int ret; - guard(mutex)(&kthreads_hotplug_lock); + guard(mutex)(&kthread_affinity_lock); - if (list_empty(&kthreads_hotplug)) + if (list_empty(&kthread_affinity_list)) return 0; if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) @@ -913,14 +923,29 @@ static int kthreads_online_cpu(unsigned int cpu) ret = 0; - list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { + list_for_each_entry(k, &kthread_affinity_list, affinity_node) { if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || kthread_is_per_cpu(k->task))) { ret = -EINVAL; continue; } - kthread_fetch_affinity(k, affinity); - set_cpus_allowed_ptr(k->task, affinity); + + /* + * Unbound kthreads without preferred affinity are already affine + * to housekeeping, whether those CPUs are online or not. So no need + * to handle newly online CPUs for them. However housekeeping changes + * have to be applied. + * + * But kthreads with a preferred affinity or node are different: + * if none of their preferred CPUs are online and part of + * housekeeping at the same time, they must be affine to housekeeping. + * But as soon as one of their preferred CPU becomes online, they must + * be affine to them. + */ + if (force || k->preferred_affinity || k->node != NUMA_NO_NODE) { + kthread_fetch_affinity(k, affinity); + set_cpus_allowed_ptr(k->task, affinity); + } } free_cpumask_var(affinity); @@ -928,6 +953,33 @@ static int kthreads_online_cpu(unsigned int cpu) return ret; } +/** + * kthreads_update_housekeeping - Update kthreads affinity on cpuset change + * + * When cpuset changes a partition type to/from "isolated" or updates related + * cpumasks, propagate the housekeeping cpumask change to preferred kthreads + * affinity. + * + * Returns 0 if successful, -ENOMEM if temporary mask couldn't + * be allocated or -EINVAL in case of internal error. + */ +int kthreads_update_housekeeping(void) +{ + return kthreads_update_affinity(true); +} + +/* + * Re-affine kthreads according to their preferences + * and the newly online CPU. The CPU down part is handled + * by select_fallback_rq() which default re-affines to + * housekeepers from other nodes in case the preferred + * affinity doesn't apply anymore. 
+ */ +static int kthreads_online_cpu(unsigned int cpu) +{ + return kthreads_update_affinity(false); +} + static int kthreads_init(void) { return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 9917756dae46..1acbad2dbfdf 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1356,6 +1356,25 @@ void klp_module_going(struct module *mod) mutex_unlock(&klp_mutex); } +void *klp_find_section_by_name(const struct module *mod, const char *name, + size_t *sec_size) +{ + struct klp_modinfo *info = mod->klp_info; + + for (int i = 1; i < info->hdr.e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + + if (!strcmp(info->secstrings + shdr->sh_name, name)) { + *sec_size = shdr->sh_size; + return (void *)shdr->sh_addr; + } + } + + *sec_size = 0; + return NULL; +} +EXPORT_SYMBOL_GPL(klp_find_section_by_name); + static int __init klp_init(void) { klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig index d2aeaf13c3ac..1a8513f16ef7 100644 --- a/kernel/liveupdate/Kconfig +++ b/kernel/liveupdate/Kconfig @@ -54,7 +54,6 @@ config KEXEC_HANDOVER_ENABLE_DEFAULT config LIVEUPDATE bool "Live Update Orchestrator" depends on KEXEC_HANDOVER - depends on SHMEM help Enable the Live Update Orchestrator. Live Update is a mechanism, typically based on kexec, that allows the kernel to be updated @@ -73,4 +72,20 @@ config LIVEUPDATE If unsure, say N. +config LIVEUPDATE_MEMFD + bool "Live update support for memfd" + depends on LIVEUPDATE + depends on MEMFD_CREATE + depends on SHMEM + default LIVEUPDATE + help + Enable live update support for memfd regions. This allows preserving + memfd-backed memory across kernel live updates. + + This can be used to back VM memory with memfds, allowing the guest + memory to persist, or for other user workloads needing to preserve + pages. + + If unsure, say N. + endmenu diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile index 7cad2eece32d..d2f779cbe279 100644 --- a/kernel/liveupdate/Makefile +++ b/kernel/liveupdate/Makefile @@ -3,6 +3,7 @@ luo-y := \ luo_core.o \ luo_file.o \ + luo_flb.o \ luo_session.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index d4482b6e3cae..fb3a7b67676e 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -15,6 +15,7 @@ #include <linux/count_zeros.h> #include <linux/kexec.h> #include <linux/kexec_handover.h> +#include <linux/kho/abi/kexec_handover.h> #include <linux/libfdt.h> #include <linux/list.h> #include <linux/memblock.h> @@ -24,7 +25,6 @@ #include <asm/early_ioremap.h> -#include "kexec_handover_internal.h" /* * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. @@ -33,10 +33,7 @@ #include "../kexec_internal.h" #include "kexec_handover_internal.h" -#define KHO_FDT_COMPATIBLE "kho-v1" -#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" -#define PROP_SUB_FDT "fdt" - +/* The magic token for preserved pages */ #define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */ /* @@ -219,10 +216,32 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, return 0; } +/* For physically contiguous 0-order pages. 
*/ +static void kho_init_pages(struct page *page, unsigned long nr_pages) +{ + for (unsigned long i = 0; i < nr_pages; i++) + set_page_count(page + i, 1); +} + +static void kho_init_folio(struct page *page, unsigned int order) +{ + unsigned long nr_pages = (1 << order); + + /* Head page gets refcount of 1. */ + set_page_count(page, 1); + + /* For higher order folios, tail pages get a page count of zero. */ + for (unsigned long i = 1; i < nr_pages; i++) + set_page_count(page + i, 0); + + if (order > 0) + prep_compound_page(page, order); +} + static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) { struct page *page = pfn_to_online_page(PHYS_PFN(phys)); - unsigned int nr_pages, ref_cnt; + unsigned long nr_pages; union kho_page_info info; if (!page) @@ -240,20 +259,19 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) /* Clear private to make sure later restores on this page error out. */ page->private = 0; - /* Head page gets refcount of 1. */ - set_page_count(page, 1); - - /* - * For higher order folios, tail pages get a page count of zero. - * For physically contiguous order-0 pages every pages gets a page - * count of 1 - */ - ref_cnt = is_folio ? 0 : 1; - for (unsigned int i = 1; i < nr_pages; i++) - set_page_count(page + i, ref_cnt); - if (is_folio && info.order) - prep_compound_page(page, info.order); + if (is_folio) + kho_init_folio(page, info.order); + else + kho_init_pages(page, nr_pages); + + /* Always mark headpage's codetag as empty to avoid accounting mismatch */ + clear_page_tag_ref(page); + if (!is_folio) { + /* Also do that for the non-compound tail pages */ + for (unsigned int i = 1; i < nr_pages; i++) + clear_page_tag_ref(page + i); + } adjust_managed_page_count(page, nr_pages); return page; @@ -281,9 +299,9 @@ EXPORT_SYMBOL_GPL(kho_restore_folio); * Restore a contiguous list of order 0 pages that was preserved with * kho_preserve_pages(). * - * Return: 0 on success, error code on failure + * Return: the first page on success, NULL on failure. 
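A hedged usage sketch of the preserve/restore pair around kho_restore_pages() above; the driver-side helpers and the way the physical address reaches the next kernel are illustrative only:

#include <linux/gfp.h>
#include <linux/io.h>
#include <linux/kexec_handover.h>

/* Outgoing kernel: preserve four physically contiguous order-0 pages. */
static phys_addr_t my_save_buffer(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* 4 pages, non-compound */

	if (!page)
		return 0;

	if (kho_preserve_pages(page, 4)) {
		__free_pages(page, 2);
		return 0;
	}

	/* This phys addr must reach the next kernel, e.g. via a KHO subtree. */
	return page_to_phys(page);
}

/* Next kernel: revive the run; returns the first page or NULL on failure. */
static struct page *my_revive_buffer(phys_addr_t phys)
{
	return kho_restore_pages(phys, 4);
}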
*/ -struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages) +struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages) { const unsigned long start_pfn = PHYS_PFN(phys); const unsigned long end_pfn = start_pfn + nr_pages; @@ -378,7 +396,7 @@ static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk) void *ptr; u64 phys; - ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL); + ptr = fdt_getprop_w(kho_out.fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, NULL); /* Check and discard previous memory map */ phys = get_unaligned((u64 *)ptr); @@ -466,7 +484,7 @@ static phys_addr_t __init kho_get_mem_map_phys(const void *fdt) const void *mem_ptr; int len; - mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len); if (!mem_ptr || len != sizeof(u64)) { pr_err("failed to get preserved memory bitmaps\n"); return 0; @@ -637,11 +655,13 @@ static void __init kho_reserve_scratch(void) scratch_size_update(); /* FIXME: deal with node hot-plug/remove */ - kho_scratch_cnt = num_online_nodes() + 2; + kho_scratch_cnt = nodes_weight(node_states[N_MEMORY]) + 2; size = kho_scratch_cnt * sizeof(*kho_scratch); kho_scratch = memblock_alloc(size, PAGE_SIZE); - if (!kho_scratch) + if (!kho_scratch) { + pr_err("Failed to reserve scratch array\n"); goto err_disable_kho; + } /* * reserve scratch area in low memory for lowmem allocations in the @@ -650,8 +670,10 @@ static void __init kho_reserve_scratch(void) size = scratch_size_lowmem; addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0, ARCH_LOW_ADDRESS_LIMIT); - if (!addr) + if (!addr) { + pr_err("Failed to reserve lowmem scratch buffer\n"); goto err_free_scratch_desc; + } kho_scratch[i].addr = addr; kho_scratch[i].size = size; @@ -660,20 +682,28 @@ static void __init kho_reserve_scratch(void) /* reserve large contiguous area for allocations without nid */ size = scratch_size_global; addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES); - if (!addr) + if (!addr) { + pr_err("Failed to reserve global scratch buffer\n"); goto err_free_scratch_areas; + } kho_scratch[i].addr = addr; kho_scratch[i].size = size; i++; - for_each_online_node(nid) { + /* + * Loop over nodes that have both memory and are online. Skip + * memoryless nodes, as we can not allocate scratch areas there. 
+ */ + for_each_node_state(nid, N_MEMORY) { size = scratch_size_node(nid); addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES, 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid, true); - if (!addr) + if (!addr) { + pr_err("Failed to reserve nid %d scratch buffer\n", nid); goto err_free_scratch_areas; + } kho_scratch[i].addr = addr; kho_scratch[i].size = size; @@ -727,7 +757,8 @@ int kho_add_subtree(const char *name, void *fdt) goto out_pack; } - err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys)); + err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, + &phys, sizeof(phys)); if (err < 0) goto out_pack; @@ -758,7 +789,7 @@ void kho_remove_subtree(void *fdt) const u64 *val; int len; - val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len); + val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len); if (!val || len != sizeof(phys_addr_t)) continue; @@ -823,7 +854,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_folio); * * Return: 0 on success, error code on failure */ -int kho_preserve_pages(struct page *page, unsigned int nr_pages) +int kho_preserve_pages(struct page *page, unsigned long nr_pages) { struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); @@ -867,7 +898,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages); * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger * preserved blocks is not supported. */ -void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +void kho_unpreserve_pages(struct page *page, unsigned long nr_pages) { struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); @@ -877,21 +908,6 @@ void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) } EXPORT_SYMBOL_GPL(kho_unpreserve_pages); -struct kho_vmalloc_hdr { - DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); -}; - -#define KHO_VMALLOC_SIZE \ - ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \ - sizeof(phys_addr_t)) - -struct kho_vmalloc_chunk { - struct kho_vmalloc_hdr hdr; - phys_addr_t phys[KHO_VMALLOC_SIZE]; -}; - -static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE); - /* vmalloc flags KHO supports */ #define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP) @@ -1006,8 +1022,10 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) chunk->phys[idx++] = phys; if (idx == ARRAY_SIZE(chunk->phys)) { chunk = new_vmalloc_chunk(chunk); - if (!chunk) + if (!chunk) { + err = -ENOMEM; goto err_free; + } idx = 0; } } @@ -1305,7 +1323,7 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) if (offset < 0) return -ENOENT; - val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len); + val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len); if (!val || len != sizeof(*val)) return -EINVAL; @@ -1325,7 +1343,7 @@ static __init int kho_out_fdt_setup(void) err |= fdt_finish_reservemap(root); err |= fdt_begin_node(root, ""); err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); - err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, + err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map, sizeof(empty_mem_map)); err |= fdt_end_node(root); err |= fdt_finish(root); @@ -1441,46 +1459,40 @@ void __init kho_memory_init(void) void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len) { + unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); struct kho_scratch *scratch = NULL; phys_addr_t mem_map_phys; void *fdt = NULL; - int err = 0; - unsigned int 
scratch_cnt = scratch_len / sizeof(*kho_scratch); + int err; /* Validate the input FDT */ fdt = early_memremap(fdt_phys, fdt_len); if (!fdt) { pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys); - err = -EFAULT; - goto out; + goto err_report; } err = fdt_check_header(fdt); if (err) { pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n", fdt_phys, err); - err = -EINVAL; - goto out; + goto err_unmap_fdt; } err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE); if (err) { pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n", fdt_phys, KHO_FDT_COMPATIBLE, err); - err = -EINVAL; - goto out; + goto err_unmap_fdt; } mem_map_phys = kho_get_mem_map_phys(fdt); - if (!mem_map_phys) { - err = -ENOENT; - goto out; - } + if (!mem_map_phys) + goto err_unmap_fdt; scratch = early_memremap(scratch_phys, scratch_len); if (!scratch) { pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n", scratch_phys, scratch_len); - err = -EFAULT; - goto out; + goto err_unmap_fdt; } /* @@ -1497,7 +1509,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, if (WARN_ON(err)) { pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe", &area->addr, &size, ERR_PTR(err)); - goto out; + goto err_unmap_scratch; } pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); } @@ -1519,13 +1531,14 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, kho_scratch_cnt = scratch_cnt; pr_info("found kexec handover data.\n"); -out: - if (fdt) - early_memunmap(fdt, fdt_len); - if (scratch) - early_memunmap(scratch, scratch_len); - if (err) - pr_warn("disabling KHO revival: %d\n", err); + return; + +err_unmap_scratch: + early_memunmap(scratch, scratch_len); +err_unmap_fdt: + early_memunmap(fdt, fdt_len); +err_report: + pr_warn("disabling KHO revival\n"); } /* Helper functions for kexec_file_load */ diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index 944663d99dd9..dda7bb57d421 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -35,8 +35,7 @@ * iommu, interrupts, vfio, participating filesystems, and memory management. * * LUO uses Kexec Handover to transfer memory state from the current kernel to - * the next kernel. For more details see - * Documentation/core-api/kho/concepts.rst. + * the next kernel. For more details see Documentation/core-api/kho/index.rst. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -128,7 +127,9 @@ static int __init luo_early_startup(void) if (err) return err; - return 0; + err = luo_flb_setup_incoming(luo_global.fdt_in); + + return err; } static int __init liveupdate_early_init(void) @@ -165,6 +166,7 @@ static int __init luo_fdt_setup(void) err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE); err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln)); err |= luo_session_setup_outgoing(fdt_out); + err |= luo_flb_setup_outgoing(fdt_out); err |= fdt_end_node(fdt_out); err |= fdt_finish(fdt_out); if (err) @@ -226,6 +228,8 @@ int liveupdate_reboot(void) if (err) return err; + luo_flb_serialize(); + err = kho_finalize(); if (err) { pr_err("kho_finalize failed %d\n", err); diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index a32a777f6df8..4c7df52a6507 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -104,6 +104,7 @@ #include <linux/io.h> #include <linux/kexec_handover.h> #include <linux/kho/abi/luo.h> +#include <linux/list_private.h> #include <linux/liveupdate.h> #include <linux/module.h> #include <linux/sizes.h> @@ -273,7 +274,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) goto err_fput; err = -ENOENT; - luo_list_for_each_private(fh, &luo_file_handler_list, list) { + list_private_for_each_entry(fh, &luo_file_handler_list, list) { if (fh->ops->can_preserve(fh, file)) { err = 0; break; @@ -284,10 +285,14 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) if (err) goto err_free_files_mem; + err = luo_flb_file_preserve(fh); + if (err) + goto err_free_files_mem; + luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL); if (!luo_file) { err = -ENOMEM; - goto err_free_files_mem; + goto err_flb_unpreserve; } luo_file->file = file; @@ -311,6 +316,8 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) err_kfree: kfree(luo_file); +err_flb_unpreserve: + luo_flb_file_unpreserve(fh); err_free_files_mem: luo_free_files_mem(file_set); err_fput: @@ -352,6 +359,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set) args.serialized_data = luo_file->serialized_data; args.private_data = luo_file->private_data; luo_file->fh->ops->unpreserve(&args); + luo_flb_file_unpreserve(luo_file->fh); list_del(&luo_file->list); file_set->count--; @@ -402,8 +410,6 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set, luo_file->fh->ops->unfreeze(&args); } - - luo_file->serialized_data = 0; } static void __luo_file_unfreeze(struct luo_file_set *file_set, @@ -629,6 +635,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set, args.retrieved = luo_file->retrieved; luo_file->fh->ops->finish(&args); + luo_flb_file_finish(luo_file->fh); } /** @@ -760,7 +767,7 @@ int luo_file_deserialize(struct luo_file_set *file_set, bool handler_found = false; struct luo_file *luo_file; - luo_list_for_each_private(fh, &luo_file_handler_list, list) { + list_private_for_each_entry(fh, &luo_file_handler_list, list) { if (!strcmp(fh->compatible, file_ser[i].compatible)) { handler_found = true; break; @@ -835,7 +842,7 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) return -EBUSY; /* Check for duplicate compatible strings */ - luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) { + list_private_for_each_entry(fh_iter, &luo_file_handler_list, list) { if (!strcmp(fh_iter->compatible, fh->compatible)) { pr_err("File handler registration failed: Compatible 
string '%s' already registered.\n", fh->compatible); @@ -850,10 +857,13 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) goto err_resume; } + INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, flb_list)); INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list)); list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list); luo_session_resume(); + liveupdate_test_register(fh); + return 0; err_resume: @@ -870,23 +880,38 @@ err_resume: * * It ensures safe removal by checking that: * No live update session is currently in progress. + * No FLB is registered with this file handler. * * If the unregistration fails, the internal test state is reverted. * * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live - * update is in progress, can't quiesce live update. + * update is in progress, can't quiesce live update, or an FLB is registered with + * this file handler. */ int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) { + int err = -EBUSY; + if (!liveupdate_enabled()) return -EOPNOTSUPP; + liveupdate_test_unregister(fh); + if (!luo_session_quiesce()) - return -EBUSY; + goto err_register; + + if (!list_empty(&ACCESS_PRIVATE(fh, flb_list))) + goto err_resume; list_del(&ACCESS_PRIVATE(fh, list)); module_put(fh->ops->owner); luo_session_resume(); return 0; + +err_resume: + luo_session_resume(); +err_register: + liveupdate_test_register(fh); + return err; } diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c new file mode 100644 index 000000000000..4c437de5c0b0 --- /dev/null +++ b/kernel/liveupdate/luo_flb.c @@ -0,0 +1,654 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +/** + * DOC: LUO File Lifecycle Bound Global Data + * + * File-Lifecycle-Bound (FLB) objects provide a mechanism for managing global + * state that is shared across multiple live-updatable files. The lifecycle of + * this shared state is tied to the preservation of the files that depend on it. + * + * An FLB represents a global resource, such as the IOMMU core state, that is + * required by multiple file descriptors (e.g., all VFIO fds). + * + * The preservation of the FLB's state is triggered when the *first* file + * depending on it is preserved. The cleanup of this state (unpreserve or + * finish) is triggered when the *last* file depending on it is unpreserved or + * finished. + * + * Handler Dependency: A file handler declares its dependency on one or more + * FLBs by registering them via liveupdate_register_flb(). + * + * Callback Model: Each FLB is defined by a set of operations + * (&struct liveupdate_flb_ops) that LUO invokes at key points: + * + * - .preserve(): Called for the first file. Saves global state. + * - .unpreserve(): Called for the last file (if aborted pre-reboot). + * - .retrieve(): Called on-demand in the new kernel to restore the state. + * - .finish(): Called for the last file in the new kernel for cleanup. + * + * This reference-counted approach ensures that shared state is saved exactly + * once and restored exactly once, regardless of how many files depend on it, + * and that its lifecycle is correctly managed across the kexec transition.
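To make the callback model above concrete, here is a rough sketch of how a subsystem might define an FLB. It is only a sketch: everything prefixed foo_ is hypothetical, and the exact layout of struct liveupdate_flb, struct liveupdate_flb_ops and struct liveupdate_flb_op_args is assumed from the way they are used in this file (ops->preserve/unpreserve/retrieve/finish, ops->owner, ->compatible, args->data, args->obj) rather than taken from the real <linux/liveupdate.h>:

/* Sketch only; all foo_* helpers are hypothetical. */
static int foo_flb_preserve(struct liveupdate_flb_op_args *args)
{
	struct foo_shared *obj = foo_snapshot_global_state();

	if (!obj)
		return -ENOMEM;

	args->obj = obj;			/* live object, reused until reboot */
	args->data = foo_serialize(obj);	/* opaque u64 handed to the next kernel */
	return 0;
}

static void foo_flb_unpreserve(struct liveupdate_flb_op_args *args)
{
	foo_release(args->obj);			/* last dependent file gone before reboot */
}

static int foo_flb_retrieve(struct liveupdate_flb_op_args *args)
{
	args->obj = foo_restore(args->data);	/* rebuild the state in the new kernel */
	return args->obj ? 0 : -ENOMEM;
}

static void foo_flb_finish(struct liveupdate_flb_op_args *args)
{
	foo_release(args->obj);			/* last dependent file finished */
}

static const struct liveupdate_flb_ops foo_flb_ops = {
	.preserve   = foo_flb_preserve,
	.unpreserve = foo_flb_unpreserve,
	.retrieve   = foo_flb_retrieve,
	.finish     = foo_flb_finish,
	.owner      = THIS_MODULE,
};

static struct liveupdate_flb foo_flb = {
	.compatible = "foo-global-state-v1",
	.ops        = &foo_flb_ops,
};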
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cleanup.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/io.h> +#include <linux/kexec_handover.h> +#include <linux/kho/abi/luo.h> +#include <linux/libfdt.h> +#include <linux/list_private.h> +#include <linux/liveupdate.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/unaligned.h> +#include "luo_internal.h" + +#define LUO_FLB_PGCNT 1ul +#define LUO_FLB_MAX (((LUO_FLB_PGCNT << PAGE_SHIFT) - \ + sizeof(struct luo_flb_header_ser)) / sizeof(struct luo_flb_ser)) + +struct luo_flb_header { + struct luo_flb_header_ser *header_ser; + struct luo_flb_ser *ser; + bool active; +}; + +struct luo_flb_global { + struct luo_flb_header incoming; + struct luo_flb_header outgoing; + struct list_head list; + long count; +}; + +static struct luo_flb_global luo_flb_global = { + .list = LIST_HEAD_INIT(luo_flb_global.list), +}; + +/* + * struct luo_flb_link - Links an FLB definition to a file handler's internal + * list of dependencies. + * @flb: A pointer to the registered &struct liveupdate_flb definition. + * @list: The list_head for linking. + */ +struct luo_flb_link { + struct liveupdate_flb *flb; + struct list_head list; +}; + +/* luo_flb_get_private - Access private field, and if needed initialize it. */ +static struct luo_flb_private *luo_flb_get_private(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = &ACCESS_PRIVATE(flb, private); + + if (!private->initialized) { + mutex_init(&private->incoming.lock); + mutex_init(&private->outgoing.lock); + INIT_LIST_HEAD(&private->list); + private->users = 0; + private->initialized = true; + } + + return private; +} + +static int luo_flb_file_preserve_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + + scoped_guard(mutex, &private->outgoing.lock) { + if (!private->outgoing.count) { + struct liveupdate_flb_op_args args = {0}; + int err; + + args.flb = flb; + err = flb->ops->preserve(&args); + if (err) + return err; + private->outgoing.data = args.data; + private->outgoing.obj = args.obj; + } + private->outgoing.count++; + } + + return 0; +} + +static void luo_flb_file_unpreserve_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + + scoped_guard(mutex, &private->outgoing.lock) { + private->outgoing.count--; + if (!private->outgoing.count) { + struct liveupdate_flb_op_args args = {0}; + + args.flb = flb; + args.data = private->outgoing.data; + args.obj = private->outgoing.obj; + + if (flb->ops->unpreserve) + flb->ops->unpreserve(&args); + + private->outgoing.data = 0; + private->outgoing.obj = NULL; + } + } +} + +static int luo_flb_retrieve_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + struct luo_flb_header *fh = &luo_flb_global.incoming; + struct liveupdate_flb_op_args args = {0}; + bool found = false; + int err; + + guard(mutex)(&private->incoming.lock); + + if (private->incoming.finished) + return -ENODATA; + + if (private->incoming.retrieved) + return 0; + + if (!fh->active) + return -ENODATA; + + for (int i = 0; i < fh->header_ser->count; i++) { + if (!strcmp(fh->ser[i].name, flb->compatible)) { + private->incoming.data = fh->ser[i].data; + private->incoming.count = fh->ser[i].count; + found = true; + break; + } + } + + if (!found) + return -ENOENT; + + args.flb = flb; + args.data = private->incoming.data; + + err = flb->ops->retrieve(&args); + if (err) + return 
err; + + private->incoming.obj = args.obj; + private->incoming.retrieved = true; + + return 0; +} + +static void luo_flb_file_finish_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + u64 count; + + scoped_guard(mutex, &private->incoming.lock) + count = --private->incoming.count; + + if (!count) { + struct liveupdate_flb_op_args args = {0}; + + if (!private->incoming.retrieved) { + int err = luo_flb_retrieve_one(flb); + + if (WARN_ON(err)) + return; + } + + scoped_guard(mutex, &private->incoming.lock) { + args.flb = flb; + args.obj = private->incoming.obj; + flb->ops->finish(&args); + + private->incoming.data = 0; + private->incoming.obj = NULL; + private->incoming.finished = true; + } + } +} + +/** + * luo_flb_file_preserve - Notifies FLBs that a file is about to be preserved. + * @fh: The file handler for the preserved file. + * + * This function iterates through all FLBs associated with the given file + * handler. It increments the reference count for each FLB. If the count becomes + * 1, it triggers the FLB's .preserve() callback to save the global state. + * + * This operation is atomic. If any FLB's .preserve() op fails, it will roll + * back by calling .unpreserve() on any FLBs that were successfully preserved + * during this call. + * + * Context: Called from luo_preserve_file() + * Return: 0 on success, or a negative errno on failure. + */ +int luo_flb_file_preserve(struct liveupdate_file_handler *fh) +{ + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + int err = 0; + + list_for_each_entry(iter, flb_list, list) { + err = luo_flb_file_preserve_one(iter->flb); + if (err) + goto exit_err; + } + + return 0; + +exit_err: + list_for_each_entry_continue_reverse(iter, flb_list, list) + luo_flb_file_unpreserve_one(iter->flb); + + return err; +} + +/** + * luo_flb_file_unpreserve - Notifies FLBs that a dependent file was unpreserved. + * @fh: The file handler for the unpreserved file. + * + * This function iterates through all FLBs associated with the given file + * handler, in reverse order of registration. It decrements the reference count + * for each FLB. If the count becomes 0, it triggers the FLB's .unpreserve() + * callback to clean up the global state. + * + * Context: Called when a preserved file is being cleaned up before reboot + * (e.g., from luo_file_unpreserve_files()). + */ +void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh) +{ + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + + list_for_each_entry_reverse(iter, flb_list, list) + luo_flb_file_unpreserve_one(iter->flb); +} + +/** + * luo_flb_file_finish - Notifies FLBs that a dependent file has been finished. + * @fh: The file handler for the finished file. + * + * This function iterates through all FLBs associated with the given file + * handler, in reverse order of registration. It decrements the incoming + * reference count for each FLB. If the count becomes 0, it triggers the FLB's + * .finish() callback for final cleanup in the new kernel. + * + * Context: Called from luo_file_finish() for each file being finished. + */ +void luo_flb_file_finish(struct liveupdate_file_handler *fh) +{ + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + + list_for_each_entry_reverse(iter, flb_list, list) + luo_flb_file_finish_one(iter->flb); +} + +/** + * liveupdate_register_flb - Associate an FLB with a file handler and register it globally. 
+ * @fh: The file handler that will now depend on the FLB. + * @flb: The File-Lifecycle-Bound object to associate. + * + * Establishes a dependency, informing the LUO core that whenever a file of + * type @fh is preserved, the state of @flb must also be managed. + * + * On the first registration of a given @flb object, it is added to a global + * registry. This function checks for duplicate registrations, both for a + * specific handler and globally, and ensures the total number of unique + * FLBs does not exceed the system limit. + * + * Context: Typically called from a subsystem's module init function after + * both the handler and the FLB have been defined and initialized. + * Return: 0 on success. Returns a negative errno on failure: + * -EINVAL if arguments are NULL or not initialized. + * -ENOMEM on memory allocation failure. + * -EEXIST if this FLB is already registered with this handler. + * -ENOSPC if the maximum number of global FLBs has been reached. + * -EOPNOTSUPP if live update is disabled or not configured. + */ +int liveupdate_register_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *link __free(kfree) = NULL; + struct liveupdate_flb *gflb; + struct luo_flb_link *iter; + int err; + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + if (WARN_ON(!flb->ops->preserve || !flb->ops->unpreserve || + !flb->ops->retrieve || !flb->ops->finish)) { + return -EINVAL; + } + + /* + * File handler must already be registered, as it initializes the + * flb_list + */ + if (WARN_ON(list_empty(&ACCESS_PRIVATE(fh, list)))) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + + /* + * Ensure the system is quiescent (no active sessions). + * This acts as a global lock for registration: no other thread can + * be in this section, and no sessions can be creating/using FDs. + */ + if (!luo_session_quiesce()) + return -EBUSY; + + /* Check that this FLB is not already linked to this file handler */ + err = -EEXIST; + list_for_each_entry(iter, flb_list, list) { + if (iter->flb == flb) + goto err_resume; + } + + /* + * If this FLB is not linked to global list it's the first time the FLB + * is registered + */ + if (!private->users) { + if (WARN_ON(!list_empty(&private->list))) { + err = -EINVAL; + goto err_resume; + } + + if (luo_flb_global.count == LUO_FLB_MAX) { + err = -ENOSPC; + goto err_resume; + } + + /* Check that compatible string is unique in global list */ + list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { + if (!strcmp(gflb->compatible, flb->compatible)) + goto err_resume; + } + + if (!try_module_get(flb->ops->owner)) { + err = -EAGAIN; + goto err_resume; + } + + list_add_tail(&private->list, &luo_flb_global.list); + luo_flb_global.count++; + } + + /* Finally, link the FLB to the file handler */ + private->users++; + link->flb = flb; + list_add_tail(&no_free_ptr(link)->list, flb_list); + luo_session_resume(); + + return 0; + +err_resume: + luo_session_resume(); + return err; +} + +/** + * liveupdate_unregister_flb - Remove an FLB dependency from a file handler. + * @fh: The file handler that is currently depending on the FLB. + * @flb: The File-Lifecycle-Bound object to remove. + * + * Removes the association between the specified file handler and the FLB + * previously established by liveupdate_register_flb(). 
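A handler would typically wire this up at module init time: register the file handler first (registration initializes its flb_list), then attach the FLBs it depends on, and tear down in the opposite order on exit, since liveupdate_unregister_file_handler() refuses to run while FLBs are still attached. A hedged sketch, reusing the hypothetical foo_flb from above and assuming a foo_file_handler of type struct liveupdate_file_handler exists:

static int __init foo_liveupdate_init(void)
{
	int err;

	/* Must come first: this initializes the handler's flb_list. */
	err = liveupdate_register_file_handler(&foo_file_handler);
	if (err)
		return err;

	err = liveupdate_register_flb(&foo_file_handler, &foo_flb);
	if (err)
		goto err_unregister_handler;

	return 0;

err_unregister_handler:
	liveupdate_unregister_file_handler(&foo_file_handler);
	return err;
}

static void __exit foo_liveupdate_exit(void)
{
	/* Reverse order: detach FLBs before the handler (errors ignored in this sketch). */
	liveupdate_unregister_flb(&foo_file_handler, &foo_flb);
	liveupdate_unregister_file_handler(&foo_file_handler);
}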
+ * + * This function manages the global lifecycle of the FLB. It decrements the + * FLB's usage count. If this was the last file handler referencing this FLB, + * the FLB is removed from the global registry and the reference to its + * owner module (acquired during registration) is released. + * + * Context: This function ensures the session is quiesced (no active FDs + * being created) during the update. It is typically called from a + * subsystem's module exit function. + * Return: 0 on success. + * -EOPNOTSUPP if live update is disabled. + * -EBUSY if the live update session is active and cannot be quiesced. + * -ENOENT if the FLB was not found in the file handler's list. + */ +int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + int err = -ENOENT; + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + /* + * Ensure the system is quiescent (no active sessions). + * This acts as a global lock for unregistration. + */ + if (!luo_session_quiesce()) + return -EBUSY; + + /* Find and remove the link from the file handler's list */ + list_for_each_entry(iter, flb_list, list) { + if (iter->flb == flb) { + list_del(&iter->list); + kfree(iter); + err = 0; + break; + } + } + + if (err) + goto err_resume; + + private->users--; + /* + * If this is the last file handler with which we are registered, remove + * from the global list, and release the module reference. + */ + if (!private->users) { + list_del_init(&private->list); + luo_flb_global.count--; + module_put(flb->ops->owner); + } + + luo_session_resume(); + + return 0; + +err_resume: + luo_session_resume(); + return err; +} + +/** + * liveupdate_flb_get_incoming - Retrieve the incoming FLB object. + * @flb: The FLB definition. + * @objp: Output parameter; will be populated with the live shared object. + * + * Returns a pointer to its shared live object for the incoming (post-reboot) + * path. + * + * If this is the first time the object is requested in the new kernel, this + * function will trigger the FLB's .retrieve() callback to reconstruct the + * object from its preserved state. Subsequent calls will return the same + * cached object. + * + * Return: 0 on success, or a negative errno on failure. -ENODATA means no + * incoming FLB data, -ENOENT means the specific FLB was not found in the incoming + * data, and -EOPNOTSUPP when live update is disabled or not configured. + */ +int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + if (!private->incoming.obj) { + int err = luo_flb_retrieve_one(flb); + + if (err) + return err; + } + + guard(mutex)(&private->incoming.lock); + *objp = private->incoming.obj; + + return 0; +} + +/** + * liveupdate_flb_get_outgoing - Retrieve the outgoing FLB object. + * @flb: The FLB definition. + * @objp: Output parameter; will be populated with the live shared object. + * + * Returns a pointer to its shared live object for the outgoing (pre-reboot) + * path. + * + * This function assumes the object has already been created by the FLB's + * .preserve() callback, which is triggered when the first dependent file + * is preserved. + * + * Return: 0 on success, or a negative errno on failure.
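These two accessors are how per-file code reaches the shared object from its own callbacks: get_outgoing on the pre-reboot side once the first preserved file has caused .preserve() to run, and get_incoming on the post-reboot side where the first caller triggers .retrieve(). A sketch under the same assumptions as the earlier examples (all foo_* names are hypothetical):

/* Pre-reboot: somewhere in the file handler's preserve path. */
static int foo_file_note_preserved(void)
{
	struct foo_shared *shared;
	void *obj;
	int err;

	err = liveupdate_flb_get_outgoing(&foo_flb, &obj);
	if (err)
		return err;

	shared = obj;
	return foo_note_file_in_shared_state(shared);
}

/* Post-reboot: restoring one file in the new kernel. */
static int foo_file_restore(void)
{
	struct foo_shared *shared;
	void *obj;
	int err;

	err = liveupdate_flb_get_incoming(&foo_flb, &obj);
	if (err)	/* -ENODATA/-ENOENT: nothing was preserved for this FLB */
		return err;

	shared = obj;
	return foo_attach_file_to_shared_state(shared);
}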
+ */ +int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + guard(mutex)(&private->outgoing.lock); + *objp = private->outgoing.obj; + + return 0; +} + +int __init luo_flb_setup_outgoing(void *fdt_out) +{ + struct luo_flb_header_ser *header_ser; + u64 header_ser_pa; + int err; + + header_ser = kho_alloc_preserve(LUO_FLB_PGCNT << PAGE_SHIFT); + if (IS_ERR(header_ser)) + return PTR_ERR(header_ser); + + header_ser_pa = virt_to_phys(header_ser); + + err = fdt_begin_node(fdt_out, LUO_FDT_FLB_NODE_NAME); + err |= fdt_property_string(fdt_out, "compatible", + LUO_FDT_FLB_COMPATIBLE); + err |= fdt_property(fdt_out, LUO_FDT_FLB_HEADER, &header_ser_pa, + sizeof(header_ser_pa)); + err |= fdt_end_node(fdt_out); + + if (err) + goto err_unpreserve; + + header_ser->pgcnt = LUO_FLB_PGCNT; + luo_flb_global.outgoing.header_ser = header_ser; + luo_flb_global.outgoing.ser = (void *)(header_ser + 1); + luo_flb_global.outgoing.active = true; + + return 0; + +err_unpreserve: + kho_unpreserve_free(header_ser); + + return err; +} + +int __init luo_flb_setup_incoming(void *fdt_in) +{ + struct luo_flb_header_ser *header_ser; + int err, header_size, offset; + const void *ptr; + u64 header_ser_pa; + + offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_FLB_NODE_NAME); + if (offset < 0) { + pr_err("Unable to get FLB node [%s]\n", LUO_FDT_FLB_NODE_NAME); + + return -ENOENT; + } + + err = fdt_node_check_compatible(fdt_in, offset, + LUO_FDT_FLB_COMPATIBLE); + if (err) { + pr_err("FLB node is incompatible with '%s' [%d]\n", + LUO_FDT_FLB_COMPATIBLE, err); + + return -EINVAL; + } + + header_size = 0; + ptr = fdt_getprop(fdt_in, offset, LUO_FDT_FLB_HEADER, &header_size); + if (!ptr || header_size != sizeof(u64)) { + pr_err("Unable to get FLB header property '%s' [%d]\n", + LUO_FDT_FLB_HEADER, header_size); + + return -EINVAL; + } + + header_ser_pa = get_unaligned((u64 *)ptr); + header_ser = phys_to_virt(header_ser_pa); + + luo_flb_global.incoming.header_ser = header_ser; + luo_flb_global.incoming.ser = (void *)(header_ser + 1); + luo_flb_global.incoming.active = true; + + return 0; +} + +/** + * luo_flb_serialize - Serializes all active FLB objects for KHO. + * + * This function is called from the reboot path. It iterates through all + * registered File-Lifecycle-Bound (FLB) objects. For each FLB that has been + * preserved (i.e., its reference count is greater than zero), it writes its + * metadata into the memory region designated for Kexec Handover. + * + * The serialized data includes the FLB's compatibility string, its opaque + * data handle, and the final reference count. This allows the new kernel to + * find the appropriate handler and reconstruct the FLB's state. + * + * Context: Called from liveupdate_reboot() just before kho_finalize(). 
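The serialized form set up by luo_flb_setup_outgoing() and consumed by luo_flb_setup_incoming() is a KHO-preserved region of LUO_FLB_PGCNT pages: a small header followed immediately by an array of fixed-size entries, which is where the LUO_FLB_MAX arithmetic comes from, and whose physical address is recorded in the LUO FDT under LUO_FDT_FLB_HEADER. The real struct definitions live in the kho/abi/luo.h header and are not part of this diff, so the sketch below is an assumption inferred only from the fields this file touches (pgcnt, count, name, data); LUO_FLB_NAME_LEN in particular is a made-up placeholder:

/* Assumed shape, inferred from usage in this file only. */
struct luo_flb_header_ser {
	u64 pgcnt;			/* pages backing this region (LUO_FLB_PGCNT) */
	u64 count;			/* number of valid entries that follow */
};

struct luo_flb_ser {
	char name[LUO_FLB_NAME_LEN];	/* FLB compatible string, length assumed */
	u64 data;			/* opaque handle produced by .preserve() */
	u64 count;			/* outgoing reference count at serialization time */
};

/*
 * Layout within the preserved pages:
 *
 *   [ luo_flb_header_ser ][ luo_flb_ser[0] ][ luo_flb_ser[1] ] ...
 *
 * so LUO_FLB_MAX is simply how many entries fit after the header, and
 * luo_flb_global.{outgoing,incoming}.ser points at (header_ser + 1).
 */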
+ */ +void luo_flb_serialize(void) +{ + struct luo_flb_header *fh = &luo_flb_global.outgoing; + struct liveupdate_flb *gflb; + int i = 0; + + list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { + struct luo_flb_private *private = luo_flb_get_private(gflb); + + if (private->outgoing.count > 0) { + strscpy(fh->ser[i].name, gflb->compatible, + sizeof(fh->ser[i].name)); + fh->ser[i].data = private->outgoing.data; + fh->ser[i].count = private->outgoing.count; + i++; + } + } + + fh->header_ser->count = i; +} diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index c8973b543d1d..8083d8739b09 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -40,13 +40,6 @@ static inline int luo_ucmd_respond(struct luo_ucmd *ucmd, */ #define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__) -/* Mimics list_for_each_entry() but for private list head entries */ -#define luo_list_for_each_private(pos, head, member) \ - for (struct list_head *__iter = (head)->next; \ - __iter != (head) && \ - ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \ - __iter = __iter->next) - /** * struct luo_file_set - A set of files that belong to the same sessions. * @files_list: An ordered list of files associated with this session, it is @@ -107,4 +100,19 @@ int luo_file_deserialize(struct luo_file_set *file_set, void luo_file_set_init(struct luo_file_set *file_set); void luo_file_set_destroy(struct luo_file_set *file_set); +int luo_flb_file_preserve(struct liveupdate_file_handler *fh); +void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh); +void luo_flb_file_finish(struct liveupdate_file_handler *fh); +int __init luo_flb_setup_outgoing(void *fdt); +int __init luo_flb_setup_incoming(void *fdt); +void luo_flb_serialize(void); + +#ifdef CONFIG_LIVEUPDATE_TEST +void liveupdate_test_register(struct liveupdate_file_handler *fh); +void liveupdate_test_unregister(struct liveupdate_file_handler *fh); +#else +static inline void liveupdate_test_register(struct liveupdate_file_handler *fh) { } +static inline void liveupdate_test_unregister(struct liveupdate_file_handler *fh) { } +#endif + #endif /* _LINUX_LUO_INTERNAL_H */ diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index bcb1b9fea588..79b5e45f8d4c 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -13,7 +13,8 @@ #include <linux/slab.h> #include <linux/ww_mutex.h> -static DEFINE_WD_CLASS(ww_class); +static DEFINE_WD_CLASS(wd_class); +static DEFINE_WW_CLASS(ww_class); struct workqueue_struct *wq; #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH @@ -54,16 +55,16 @@ static void test_mutex_work(struct work_struct *work) ww_mutex_unlock(&mtx->mutex); } -static int __test_mutex(unsigned int flags) +static int __test_mutex(struct ww_class *class, unsigned int flags) { #define TIMEOUT (HZ / 16) struct test_mutex mtx; struct ww_acquire_ctx ctx; int ret; - ww_mutex_init(&mtx.mutex, &ww_class); + ww_mutex_init(&mtx.mutex, class); if (flags & TEST_MTX_CTX) - ww_acquire_init(&ctx, &ww_class); + ww_acquire_init(&ctx, class); INIT_WORK_ONSTACK(&mtx.work, test_mutex_work); init_completion(&mtx.ready); @@ -71,7 +72,7 @@ static int __test_mutex(unsigned int flags) init_completion(&mtx.done); mtx.flags = flags; - schedule_work(&mtx.work); + queue_work(wq, &mtx.work); wait_for_completion(&mtx.ready); ww_mutex_lock(&mtx.mutex, (flags & TEST_MTX_CTX) ? 
&ctx : NULL); @@ -106,13 +107,13 @@ static int __test_mutex(unsigned int flags) #undef TIMEOUT } -static int test_mutex(void) +static int test_mutex(struct ww_class *class) { int ret; int i; for (i = 0; i < __TEST_MTX_LAST; i++) { - ret = __test_mutex(i); + ret = __test_mutex(class, i); if (ret) return ret; } @@ -120,15 +121,15 @@ static int test_mutex(void) return 0; } -static int test_aa(bool trylock) +static int test_aa(struct ww_class *class, bool trylock) { struct ww_mutex mutex; struct ww_acquire_ctx ctx; int ret; const char *from = trylock ? "trylock" : "lock"; - ww_mutex_init(&mutex, &ww_class); - ww_acquire_init(&ctx, &ww_class); + ww_mutex_init(&mutex, class); + ww_acquire_init(&ctx, class); if (!trylock) { ret = ww_mutex_lock(&mutex, &ctx); @@ -177,6 +178,7 @@ out: struct test_abba { struct work_struct work; + struct ww_class *class; struct ww_mutex a_mutex; struct ww_mutex b_mutex; struct completion a_ready; @@ -191,7 +193,7 @@ static void test_abba_work(struct work_struct *work) struct ww_acquire_ctx ctx; int err; - ww_acquire_init_noinject(&ctx, &ww_class); + ww_acquire_init_noinject(&ctx, abba->class); if (!abba->trylock) ww_mutex_lock(&abba->b_mutex, &ctx); else @@ -217,23 +219,24 @@ static void test_abba_work(struct work_struct *work) abba->result = err; } -static int test_abba(bool trylock, bool resolve) +static int test_abba(struct ww_class *class, bool trylock, bool resolve) { struct test_abba abba; struct ww_acquire_ctx ctx; int err, ret; - ww_mutex_init(&abba.a_mutex, &ww_class); - ww_mutex_init(&abba.b_mutex, &ww_class); + ww_mutex_init(&abba.a_mutex, class); + ww_mutex_init(&abba.b_mutex, class); INIT_WORK_ONSTACK(&abba.work, test_abba_work); init_completion(&abba.a_ready); init_completion(&abba.b_ready); + abba.class = class; abba.trylock = trylock; abba.resolve = resolve; - schedule_work(&abba.work); + queue_work(wq, &abba.work); - ww_acquire_init_noinject(&ctx, &ww_class); + ww_acquire_init_noinject(&ctx, class); if (!trylock) ww_mutex_lock(&abba.a_mutex, &ctx); else @@ -278,6 +281,7 @@ static int test_abba(bool trylock, bool resolve) struct test_cycle { struct work_struct work; + struct ww_class *class; struct ww_mutex a_mutex; struct ww_mutex *b_mutex; struct completion *a_signal; @@ -291,7 +295,7 @@ static void test_cycle_work(struct work_struct *work) struct ww_acquire_ctx ctx; int err, erra = 0; - ww_acquire_init_noinject(&ctx, &ww_class); + ww_acquire_init_noinject(&ctx, cycle->class); ww_mutex_lock(&cycle->a_mutex, &ctx); complete(cycle->a_signal); @@ -314,7 +318,7 @@ static void test_cycle_work(struct work_struct *work) cycle->result = err ?: erra; } -static int __test_cycle(unsigned int nthreads) +static int __test_cycle(struct ww_class *class, unsigned int nthreads) { struct test_cycle *cycles; unsigned int n, last = nthreads - 1; @@ -327,7 +331,8 @@ static int __test_cycle(unsigned int nthreads) for (n = 0; n < nthreads; n++) { struct test_cycle *cycle = &cycles[n]; - ww_mutex_init(&cycle->a_mutex, &ww_class); + cycle->class = class; + ww_mutex_init(&cycle->a_mutex, class); if (n == last) cycle->b_mutex = &cycles[0].a_mutex; else @@ -367,13 +372,13 @@ static int __test_cycle(unsigned int nthreads) return ret; } -static int test_cycle(unsigned int ncpus) +static int test_cycle(struct ww_class *class, unsigned int ncpus) { unsigned int n; int ret; for (n = 2; n <= ncpus + 1; n++) { - ret = __test_cycle(n); + ret = __test_cycle(class, n); if (ret) return ret; } @@ -384,6 +389,7 @@ static int test_cycle(unsigned int ncpus) struct stress { struct 
work_struct work; struct ww_mutex *locks; + struct ww_class *class; unsigned long timeout; int nlocks; }; @@ -443,7 +449,7 @@ static void stress_inorder_work(struct work_struct *work) int contended = -1; int n, err; - ww_acquire_init(&ctx, &ww_class); + ww_acquire_init(&ctx, stress->class); retry: err = 0; for (n = 0; n < nlocks; n++) { @@ -511,7 +517,7 @@ static void stress_reorder_work(struct work_struct *work) order = NULL; do { - ww_acquire_init(&ctx, &ww_class); + ww_acquire_init(&ctx, stress->class); list_for_each_entry(ll, &locks, link) { err = ww_mutex_lock(ll->lock, &ctx); @@ -570,7 +576,7 @@ static void stress_one_work(struct work_struct *work) #define STRESS_ONE BIT(2) #define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) -static int stress(int nlocks, int nthreads, unsigned int flags) +static int stress(struct ww_class *class, int nlocks, int nthreads, unsigned int flags) { struct ww_mutex *locks; struct stress *stress_array; @@ -588,7 +594,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) } for (n = 0; n < nlocks; n++) - ww_mutex_init(&locks[n], &ww_class); + ww_mutex_init(&locks[n], class); count = 0; for (n = 0; nthreads; n++) { @@ -617,6 +623,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) stress = &stress_array[count++]; INIT_WORK(&stress->work, fn); + stress->class = class; stress->locks = locks; stress->nlocks = nlocks; stress->timeout = jiffies + 2*HZ; @@ -635,59 +642,131 @@ static int stress(int nlocks, int nthreads, unsigned int flags) return 0; } -static int __init test_ww_mutex_init(void) +static int run_tests(struct ww_class *class) { int ncpus = num_online_cpus(); int ret, i; - printk(KERN_INFO "Beginning ww mutex selftests\n"); - - prandom_seed_state(&rng, get_random_u64()); - - wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); - if (!wq) - return -ENOMEM; - - ret = test_mutex(); + ret = test_mutex(class); if (ret) return ret; - ret = test_aa(false); + ret = test_aa(class, false); if (ret) return ret; - ret = test_aa(true); + ret = test_aa(class, true); if (ret) return ret; for (i = 0; i < 4; i++) { - ret = test_abba(i & 1, i & 2); + ret = test_abba(class, i & 1, i & 2); if (ret) return ret; } - ret = test_cycle(ncpus); + ret = test_cycle(class, ncpus); + if (ret) + return ret; + + ret = stress(class, 16, 2 * ncpus, STRESS_INORDER); + if (ret) + return ret; + + ret = stress(class, 16, 2 * ncpus, STRESS_REORDER); if (ret) return ret; - ret = stress(16, 2*ncpus, STRESS_INORDER); + ret = stress(class, 2046, hweight32(STRESS_ALL) * ncpus, STRESS_ALL); if (ret) return ret; - ret = stress(16, 2*ncpus, STRESS_REORDER); + return 0; +} + +static int run_test_classes(void) +{ + int ret; + + pr_info("Beginning ww (wound) mutex selftests\n"); + + ret = run_tests(&ww_class); if (ret) return ret; - ret = stress(2046, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); + pr_info("Beginning ww (die) mutex selftests\n"); + ret = run_tests(&wd_class); if (ret) return ret; - printk(KERN_INFO "All ww mutex selftests passed\n"); + pr_info("All ww mutex selftests passed\n"); return 0; } +static DEFINE_MUTEX(run_lock); + +static ssize_t run_tests_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!mutex_trylock(&run_lock)) { + pr_err("Test already running\n"); + return count; + } + + run_test_classes(); + mutex_unlock(&run_lock); + + return count; +} + +static struct kobj_attribute run_tests_attribute = + __ATTR(run_tests, 0664, NULL, run_tests_store); + +static struct attribute *attrs[] = { + 
&run_tests_attribute.attr, + NULL, /* need to NULL terminate the list of attributes */ +}; + +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +static struct kobject *test_ww_mutex_kobj; + +static int __init test_ww_mutex_init(void) +{ + int ret; + + prandom_seed_state(&rng, get_random_u64()); + + wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); + if (!wq) + return -ENOMEM; + + test_ww_mutex_kobj = kobject_create_and_add("test_ww_mutex", kernel_kobj); + if (!test_ww_mutex_kobj) { + destroy_workqueue(wq); + return -ENOMEM; + } + + /* Create the files associated with this kobject */ + ret = sysfs_create_group(test_ww_mutex_kobj, &attr_group); + if (ret) { + kobject_put(test_ww_mutex_kobj); + destroy_workqueue(wq); + return ret; + } + + mutex_lock(&run_lock); + ret = run_test_classes(); + mutex_unlock(&run_lock); + + return ret; +} + static void __exit test_ww_mutex_exit(void) { + kobject_put(test_ww_mutex_kobj); destroy_workqueue(wq); } diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 2a1beebf1d37..be74917802ad 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -299,10 +299,6 @@ choice possible to load a signed module containing the algorithm to check the signature on that module. -config MODULE_SIG_SHA1 - bool "SHA-1" - select CRYPTO_SHA1 - config MODULE_SIG_SHA256 bool "SHA-256" select CRYPTO_SHA256 @@ -332,7 +328,6 @@ endchoice config MODULE_SIG_HASH string depends on MODULE_SIG || IMA_APPRAISE_MODSIG - default "sha1" if MODULE_SIG_SHA1 default "sha256" if MODULE_SIG_SHA256 default "sha384" if MODULE_SIG_SHA384 default "sha512" if MODULE_SIG_SHA512 diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index 474e68f0f063..36f52a232a12 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -17,16 +17,16 @@ static int module_extend_max_pages(struct load_info *info, unsigned int extent) { struct page **new_pages; + unsigned int new_max = info->max_pages + extent; - new_pages = kvmalloc_array(info->max_pages + extent, - sizeof(info->pages), GFP_KERNEL); + new_pages = kvrealloc(info->pages, + size_mul(new_max, sizeof(*info->pages)), + GFP_KERNEL); if (!new_pages) return -ENOMEM; - memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages)); - kvfree(info->pages); info->pages = new_pages; - info->max_pages += extent; + info->max_pages = new_max; return 0; } diff --git a/kernel/module/dups.c b/kernel/module/dups.c index bd2149fbe117..0b633f2edda6 100644 --- a/kernel/module/dups.c +++ b/kernel/module/dups.c @@ -113,7 +113,7 @@ static void kmod_dup_request_complete(struct work_struct *work) * let this linger forever as this is just a boot optimization for * possible abuses of vmalloc() incurred by finit_module() thrashing. */ - queue_delayed_work(system_wq, &kmod_req->delete_work, 60 * HZ); + queue_delayed_work(system_dfl_wq, &kmod_req->delete_work, 60 * HZ); } bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret) @@ -240,7 +240,7 @@ void kmod_dup_request_announce(char *module_name, int ret) * There is no rush. But we also don't want to hold the * caller up forever or introduce any boot delays. 
*/ - queue_work(system_wq, &kmod_req->complete_work); + queue_work(system_dfl_wq, &kmod_req->complete_work); out: mutex_unlock(&kmod_dup_mutex); diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 00a60796327c..0fc11e45df9b 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -334,13 +334,8 @@ int module_address_lookup(unsigned long addr, if (mod) { if (modname) *modname = mod->name; - if (modbuildid) { -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) - *modbuildid = mod->build_id; -#else - *modbuildid = NULL; -#endif - } + if (modbuildid) + *modbuildid = module_buildid(mod); sym = find_kallsyms_symbol(mod, addr, size, offset); diff --git a/kernel/padata.c b/kernel/padata.c index aa66d91e20f9..db7c75787a2b 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -819,7 +819,7 @@ static void __padata_free(struct padata_instance *pinst) #define kobj2pinst(_kobj) \ container_of(_kobj, struct padata_instance, kobj) #define attr2pentry(_attr) \ - container_of(_attr, struct padata_sysfs_entry, attr) + container_of_const(_attr, struct padata_sysfs_entry, attr) static void padata_sysfs_release(struct kobject *kobj) { @@ -829,13 +829,13 @@ static void padata_sysfs_release(struct kobject *kobj) struct padata_sysfs_entry { struct attribute attr; - ssize_t (*show)(struct padata_instance *, struct attribute *, char *); - ssize_t (*store)(struct padata_instance *, struct attribute *, + ssize_t (*show)(struct padata_instance *, const struct attribute *, char *); + ssize_t (*store)(struct padata_instance *, const struct attribute *, const char *, size_t); }; static ssize_t show_cpumask(struct padata_instance *pinst, - struct attribute *attr, char *buf) + const struct attribute *attr, char *buf) { struct cpumask *cpumask; ssize_t len; @@ -853,7 +853,7 @@ static ssize_t show_cpumask(struct padata_instance *pinst, } static ssize_t store_cpumask(struct padata_instance *pinst, - struct attribute *attr, + const struct attribute *attr, const char *buf, size_t count) { cpumask_var_t new_cpumask; @@ -880,10 +880,10 @@ out: } #define PADATA_ATTR_RW(_name, _show_name, _store_name) \ - static struct padata_sysfs_entry _name##_attr = \ + static const struct padata_sysfs_entry _name##_attr = \ __ATTR(_name, 0644, _show_name, _store_name) -#define PADATA_ATTR_RO(_name, _show_name) \ - static struct padata_sysfs_entry _name##_attr = \ +#define PADATA_ATTR_RO(_name, _show_name) \ + static const struct padata_sysfs_entry _name##_attr = \ __ATTR(_name, 0400, _show_name, NULL) PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask); @@ -894,7 +894,7 @@ PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask); * serial_cpumask [RW] - cpumask for serial workers * parallel_cpumask [RW] - cpumask for parallel workers */ -static struct attribute *padata_default_attrs[] = { +static const struct attribute *const padata_default_attrs[] = { &serial_cpumask_attr.attr, ¶llel_cpumask_attr.attr, NULL, @@ -904,8 +904,8 @@ ATTRIBUTE_GROUPS(padata_default); static ssize_t padata_sysfs_show(struct kobject *kobj, struct attribute *attr, char *buf) { + const struct padata_sysfs_entry *pentry; struct padata_instance *pinst; - struct padata_sysfs_entry *pentry; ssize_t ret = -EIO; pinst = kobj2pinst(kobj); @@ -919,8 +919,8 @@ static ssize_t padata_sysfs_show(struct kobject *kobj, static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { + const struct padata_sysfs_entry *pentry; struct padata_instance *pinst; - struct padata_sysfs_entry *pentry; ssize_t 
ret = -EIO; pinst = kobj2pinst(kobj); diff --git a/kernel/panic.c b/kernel/panic.c index 0d52210a9e2b..c78600212b6c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -42,6 +42,7 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +#define PANIC_MSG_BUFSZ 1024 #ifdef CONFIG_SMP /* @@ -74,6 +75,8 @@ EXPORT_SYMBOL_GPL(panic_timeout); unsigned long panic_print; +static int panic_force_cpu = -1; + ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); @@ -131,7 +134,8 @@ static int proc_taint(const struct ctl_table *table, int write, static int sysctl_panic_print_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - panic_print_deprecated(); + if (write) + panic_print_deprecated(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } @@ -299,6 +303,150 @@ void __weak crash_smp_send_stop(void) } atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +atomic_t panic_redirect_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); + +#if defined(CONFIG_SMP) && defined(CONFIG_CRASH_DUMP) +static char *panic_force_buf; + +static int __init panic_force_cpu_setup(char *str) +{ + int cpu; + + if (!str) + return -EINVAL; + + if (kstrtoint(str, 0, &cpu) || cpu < 0 || cpu >= nr_cpu_ids) { + pr_warn("panic_force_cpu: invalid value '%s'\n", str); + return -EINVAL; + } + + panic_force_cpu = cpu; + return 0; +} +early_param("panic_force_cpu", panic_force_cpu_setup); + +static int __init panic_force_cpu_late_init(void) +{ + if (panic_force_cpu < 0) + return 0; + + panic_force_buf = kmalloc(PANIC_MSG_BUFSZ, GFP_KERNEL); + + return 0; +} +late_initcall(panic_force_cpu_late_init); + +static void do_panic_on_target_cpu(void *info) +{ + panic("%s", (char *)info); +} + +/** + * panic_smp_redirect_cpu - Redirect panic to target CPU + * @target_cpu: CPU that should handle the panic + * @msg: formatted panic message + * + * Default implementation uses IPI. Architectures with NMI support + * can override this for more reliable delivery. + * + * Return: 0 on success, negative errno on failure + */ +int __weak panic_smp_redirect_cpu(int target_cpu, void *msg) +{ + static call_single_data_t panic_csd; + + panic_csd.func = do_panic_on_target_cpu; + panic_csd.info = msg; + + return smp_call_function_single_async(target_cpu, &panic_csd); +} + +/** + * panic_try_force_cpu - Redirect panic to a specific CPU for crash kernel + * @fmt: panic message format string + * @args: arguments for format string + * + * Some platforms require panic handling to occur on a specific CPU + * for the crash kernel to function correctly. This function redirects + * panic handling to the CPU specified via the panic_force_cpu= boot parameter. + * + * Returns false if panic should proceed on current CPU. + * Returns true if panic was redirected. + */ +__printf(1, 0) +static bool panic_try_force_cpu(const char *fmt, va_list args) +{ + int this_cpu = raw_smp_processor_id(); + int old_cpu = PANIC_CPU_INVALID; + const char *msg; + + /* Feature not enabled via boot parameter */ + if (panic_force_cpu < 0) + return false; + + /* Already on target CPU - proceed normally */ + if (this_cpu == panic_force_cpu) + return false; + + /* Target CPU is offline, can't redirect */ + if (!cpu_online(panic_force_cpu)) { + pr_warn("panic: target CPU %d is offline, continuing on CPU %d\n", + panic_force_cpu, this_cpu); + return false; + } + + /* Another panic already in progress */ + if (panic_in_progress()) + return false; + + /* + * Only one CPU can do the redirect. 
Use atomic cmpxchg to ensure + * we don't race with another CPU also trying to redirect. + */ + if (!atomic_try_cmpxchg(&panic_redirect_cpu, &old_cpu, this_cpu)) + return false; + + /* + * Use dynamically allocated buffer if available, otherwise + * fall back to static message for early boot panics or allocation failure. + */ + if (panic_force_buf) { + vsnprintf(panic_force_buf, PANIC_MSG_BUFSZ, fmt, args); + msg = panic_force_buf; + } else { + msg = "Redirected panic (buffer unavailable)"; + } + + console_verbose(); + bust_spinlocks(1); + + pr_emerg("panic: Redirecting from CPU %d to CPU %d for crash kernel.\n", + this_cpu, panic_force_cpu); + + /* Dump original CPU before redirecting */ + if (!test_taint(TAINT_DIE) && + oops_in_progress <= 1 && + IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { + dump_stack(); + } + + if (panic_smp_redirect_cpu(panic_force_cpu, (void *)msg) != 0) { + atomic_set(&panic_redirect_cpu, PANIC_CPU_INVALID); + pr_warn("panic: failed to redirect to CPU %d, continuing on CPU %d\n", + panic_force_cpu, this_cpu); + return false; + } + + /* IPI/NMI sent, this CPU should stop */ + return true; +} +#else +__printf(1, 0) +static inline bool panic_try_force_cpu(const char *fmt, va_list args) +{ + return false; +} +#endif /* CONFIG_SMP && CONFIG_CRASH_DUMP */ bool panic_try_start(void) { @@ -427,7 +575,7 @@ static void panic_other_cpus_shutdown(bool crash_kexec) */ void vpanic(const char *fmt, va_list args) { - static char buf[1024]; + static char buf[PANIC_MSG_BUFSZ]; long i, i_next = 0, len; int state = 0; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; @@ -451,6 +599,15 @@ void vpanic(const char *fmt, va_list args) local_irq_disable(); preempt_disable_notrace(); + /* Redirect panic to target CPU if configured via panic_force_cpu=. */ + if (panic_try_force_cpu(fmt, args)) { + /* + * Mark ourselves offline so panic_other_cpus_shutdown() won't wait + * for us on architectures that check num_online_cpus(). + */ + set_cpu_online(smp_processor_id(), false); + panic_smp_self_stop(); + } /* * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want @@ -483,7 +640,11 @@ void vpanic(const char *fmt, va_list args) /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (test_taint(TAINT_DIE) || oops_in_progress > 1) { + if (atomic_read(&panic_redirect_cpu) != PANIC_CPU_INVALID && + panic_force_cpu == raw_smp_processor_id()) { + pr_emerg("panic: Redirected from CPU %d, skipping stack dump.\n", + atomic_read(&panic_redirect_cpu)); + } else if (test_taint(TAINT_DIE) || oops_in_progress > 1) { panic_this_cpu_backtrace_printed = true; } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { dump_stack(); @@ -1014,7 +1175,6 @@ static int panic_print_set(const char *val, const struct kernel_param *kp) static int panic_print_get(char *val, const struct kernel_param *kp) { - panic_print_deprecated(); return param_get_ulong(val, kp); } diff --git a/kernel/params.c b/kernel/params.c index b96cfd693c99..7c2242f64bf0 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -596,12 +596,6 @@ static ssize_t param_attr_store(const struct module_attribute *mattr, } #endif -#ifdef CONFIG_MODULES -#define __modinit -#else -#define __modinit __init -#endif - #ifdef CONFIG_SYSFS void kernel_param_lock(struct module *mod) { @@ -626,9 +620,9 @@ EXPORT_SYMBOL(kernel_param_unlock); * create file in sysfs. Returns an error on out of memory. Always cleans up * if there's an error. 
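Stepping back to the panic_force_cpu= support added in kernel/panic.c above: booting with, say, panic_force_cpu=0 makes a panic raised on any other online CPU re-raise itself on CPU 0 before the crash kernel is entered. The default redirect path is the __weak panic_smp_redirect_cpu(), which uses a plain async IPI and therefore cannot reach a target CPU that is running with interrupts disabled; the patch notes that architectures with NMI support can override it for more reliable delivery. A sketch of what such an override could look like, where arch_send_panic_nmi() is a hypothetical helper and not an existing kernel API:

/* Hypothetical arch-side override of the __weak default in kernel/panic.c. */
static void *panic_redirect_msg;

int panic_smp_redirect_cpu(int target_cpu, void *msg)
{
	panic_redirect_msg = msg;

	/*
	 * Deliver an NMI whose handler ends up calling
	 * panic("%s", (char *)panic_redirect_msg) on target_cpu, so the
	 * redirect also works when that CPU has interrupts disabled.
	 */
	return arch_send_panic_nmi(target_cpu);
}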
*/ -static __modinit int add_sysfs_param(struct module_kobject *mk, - const struct kernel_param *kp, - const char *name) +static __init_or_module int add_sysfs_param(struct module_kobject *mk, + const struct kernel_param *kp, + const char *name) { struct module_param_attrs *new_mp; struct attribute **new_attrs; @@ -761,7 +755,8 @@ void destroy_params(const struct kernel_param *params, unsigned num) params[i].ops->free(params[i].arg); } -struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name) +struct module_kobject * __init_or_module +lookup_or_create_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; diff --git a/kernel/pid.c b/kernel/pid.c index a31771bc89c1..f45ae56db7da 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -159,58 +159,86 @@ void free_pids(struct pid **pids) free_pid(pids[tmp]); } -struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, - size_t set_tid_size) +struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, + size_t arg_set_tid_size) { + int set_tid[MAX_PID_NS_LEVEL + 1] = {}; + int pid_max[MAX_PID_NS_LEVEL + 1] = {}; struct pid *pid; enum pid_type type; int i, nr; struct pid_namespace *tmp; struct upid *upid; int retval = -ENOMEM; + bool retried_preload; /* - * set_tid_size contains the size of the set_tid array. Starting at + * arg_set_tid_size contains the size of the arg_set_tid array. Starting at * the most nested currently active PID namespace it tells alloc_pid() * which PID to set for a process in that most nested PID namespace - * up to set_tid_size PID namespaces. It does not have to set the PID - * for a process in all nested PID namespaces but set_tid_size must + * up to arg_set_tid_size PID namespaces. It does not have to set the PID + * for a process in all nested PID namespaces but arg_set_tid_size must * never be greater than the current ns->level + 1. */ - if (set_tid_size > ns->level + 1) + if (arg_set_tid_size > ns->level + 1) return ERR_PTR(-EINVAL); + /* + * Prep before we take locks: + * + * 1. allocate and fill in pid struct + */ pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); - tmp = ns; + get_pid_ns(ns); pid->level = ns->level; + refcount_set(&pid->count, 1); + spin_lock_init(&pid->lock); + for (type = 0; type < PIDTYPE_MAX; ++type) + INIT_HLIST_HEAD(&pid->tasks[type]); + init_waitqueue_head(&pid->wait_pidfd); + INIT_HLIST_HEAD(&pid->inodes); - for (i = ns->level; i >= 0; i--) { - int tid = 0; - int pid_max = READ_ONCE(tmp->pid_max); + /* + * 2. perm check checkpoint_restore_ns_capable() + * + * This stores found pid_max to make sure the used value is the same should + * later code need it. + */ + for (tmp = ns, i = ns->level; i >= 0; i--) { + pid_max[ns->level - i] = READ_ONCE(tmp->pid_max); - if (set_tid_size) { - tid = set_tid[ns->level - i]; + if (arg_set_tid_size) { + int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i]; retval = -EINVAL; - if (tid < 1 || tid >= pid_max) - goto out_free; + if (tid < 1 || tid >= pid_max[ns->level - i]) + goto out_abort; /* * Also fail if a PID != 1 is requested and * no PID 1 exists. 
*/ if (tid != 1 && !tmp->child_reaper) - goto out_free; + goto out_abort; retval = -EPERM; if (!checkpoint_restore_ns_capable(tmp->user_ns)) - goto out_free; - set_tid_size--; + goto out_abort; + arg_set_tid_size--; } - idr_preload(GFP_KERNEL); - spin_lock(&pidmap_lock); + tmp = tmp->parent; + } + + /* + * Prep is done, id allocation goes here: + */ + retried_preload = false; + idr_preload(GFP_KERNEL); + spin_lock(&pidmap_lock); + for (tmp = ns, i = ns->level; i >= 0;) { + int tid = set_tid[ns->level - i]; if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, @@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * alreay in use. Return EEXIST in that case. */ if (nr == -ENOSPC) + nr = -EEXIST; } else { int pid_min = 1; @@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * a partially initialized PID (see below). */ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, - pid_max, GFP_ATOMIC); + pid_max[ns->level - i], GFP_ATOMIC); + if (nr == -ENOSPC) + nr = -EAGAIN; } - spin_unlock(&pidmap_lock); - idr_preload_end(); - if (nr < 0) { - retval = (nr == -ENOSPC) ? -EAGAIN : nr; + if (unlikely(nr < 0)) { + /* + * Preload more memory if idr_alloc{,cyclic} failed with -ENOMEM. + * + * The IDR API only allows us to preload memory for one call, while we may end + * up doing several under pidmap_lock with GFP_ATOMIC. The situation may be + * salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload + * did not help (the routine unfortunately returns void, so we have no idea + * if it got anywhere). + * + * The lock can be safely dropped and picked up as historically pid allocation + * for different namespaces was *not* atomic -- we try to hold on to it the + * entire time only for performance reasons. + */ + if (nr == -ENOMEM && !retried_preload) { + spin_unlock(&pidmap_lock); + idr_preload_end(); + retried_preload = true; + idr_preload(GFP_KERNEL); + spin_lock(&pidmap_lock); + continue; + } + retval = nr; goto out_free; } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; tmp = tmp->parent; + i--; + retried_preload = false; } /* @@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * is what we have exposed to userspace for a long time and it is * documented behavior for pid namespaces. So we can't easily * change it even if there were an error code better suited. + * + * This can't be done earlier because we need to preserve other + * error conditions. */ retval = -ENOMEM; - - get_pid_ns(ns); - refcount_set(&pid->count, 1); - spin_lock_init(&pid->lock); - for (type = 0; type < PIDTYPE_MAX; ++type) - INIT_HLIST_HEAD(&pid->tasks[type]); - - init_waitqueue_head(&pid->wait_pidfd); - INIT_HLIST_HEAD(&pid->inodes); - - upid = pid->numbers + ns->level; - idr_preload(GFP_KERNEL); - spin_lock(&pidmap_lock); - if (!(ns->pid_allocated & PIDNS_ADDING)) - goto out_unlock; + if (unlikely(!(ns->pid_allocated & PIDNS_ADDING))) + goto out_free; pidfs_add_pid(pid); - for ( ; upid >= pid->numbers; --upid) { + for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. 
*/ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; @@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, return pid; -out_unlock: - spin_unlock(&pidmap_lock); - idr_preload_end(); - put_pid_ns(ns); - out_free: - spin_lock(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); @@ -303,7 +339,10 @@ out_free: idr_set_cursor(&ns->idr, 0); spin_unlock(&pidmap_lock); + idr_preload_end(); +out_abort: + put_pid_ns(ns); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); } diff --git a/kernel/power/main.c b/kernel/power/main.c index 03b2c5495c77..5f8c9e12eaec 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -1125,7 +1125,7 @@ EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueues(void) { - pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0); + pm_wq = alloc_workqueue("pm", WQ_UNBOUND, 0); if (!pm_wq) return -ENOMEM; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8050e5182835..c4eb284b8e72 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -174,10 +174,10 @@ sector_t alloc_swapdev_block(int swap) * Allocate a swap page and register that it has been allocated, so that * it can be freed in case of an error. */ - offset = swp_offset(get_swap_page_of_type(swap)); + offset = swp_offset(swap_alloc_hibernation_slot(swap)); if (offset) { if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); + swap_free_hibernation_slot(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } @@ -186,6 +186,7 @@ sector_t alloc_swapdev_block(int swap) void free_all_swap_pages(int swap) { + unsigned long offset; struct rb_node *node; /* @@ -197,8 +198,9 @@ void free_all_swap_pages(int swap) ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); - swap_free_nr(swp_entry(swap, ext->start), - ext->end - ext->start + 1); + + for (offset = ext->start; offset <= ext->end; offset++) + swap_free_hibernation_slot(swp_entry(swap, offset)); kfree(ext); } @@ -902,8 +904,8 @@ out_clean: for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); - if (data[thr].cr) - acomp_request_free(data[thr].cr); + + acomp_request_free(data[thr].cr); if (!IS_ERR_OR_NULL(data[thr].cc)) crypto_free_acomp(data[thr].cc); @@ -1502,8 +1504,8 @@ out_clean: for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); - if (data[thr].cr) - acomp_request_free(data[thr].cr); + + acomp_request_free(data[thr].cr); if (!IS_ERR_OR_NULL(data[thr].cc)) crypto_free_acomp(data[thr].cc); diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 5f5f626f4279..5fdea5682756 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -281,12 +281,20 @@ struct printk_buffers { * nothing to output and this record should be skipped. * @seq: The sequence number of the record used for @pbufs->outbuf. * @dropped: The number of dropped records from reading @seq. + * @cpu: CPU on which the message was generated. + * @pid: PID of the task that generated the message + * @comm: Name of the task that generated the message. 
*/ struct printk_message { struct printk_buffers *pbufs; unsigned int outbuf_len; u64 seq; unsigned long dropped; +#ifdef CONFIG_PRINTK_EXECUTION_CTX + int cpu; + pid_t pid; + char comm[TASK_COMM_LEN]; +#endif }; bool printk_get_next_message(struct printk_message *pmsg, u64 seq, diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 32fc12e53675..d558b18505cd 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -946,6 +946,20 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) } EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf); +#ifdef CONFIG_PRINTK_EXECUTION_CTX +static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt, + struct printk_message *pmsg) +{ + wctxt->cpu = pmsg->cpu; + wctxt->pid = pmsg->pid; + memcpy(wctxt->comm, pmsg->comm, sizeof(wctxt->comm)); + static_assert(sizeof(wctxt->comm) == sizeof(pmsg->comm)); +} +#else +static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt, + struct printk_message *pmsg) {} +#endif + /** * nbcon_emit_next_record - Emit a record in the acquired context * @wctxt: The write context that will be handed to the write function @@ -1048,6 +1062,8 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_a /* Initialize the write context for driver callbacks. */ nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len); + wctxt_load_execution_ctx(wctxt, &pmsg); + if (use_atomic) con->write_atomic(con, wctxt); else @@ -1758,9 +1774,12 @@ bool nbcon_alloc(struct console *con) /* Synchronize the kthread start. */ lockdep_assert_console_list_lock_held(); - /* The write_thread() callback is mandatory. */ - if (WARN_ON(!con->write_thread)) + /* Check for mandatory nbcon callbacks. */ + if (WARN_ON(!con->write_thread || + !con->device_lock || + !con->device_unlock)) { return false; + } rcuwait_init(&con->rcuwait); init_irq_work(&con->irq_work, nbcon_irq_work); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1d765ad242b8..4ddeb55e1207 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -245,6 +245,7 @@ int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, * For console list or console->flags updates */ void console_list_lock(void) + __acquires(&console_mutex) { /* * In unregister_console() and console_force_preferred_locked(), @@ -269,6 +270,7 @@ EXPORT_SYMBOL(console_list_lock); * Counterpart to console_list_lock() */ void console_list_unlock(void) + __releases(&console_mutex) { mutex_unlock(&console_mutex); } @@ -2131,11 +2133,39 @@ static inline void printk_delay(int level) } } +#define CALLER_ID_MASK 0x80000000 + static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : - 0x80000000 + smp_processor_id(); + CALLER_ID_MASK + smp_processor_id(); +} + +#ifdef CONFIG_PRINTK_EXECUTION_CTX +/* Store the opposite info than caller_id. */ +static u32 printk_caller_id2(void) +{ + return !in_task() ? task_pid_nr(current) : + CALLER_ID_MASK + smp_processor_id(); +} + +static pid_t printk_info_get_pid(const struct printk_info *info) +{ + u32 caller_id = info->caller_id; + u32 caller_id2 = info->caller_id2; + + return caller_id & CALLER_ID_MASK ? caller_id2 : caller_id; +} + +static int printk_info_get_cpu(const struct printk_info *info) +{ + u32 caller_id = info->caller_id; + u32 caller_id2 = info->caller_id2; + + return ((caller_id & CALLER_ID_MASK ? + caller_id : caller_id2) & ~CALLER_ID_MASK); } +#endif /** * printk_parse_prefix - Parse level and control flags. 
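The caller_id/caller_id2 pairing above means both the pid and the cpu are always recorded for a message: whichever of the two the existing caller_id did not capture (pid in task context, cpu elsewhere) goes into caller_id2, with the 0x80000000 marker distinguishing the cpu encoding, and the printk_info_get_pid()/printk_info_get_cpu() helpers pick the right field by testing that bit. A tiny standalone illustration of the decode logic, written as plain userspace C rather than kernel code:

#include <stdint.h>
#include <stdio.h>

#define CALLER_ID_MASK 0x80000000u

/* Mirrors printk_info_get_pid()/printk_info_get_cpu() from the patch. */
static uint32_t get_pid(uint32_t id, uint32_t id2)
{
	return (id & CALLER_ID_MASK) ? id2 : id;
}

static uint32_t get_cpu(uint32_t id, uint32_t id2)
{
	return ((id & CALLER_ID_MASK) ? id : id2) & ~CALLER_ID_MASK;
}

int main(void)
{
	/* Printed from task context: pid 1234 running on cpu 3. */
	uint32_t task_id = 1234, task_id2 = CALLER_ID_MASK + 3;
	/* Printed from irq context on cpu 7, interrupting pid 42. */
	uint32_t irq_id = CALLER_ID_MASK + 7, irq_id2 = 42;

	printf("task ctx: pid=%u cpu=%u\n", get_pid(task_id, task_id2), get_cpu(task_id, task_id2));
	printf("irq ctx:  pid=%u cpu=%u\n", get_pid(irq_id, irq_id2), get_cpu(irq_id, irq_id2));
	return 0;
}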
@@ -2213,6 +2243,28 @@ static u16 printk_sprint(char *text, u16 size, int facility, return text_len; } +#ifdef CONFIG_PRINTK_EXECUTION_CTX +static void printk_store_execution_ctx(struct printk_info *info) +{ + info->caller_id2 = printk_caller_id2(); + get_task_comm(info->comm, current); +} + +static void pmsg_load_execution_ctx(struct printk_message *pmsg, + const struct printk_info *info) +{ + pmsg->cpu = printk_info_get_cpu(info); + pmsg->pid = printk_info_get_pid(info); + memcpy(pmsg->comm, info->comm, sizeof(pmsg->comm)); + static_assert(sizeof(pmsg->comm) == sizeof(info->comm)); +} +#else +static void printk_store_execution_ctx(struct printk_info *info) {} + +static void pmsg_load_execution_ctx(struct printk_message *pmsg, + const struct printk_info *info) {} +#endif + __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, @@ -2320,6 +2372,7 @@ int vprintk_store(int facility, int level, r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); + printk_store_execution_ctx(r.info); /* A message without a trailing newline can be continued. */ if (!(flags & LOG_NEWLINE)) @@ -3002,6 +3055,7 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, pmsg->seq = r.info->seq; pmsg->dropped = r.info->seq - seq; force_con = r.info->flags & LOG_FORCE_CON; + pmsg_load_execution_ctx(pmsg, r.info); /* * Skip records that are not forced to be printed on consoles and that diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 4ef81349d9fb..1651b53ece34 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -23,6 +23,11 @@ struct printk_info { u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ u32 caller_id; /* thread id or processor id */ +#ifdef CONFIG_PRINTK_EXECUTION_CTX + u32 caller_id2; /* caller_id complement */ + /* name of the task that generated the message */ + char comm[TASK_COMM_LEN]; +#endif struct dev_printk_info dev_info; }; diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 4d9b21f69eaa..762299291e09 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -82,7 +82,7 @@ config NEED_SRCU_NMI_SAFE def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU config TASKS_RCU_GENERIC - def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU + def_bool TASKS_RCU || TASKS_RUDE_RCU help This option enables generic infrastructure code supporting task-based RCU implementations. Not for manual selection. @@ -142,6 +142,29 @@ config TASKS_TRACE_RCU default n select IRQ_WORK +config TASKS_TRACE_RCU_NO_MB + bool "Override RCU Tasks Trace inclusion of read-side memory barriers" + depends on RCU_EXPERT && TASKS_TRACE_RCU + default ARCH_WANTS_NO_INSTR + help + This option prevents the use of read-side memory barriers in + rcu_read_lock_tasks_trace() and rcu_read_unlock_tasks_trace() + even in kernels built with CONFIG_ARCH_WANTS_NO_INSTR=n, that is, + in kernels that do not have noinstr set up in entry/exit code. + By setting this option, you are promising to carefully review + use of ftrace, BPF, and friends to ensure that no tracing + operation is attached to a function that runs in that portion + of the entry/exit code that RCU does not watch, that is, + where rcu_is_watching() returns false. Alternatively, you + might choose to never remove traces except by rebooting. 
+ + Those wishing to disable read-side memory barriers for an entire + architecture can select this Kconfig option, hence the polarity. + + Say Y here if you need speed and will review use of tracing. + Say N here for certain esoteric testing of RCU itself. + Take the default if you are unsure. + config RCU_STALL_COMMON def_bool TREE_RCU help @@ -313,24 +336,6 @@ config RCU_NOCB_CPU_CB_BOOST Say Y here if you want to set RT priority for offloading kthreads. Say N here if you are building a !PREEMPT_RT kernel and are unsure. -config TASKS_TRACE_RCU_READ_MB - bool "Tasks Trace RCU readers use memory barriers in user and idle" - depends on RCU_EXPERT && TASKS_TRACE_RCU - default PREEMPT_RT || NR_CPUS < 8 - help - Use this option to further reduce the number of IPIs sent - to CPUs executing in userspace or idle during tasks trace - RCU grace periods. Given that a reasonable setting of - the rcupdate.rcu_task_ipi_delay kernel boot parameter - eliminates such IPIs for many workloads, proper setting - of this Kconfig option is important mostly for aggressive - real-time installations and for battery-powered devices, - hence the default chosen above. - - Say Y here if you hate IPIs. - Say N here if you hate read-side memory barriers. - Take the default if you are unsure. - config RCU_LAZY bool "RCU callback lazy invocation functionality" depends on RCU_NOCB_CPU diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 9cf01832a6c3..dc5d614b372c 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -544,10 +544,6 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void); void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq); #endif // # ifdef CONFIG_TASKS_RUDE_RCU -#ifdef CONFIG_TASKS_TRACE_RCU -void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq); -#endif - #ifdef CONFIG_TASKS_RCU_GENERIC void tasks_cblist_init_generic(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ @@ -673,11 +669,6 @@ void show_rcu_tasks_rude_gp_kthread(void); #else static inline void show_rcu_tasks_rude_gp_kthread(void) {} #endif -#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU) -void show_rcu_tasks_trace_gp_kthread(void); -#else -static inline void show_rcu_tasks_trace_gp_kthread(void) {} -#endif #ifdef CONFIG_TINY_RCU static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 7484d8ad5767..1c50f89fbd6f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -400,11 +400,6 @@ static void tasks_trace_scale_read_unlock(int idx) rcu_read_unlock_trace(); } -static void rcu_tasks_trace_scale_stats(void) -{ - rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG); -} - static struct rcu_scale_ops tasks_tracing_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -416,8 +411,6 @@ static struct rcu_scale_ops tasks_tracing_ops = { .gp_barrier = rcu_barrier_tasks_trace, .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, - .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, - .stats = IS_ENABLED(CONFIG_TINY_RCU) ? 
NULL : rcu_tasks_trace_scale_stats, .name = "tasks-tracing" }; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 07e51974b06b..47ce7f49b52c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1178,10 +1178,9 @@ static struct rcu_torture_ops tasks_tracing_ops = { .deferred_free = rcu_tasks_tracing_torture_deferred_free, .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, + .exp_current = rcu_tasks_trace_expedite_current, .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, - .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, - .get_gp_data = rcu_tasks_trace_get_gp_data, .cbflood_max = 50000, .irq_capable = 1, .slow_gps = 1, @@ -1750,7 +1749,7 @@ rcu_torture_writer(void *arg) ulo[i] = cur_ops->get_comp_state(); gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; - if (cur_ops->exp_current && !torture_random(&rand) % 0xff) + if (cur_ops->exp_current && !(torture_random(&rand) & 0xff)) cur_ops->exp_current(); while (!cur_ops->poll_gp_state(gp_snap)) { gp_snap1 = cur_ops->get_gp_state(); @@ -1772,7 +1771,7 @@ rcu_torture_writer(void *arg) cur_ops->get_comp_state_full(&rgo[i]); cur_ops->start_gp_poll_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; - if (cur_ops->exp_current && !torture_random(&rand) % 0xff) + if (cur_ops->exp_current && !(torture_random(&rand) & 0xff)) cur_ops->exp_current(); while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { cur_ops->get_gp_state_full(&gp_snap1_full); @@ -2455,6 +2454,9 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); */ static void rcu_torture_timer(struct timer_list *unused) { + WARN_ON_ONCE(!in_serving_softirq()); + WARN_ON_ONCE(in_hardirq()); + WARN_ON_ONCE(in_nmi()); atomic_long_inc(&n_rcu_torture_timers); (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index ea3f128de06f..c469c708fdd6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -262,7 +262,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { - if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) + if (!init_srcu_struct_nodes(ssp, is_static ? GFP_ATOMIC : GFP_KERNEL)) goto err_free_sda; WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 2dc044fd126e..76f952196a29 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -161,11 +161,6 @@ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); #endif -/* Avoid IPIing CPUs early in the grace period. */ -#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) -static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; -module_param(rcu_task_ipi_delay, int, 0644); - /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30) #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) @@ -718,7 +713,6 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } - /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. 
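For reference, the rcu_torture_writer() change above fixes an operator-precedence trap: logical negation binds tighter than % or &, so the old test "!torture_random(&rand) % 0xff" evaluated (!random) % 0xff and fired only when the random value happened to be exactly zero, rather than the intended roughly once per 256 iterations. A standalone C illustration (fake_random() and its seed are stand-ins for torture_random()):

#include <stdio.h>

/* Stand-in for torture_random(): returns an arbitrary pseudo-random value. */
static unsigned long fake_random(unsigned long seed)
{
        return seed * 2654435761UL + 1;
}

int main(void)
{
        unsigned long r = fake_random(42);

        /* Old form: parsed as (!r) % 0xff, nonzero only when r == 0. */
        printf("old test: %d\n", !r % 0xff);

        /* New form: nonzero when the low eight bits are zero, ~1 in 256. */
        printf("new test: %d\n", !(r & 0xff));
        return 0;
}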
*/ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { @@ -801,9 +795,7 @@ static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *t #endif // #ifndef CONFIG_TINY_RCU -static void exit_tasks_rcu_finish_trace(struct task_struct *t); - -#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) +#if defined(CONFIG_TASKS_RCU) //////////////////////////////////////////////////////////////////////// // @@ -898,7 +890,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) rtp->postgp_func(rtp); } -#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */ +#endif /* #if defined(CONFIG_TASKS_RCU) */ #ifdef CONFIG_TASKS_RCU @@ -1322,13 +1314,11 @@ void exit_tasks_rcu_finish(void) raw_spin_lock_irqsave_rcu_node(rtpcp, flags); list_del_init(&t->rcu_tasks_exit_list); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - - exit_tasks_rcu_finish_trace(t); } #else /* #ifdef CONFIG_TASKS_RCU */ void exit_tasks_rcu_start(void) { } -void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } +void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU @@ -1449,682 +1439,11 @@ EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ -//////////////////////////////////////////////////////////////////////// -// -// Tracing variant of Tasks RCU. This variant is designed to be used -// to protect tracing hooks, including those of BPF. This variant -// therefore: -// -// 1. Has explicit read-side markers to allow finite grace periods -// in the face of in-kernel loops for PREEMPT=n builds. -// -// 2. Protects code in the idle loop, exception entry/exit, and -// CPU-hotplug code paths, similar to the capabilities of SRCU. -// -// 3. Avoids expensive read-side instructions, having overhead similar -// to that of Preemptible RCU. -// -// There are of course downsides. For example, the grace-period code -// can send IPIs to CPUs, even when those CPUs are in the idle loop or -// in nohz_full userspace. If needed, these downsides can be at least -// partially remedied. -// -// Perhaps most important, this variant of RCU does not affect the vanilla -// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace -// readers can operate from idle, offline, and exception entry/exit in no -// way allows rcu_preempt and rcu_sched readers to also do so. -// -// The implementation uses rcu_tasks_wait_gp(), which relies on function -// pointers in the rcu_tasks structure. The rcu_spawn_tasks_trace_kthread() -// function sets these function pointers up so that rcu_tasks_wait_gp() -// invokes these functions in this order: -// -// rcu_tasks_trace_pregp_step(): -// Disables CPU hotplug, adds all currently executing tasks to the -// holdout list, then checks the state of all tasks that blocked -// or were preempted within their current RCU Tasks Trace read-side -// critical section, adding them to the holdout list if appropriate. -// Finally, this function re-enables CPU hotplug. -// The ->pertask_func() pointer is NULL, so there is no per-task processing. -// rcu_tasks_trace_postscan(): -// Invokes synchronize_rcu() to wait for late-stage exiting tasks -// to finish exiting. -// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty: -// Scans the holdout list, attempting to identify a quiescent state -// for each task on the list. If there is a quiescent state, the -// corresponding task is removed from the holdout list. 
Once this -// list is empty, the grace period has completed. -// rcu_tasks_trace_postgp(): -// Provides the needed full memory barrier and does debug checks. -// -// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks. -// -// Pre-grace-period update-side code is ordered before the grace period -// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period -// read-side code is ordered before the grace period by atomic operations -// on .b.need_qs flag of each task involved in this process, or by scheduler -// context-switch ordering (for locked-down non-running readers). - -// The lockdep state must be outside of #ifdef to be useful. -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_trace_key; -struct lockdep_map rcu_trace_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); -EXPORT_SYMBOL_GPL(rcu_trace_lock_map); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -#ifdef CONFIG_TASKS_TRACE_RCU - -// Record outstanding IPIs to each CPU. No point in sending two... -static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); - -// The number of detections of task quiescent state relying on -// heavyweight readers executing explicit memory barriers. -static unsigned long n_heavy_reader_attempts; -static unsigned long n_heavy_reader_updates; -static unsigned long n_heavy_reader_ofl_updates; -static unsigned long n_trc_holdouts; - -void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, - "RCU Tasks Trace"); - -/* Load from ->trc_reader_special.b.need_qs with proper ordering. */ -static u8 rcu_ld_need_qs(struct task_struct *t) -{ - smp_mb(); // Enforce full grace-period ordering. - return smp_load_acquire(&t->trc_reader_special.b.need_qs); -} - -/* Store to ->trc_reader_special.b.need_qs with proper ordering. */ -static void rcu_st_need_qs(struct task_struct *t, u8 v) -{ - smp_store_release(&t->trc_reader_special.b.need_qs, v); - smp_mb(); // Enforce full grace-period ordering. -} - -/* - * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for - * the four-byte operand-size restriction of some platforms. - * - * Returns the old value, which is often ignored. - */ -u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) -{ - return cmpxchg(&t->trc_reader_special.b.need_qs, old, new); -} -EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); - -/* - * If we are the last reader, signal the grace-period kthread. - * Also remove from the per-CPU list of blocked tasks. - */ -void rcu_read_unlock_trace_special(struct task_struct *t) -{ - unsigned long flags; - struct rcu_tasks_percpu *rtpcp; - union rcu_special trs; - - // Open-coded full-word version of rcu_ld_need_qs(). - smp_mb(); // Enforce full grace-period ordering. - trs = smp_load_acquire(&t->trc_reader_special); - - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) - smp_mb(); // Pairs with update-side barriers. - // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. 
- if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) { - u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS, - TRC_NEED_QS_CHECKED); - - WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result); - } - if (trs.b.blocked) { - rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu); - raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - list_del_init(&t->trc_blkd_node); - WRITE_ONCE(t->trc_reader_special.b.blocked, false); - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - } - WRITE_ONCE(t->trc_reader_nesting, 0); -} -EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); - -/* Add a newly blocked reader task to its CPU's list. */ -void rcu_tasks_trace_qs_blkd(struct task_struct *t) -{ - unsigned long flags; - struct rcu_tasks_percpu *rtpcp; - - local_irq_save(flags); - rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu); - raw_spin_lock_rcu_node(rtpcp); // irqs already disabled - t->trc_blkd_cpu = smp_processor_id(); - if (!rtpcp->rtp_blkd_tasks.next) - INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); - list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); - WRITE_ONCE(t->trc_reader_special.b.blocked, true); - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd); - -/* Add a task to the holdout list, if it is not already on the list. */ -static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) -{ - if (list_empty(&t->trc_holdout_list)) { - get_task_struct(t); - list_add(&t->trc_holdout_list, bhp); - n_trc_holdouts++; - } -} - -/* Remove a task from the holdout list, if it is in fact present. */ -static void trc_del_holdout(struct task_struct *t) -{ - if (!list_empty(&t->trc_holdout_list)) { - list_del_init(&t->trc_holdout_list); - put_task_struct(t); - n_trc_holdouts--; - } -} - -/* IPI handler to check task state. */ -static void trc_read_check_handler(void *t_in) -{ - int nesting; - struct task_struct *t = current; - struct task_struct *texp = t_in; - - // If the task is no longer running on this CPU, leave. - if (unlikely(texp != t)) - goto reset_ipi; // Already on holdout list, so will check later. - - // If the task is not in a read-side critical section, and - // if this is the last reader, awaken the grace-period kthread. - nesting = READ_ONCE(t->trc_reader_nesting); - if (likely(!nesting)) { - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); - goto reset_ipi; - } - // If we are racing with an rcu_read_unlock_trace(), try again later. - if (unlikely(nesting < 0)) - goto reset_ipi; - - // Get here if the task is in a read-side critical section. - // Set its state so that it will update state for the grace-period - // kthread upon exit from that critical section. - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED); - -reset_ipi: - // Allow future IPIs to be sent on CPU and for task. - // Also order this IPI handler against any later manipulations of - // the intended task. - smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ - smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ -} - -/* Callback function for scheduler to check locked-down task. */ -static int trc_inspect_reader(struct task_struct *t, void *bhp_in) -{ - struct list_head *bhp = bhp_in; - int cpu = task_cpu(t); - int nesting; - bool ofl = cpu_is_offline(cpu); - - if (task_curr(t) && !ofl) { - // If no chance of heavyweight readers, do it the hard way. 
- if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - return -EINVAL; - - // If heavyweight readers are enabled on the remote task, - // we can inspect its state despite its currently running. - // However, we cannot safely change its state. - n_heavy_reader_attempts++; - // Check for "running" idle tasks on offline CPUs. - if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting)) - return -EINVAL; // No quiescent state, do it the hard way. - n_heavy_reader_updates++; - nesting = 0; - } else { - // The task is not running, so C-language access is safe. - nesting = t->trc_reader_nesting; - WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t)))); - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl) - n_heavy_reader_ofl_updates++; - } - - // If not exiting a read-side critical section, mark as checked - // so that the grace-period kthread will remove it from the - // holdout list. - if (!nesting) { - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); - return 0; // In QS, so done. - } - if (nesting < 0) - return -EINVAL; // Reader transitioning, try again later. - - // The task is in a read-side critical section, so set up its - // state so that it will update state upon exit from that critical - // section. - if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) - trc_add_holdout(t, bhp); - return 0; -} - -/* Attempt to extract the state for the specified task. */ -static void trc_wait_for_one_reader(struct task_struct *t, - struct list_head *bhp) -{ - int cpu; - - // If a previous IPI is still in flight, let it complete. - if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI - return; - - // The current task had better be in a quiescent state. - if (t == current) { - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); - WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); - return; - } - - // Attempt to nail down the task for inspection. - get_task_struct(t); - if (!task_call_func(t, trc_inspect_reader, bhp)) { - put_task_struct(t); - return; - } - put_task_struct(t); - - // If this task is not yet on the holdout list, then we are in - // an RCU read-side critical section. Otherwise, the invocation of - // trc_add_holdout() that added it to the list did the necessary - // get_task_struct(). Either way, the task cannot be freed out - // from under this code. - - // If currently running, send an IPI, either way, add to list. - trc_add_holdout(t, bhp); - if (task_curr(t) && - time_after(jiffies + 1, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { - // The task is currently running, so try IPIing it. - cpu = task_cpu(t); - - // If there is already an IPI outstanding, let it happen. - if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) - return; - - per_cpu(trc_ipi_to_cpu, cpu) = true; - t->trc_ipi_to_cpu = cpu; - rcu_tasks_trace.n_ipis++; - if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) { - // Just in case there is some other reason for - // failure than the target CPU being offline. - WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n", - __func__, cpu); - rcu_tasks_trace.n_ipis_fails++; - per_cpu(trc_ipi_to_cpu, cpu) = false; - t->trc_ipi_to_cpu = -1; - } - } -} - -/* - * Initialize for first-round processing for the specified task. - * Return false if task is NULL or already taken care of, true otherwise. - */ -static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself) -{ - // During early boot when there is only the one boot CPU, there - // is no idle task for the other CPUs. 
Also, the grace-period - // kthread is always in a quiescent state. In addition, just return - // if this task is already on the list. - if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list)) - return false; - - rcu_st_need_qs(t, 0); - t->trc_ipi_to_cpu = -1; - return true; -} - -/* Do first-round processing for the specified task. */ -static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) -{ - if (rcu_tasks_trace_pertask_prep(t, true)) - trc_wait_for_one_reader(t, hop); -} - -/* Initialize for a new RCU-tasks-trace grace period. */ -static void rcu_tasks_trace_pregp_step(struct list_head *hop) -{ - LIST_HEAD(blkd_tasks); - int cpu; - unsigned long flags; - struct rcu_tasks_percpu *rtpcp; - struct task_struct *t; - - // There shouldn't be any old IPIs, but... - for_each_possible_cpu(cpu) - WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); - - // Disable CPU hotplug across the CPU scan for the benefit of - // any IPIs that might be needed. This also waits for all readers - // in CPU-hotplug code paths. - cpus_read_lock(); - - // These rcu_tasks_trace_pertask_prep() calls are serialized to - // allow safe access to the hop list. - for_each_online_cpu(cpu) { - rcu_read_lock(); - // Note that cpu_curr_snapshot() picks up the target - // CPU's current task while its runqueue is locked with - // an smp_mb__after_spinlock(). This ensures that either - // the grace-period kthread will see that task's read-side - // critical section or the task will see the updater's pre-GP - // accesses. The trailing smp_mb() in cpu_curr_snapshot() - // does not currently play a role other than simplify - // that function's ordering semantics. If these simplified - // ordering semantics continue to be redundant, that smp_mb() - // might be removed. - t = cpu_curr_snapshot(cpu); - if (rcu_tasks_trace_pertask_prep(t, true)) - trc_add_holdout(t, hop); - rcu_read_unlock(); - cond_resched_tasks_rcu_qs(); - } - - // Only after all running tasks have been accounted for is it - // safe to take care of the tasks that have blocked within their - // current RCU tasks trace read-side critical section. - for_each_possible_cpu(cpu) { - rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu); - raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks); - while (!list_empty(&blkd_tasks)) { - rcu_read_lock(); - t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node); - list_del_init(&t->trc_blkd_node); - list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - rcu_tasks_trace_pertask(t, hop); - rcu_read_unlock(); - raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - } - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - cond_resched_tasks_rcu_qs(); - } - - // Re-enable CPU hotplug now that the holdout list is populated. - cpus_read_unlock(); -} - -/* - * Do intermediate processing between task and holdout scans. - */ -static void rcu_tasks_trace_postscan(struct list_head *hop) -{ - // Wait for late-stage exiting tasks to finish exiting. - // These might have passed the call to exit_tasks_rcu_finish(). - - // If you remove the following line, update rcu_trace_implies_rcu_gp()!!! - synchronize_rcu(); - // Any tasks that exit after this point will set - // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs. -} - -/* Communicate task state back to the RCU tasks trace stall warning request. 
*/ -struct trc_stall_chk_rdr { - int nesting; - int ipi_to_cpu; - u8 needqs; -}; - -static int trc_check_slow_task(struct task_struct *t, void *arg) -{ - struct trc_stall_chk_rdr *trc_rdrp = arg; - - if (task_curr(t) && cpu_online(task_cpu(t))) - return false; // It is running, so decline to inspect it. - trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting); - trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu); - trc_rdrp->needqs = rcu_ld_need_qs(t); - return true; -} - -/* Show the state of a task stalling the current RCU tasks trace GP. */ -static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) -{ - int cpu; - struct trc_stall_chk_rdr trc_rdr; - bool is_idle_tsk = is_idle_task(t); - - if (*firstreport) { - pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - if (!task_call_func(t, trc_check_slow_task, &trc_rdr)) - pr_alert("P%d: %c%c\n", - t->pid, - ".I"[t->trc_ipi_to_cpu >= 0], - ".i"[is_idle_tsk]); - else - pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n", - t->pid, - ".I"[trc_rdr.ipi_to_cpu >= 0], - ".i"[is_idle_tsk], - ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], - ".B"[!!data_race(t->trc_reader_special.b.blocked)], - trc_rdr.nesting, - " !CN"[trc_rdr.needqs & 0x3], - " ?"[trc_rdr.needqs > 0x3], - cpu, cpu_online(cpu) ? "" : "(offline)"); - sched_show_task(t); -} - -/* List stalled IPIs for RCU tasks trace. */ -static void show_stalled_ipi_trace(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - if (per_cpu(trc_ipi_to_cpu, cpu)) - pr_alert("\tIPI outstanding to CPU %d\n", cpu); -} - -/* Do one scan of the holdout list. */ -static void check_all_holdout_tasks_trace(struct list_head *hop, - bool needreport, bool *firstreport) -{ - struct task_struct *g, *t; - - // Disable CPU hotplug across the holdout list scan for IPIs. - cpus_read_lock(); - - list_for_each_entry_safe(t, g, hop, trc_holdout_list) { - // If safe and needed, try to check the current task. - if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && - !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED)) - trc_wait_for_one_reader(t, hop); - - // If check succeeded, remove this task from the list. - if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 && - rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED) - trc_del_holdout(t); - else if (needreport) - show_stalled_task_trace(t, firstreport); - cond_resched_tasks_rcu_qs(); - } - - // Re-enable CPU hotplug now that the holdout list scan has completed. - cpus_read_unlock(); - - if (needreport) { - if (*firstreport) - pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n"); - show_stalled_ipi_trace(); - } -} - -static void rcu_tasks_trace_empty_fn(void *unused) -{ -} - -/* Wait for grace period to complete and provide ordering. */ -static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) -{ - int cpu; - - // Wait for any lingering IPI handlers to complete. Note that - // if a CPU has gone offline or transitioned to userspace in the - // meantime, all IPI handlers should have been drained beforehand. - // Yes, this assumes that CPUs process IPIs in order. If that ever - // changes, there will need to be a recheck and/or timed wait. - for_each_online_cpu(cpu) - if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))) - smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1); - - smp_mb(); // Caller's code must be ordered after wakeup. - // Pairs with pretty much every ordering primitive. -} - -/* Report any needed quiescent state for this exiting task. 
*/ -static void exit_tasks_rcu_finish_trace(struct task_struct *t) -{ - union rcu_special trs = READ_ONCE(t->trc_reader_special); - - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); - WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); - if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked)) - rcu_read_unlock_trace_special(t); - else - WRITE_ONCE(t->trc_reader_nesting, 0); -} - -/** - * call_rcu_tasks_trace() - Queue a callback trace task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a trace rcu-tasks - * grace period elapses, in other words after all currently executing - * trace rcu-tasks read-side critical sections have completed. These - * read-side critical sections are delimited by calls to rcu_read_lock_trace() - * and rcu_read_unlock_trace(). - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) -{ - call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace); -} -EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); - -/** - * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period - * - * Control will return to the caller some time after a trace rcu-tasks - * grace period has elapsed, in other words after all currently executing - * trace rcu-tasks read-side critical sections have elapsed. These read-side - * critical sections are delimited by calls to rcu_read_lock_trace() - * and rcu_read_unlock_trace(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function preambles - * and profiling hooks. The synchronize_rcu_tasks_trace() function is not - * (yet) intended for heavy use from multiple CPUs. - * - * See the description of synchronize_rcu() for more detailed information - * on memory ordering guarantees. - */ -void synchronize_rcu_tasks_trace(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section"); - synchronize_rcu_tasks_generic(&rcu_tasks_trace); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); - -/** - * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. 
- */ -void rcu_barrier_tasks_trace(void) -{ - rcu_barrier_tasks_generic(&rcu_tasks_trace); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); - -int rcu_tasks_trace_lazy_ms = -1; -module_param(rcu_tasks_trace_lazy_ms, int, 0444); - -static int __init rcu_spawn_tasks_trace_kthread(void) -{ - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { - rcu_tasks_trace.gp_sleep = HZ / 10; - rcu_tasks_trace.init_fract = HZ / 10; - } else { - rcu_tasks_trace.gp_sleep = HZ / 200; - if (rcu_tasks_trace.gp_sleep <= 0) - rcu_tasks_trace.gp_sleep = 1; - rcu_tasks_trace.init_fract = HZ / 200; - if (rcu_tasks_trace.init_fract <= 0) - rcu_tasks_trace.init_fract = 1; - } - if (rcu_tasks_trace_lazy_ms >= 0) - rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms); - rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; - rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; - rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; - rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; - rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); - return 0; -} - -#if !defined(CONFIG_TINY_RCU) -void show_rcu_tasks_trace_gp_kthread(void) -{ - char buf[64]; - - snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu", - data_race(n_trc_holdouts), - data_race(n_heavy_reader_ofl_updates), - data_race(n_heavy_reader_updates), - data_race(n_heavy_reader_attempts)); - show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); -} -EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); - -void rcu_tasks_trace_torture_stats_print(char *tt, char *tf) -{ - rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, ""); -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print); -#endif // !defined(CONFIG_TINY_RCU) - -struct task_struct *get_rcu_tasks_trace_gp_kthread(void) -{ - return rcu_tasks_trace.kthread_ptr; -} -EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread); - -void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq) -{ - *flags = 0; - *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq); -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data); - -#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ -static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } -#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ - #ifndef CONFIG_TINY_RCU void show_rcu_tasks_gp_kthreads(void) { show_rcu_tasks_classic_gp_kthread(); show_rcu_tasks_rude_gp_kthread(); - show_rcu_tasks_trace_gp_kthread(); } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -2251,10 +1570,6 @@ void __init tasks_cblist_init_generic(void) #ifdef CONFIG_TASKS_RUDE_RCU cblist_init_generic(&rcu_tasks_rude); #endif - -#ifdef CONFIG_TASKS_TRACE_RCU - cblist_init_generic(&rcu_tasks_trace); -#endif } static int __init rcu_init_tasks_generic(void) @@ -2267,10 +1582,6 @@ static int __init rcu_init_tasks_generic(void) rcu_spawn_tasks_rude_kthread(); #endif -#ifdef CONFIG_TASKS_TRACE_RCU - rcu_spawn_tasks_trace_kthread(); -#endif - // Run the self-tests. rcu_tasks_initiate_self_tests(); @@ -2281,3 +1592,16 @@ core_initcall(rcu_init_tasks_generic); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// is implemented via a straightforward mapping onto SRCU-fast. 
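With the flavor now backed by the srcu_struct defined just below, one plausible shape for the update-side part of the mapping described above is a set of thin wrappers over the standard SRCU API. The real wrappers live in headers outside this diff, so the bodies below are a hedged sketch under that assumption, not the actual implementation:

/* Illustrative only: grace periods, callbacks and barriers become SRCU ops. */
void synchronize_rcu_tasks_trace(void)
{
        synchronize_srcu(&rcu_tasks_trace_srcu_struct);
}

void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func)
{
        call_srcu(&rcu_tasks_trace_srcu_struct, rhp, func);
}

void rcu_barrier_tasks_trace(void)
{
        srcu_barrier(&rcu_tasks_trace_srcu_struct);
}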
+ +DEFINE_SRCU_FAST(rcu_tasks_trace_srcu_struct); +EXPORT_SYMBOL_GPL(rcu_tasks_trace_srcu_struct); + +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 293bbd9ac3f4..55df6d37145e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -160,6 +160,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); +static void rcu_report_qs_rdp(struct rcu_data *rdp); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); static bool rcu_rdp_cpu_online(struct rcu_data *rdp); @@ -1983,6 +1984,17 @@ static noinline_for_stack bool rcu_gp_init(void) if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) on_each_cpu(rcu_strict_gp_boundary, NULL, 0); + /* + * Immediately report QS for the GP kthread's CPU. The GP kthread + * cannot be in an RCU read-side critical section while running + * the FQS scan. This eliminates the need for a second FQS wait + * when all CPUs are idle. + */ + preempt_disable(); + rcu_qs(); + rcu_report_qs_rdp(this_cpu_ptr(&rcu_data)); + preempt_enable(); + return true; } @@ -3769,7 +3781,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp) } rcu_nocb_unlock(rdp); if (wake_nocb) - wake_nocb_gp(rdp, false); + wake_nocb_gp(rdp); smp_store_release(&rdp->barrier_seq_snap, gseq); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b8bbe7960cda..7dfc57e9adb1 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -203,7 +203,7 @@ struct rcu_data { /* during and after the last grace */ /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ - int defer_qs_iw_pending; /* Scheduler attention pending? */ + int defer_qs_pending; /* irqwork or softirq pending? */ struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ @@ -301,7 +301,6 @@ struct rcu_data { #define RCU_NOCB_WAKE_BYPASS 1 #define RCU_NOCB_WAKE_LAZY 2 #define RCU_NOCB_WAKE 3 -#define RCU_NOCB_WAKE_FORCE 4 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ @@ -500,7 +499,7 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); -static bool wake_nocb_gp(struct rcu_data *rdp, bool force); +static bool wake_nocb_gp(struct rcu_data *rdp); static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j, bool lazy); static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 96c49c56fc14..82cada459e5d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -589,7 +589,12 @@ static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigne pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask), ".T"[!!data_race(rnp_root->exp_tasks)]); - if (ndetected) { + if (!ndetected) { + // This is invoked from the grace-period worker, so + // a new grace period cannot have started. And if this + // worker were stalled, we would not get here. 
;-) + pr_err("INFO: Expedited stall ended before state dump start\n"); + } else { pr_err("blocking rcu_node structures (internal RCU debug):"); rcu_for_each_node_breadth_first(rnp) { if (rnp == rnp_root) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index e6cd56603cad..b3337c7231cc 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -190,9 +190,18 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } +/* Clear any pending deferred wakeup timer (nocb_gp_lock must be held). */ +static void nocb_defer_wakeup_cancel(struct rcu_data *rdp_gp) +{ + if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + timer_delete(&rdp_gp->nocb_timer); + } +} + static bool __wake_nocb_gp(struct rcu_data *rdp_gp, struct rcu_data *rdp, - bool force, unsigned long flags) + unsigned long flags) __releases(rdp_gp->nocb_gp_lock) { bool needwake = false; @@ -204,12 +213,9 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, return false; } - if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { - WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - timer_delete(&rdp_gp->nocb_timer); - } + nocb_defer_wakeup_cancel(rdp_gp); - if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { + if (READ_ONCE(rdp_gp->nocb_gp_sleep)) { WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); needwake = true; } @@ -225,13 +231,13 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, /* * Kick the GP kthread for this NOCB group. */ -static bool wake_nocb_gp(struct rcu_data *rdp, bool force) +static bool wake_nocb_gp(struct rcu_data *rdp) { unsigned long flags; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - return __wake_nocb_gp(rdp_gp, rdp, force, flags); + return __wake_nocb_gp(rdp_gp, rdp, flags); } #ifdef CONFIG_RCU_LAZY @@ -518,22 +524,17 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, } /* - * Awaken the no-CBs grace-period kthread if needed, either due to it - * legitimately being asleep or due to overload conditions. - * - * If warranted, also wake up the kthread servicing this CPUs queues. + * Awaken the no-CBs grace-period kthread if needed due to it legitimately + * being asleep. */ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, unsigned long flags) __releases(rdp->nocb_lock) { long bypass_len; - unsigned long cur_gp_seq; - unsigned long j; long lazy_len; long len; struct task_struct *t; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; // If we are being polled or there is no kthread, just leave. t = READ_ONCE(rdp->nocb_gp_kthread); @@ -549,47 +550,26 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, lazy_len = READ_ONCE(rdp->lazy_len); if (was_alldone) { rdp->qlen_last_fqs_check = len; + rcu_nocb_unlock(rdp); // Only lazy CBs in bypass list if (lazy_len && bypass_len == lazy_len) { - rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); } else if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ - rcu_nocb_unlock(rdp); - wake_nocb_gp(rdp, false); + wake_nocb_gp(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { - rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); } - } else if (len > rdp->qlen_last_fqs_check + qhimark) { - /* ... or if many callbacks queued. 
*/ - rdp->qlen_last_fqs_check = len; - j = jiffies; - if (j != rdp->nocb_gp_adv_time && - rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && - rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { - rcu_advance_cbs_nowake(rdp->mynode, rdp); - rdp->nocb_gp_adv_time = j; - } - smp_mb(); /* Enqueue before timer_pending(). */ - if ((rdp->nocb_cb_sleep || - !rcu_segcblist_ready_cbs(&rdp->cblist)) && - !timer_pending(&rdp_gp->nocb_timer)) { - rcu_nocb_unlock(rdp); - wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, - TPS("WakeOvfIsDeferred")); - } else { - rcu_nocb_unlock(rdp); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); - } - } else { - rcu_nocb_unlock(rdp); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + + return; } + + rcu_nocb_unlock(rdp); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, @@ -814,10 +794,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) if (rdp_toggling) my_rdp->nocb_toggling_rdp = NULL; - if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { - WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - timer_delete(&my_rdp->nocb_timer); - } + nocb_defer_wakeup_cancel(my_rdp); WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); } else { @@ -966,7 +943,6 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, unsigned long flags) __releases(rdp_gp->nocb_gp_lock) { - int ndw; int ret; if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) { @@ -974,8 +950,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, return false; } - ndw = rdp_gp->nocb_defer_wakeup; - ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + ret = __wake_nocb_gp(rdp_gp, rdp, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); return ret; @@ -991,7 +966,6 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t) trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); - smp_mb__after_spinlock(); /* Timer expire before wakeup. */ do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags); } @@ -1272,7 +1246,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) } rcu_nocb_try_flush_bypass(rdp, jiffies); rcu_nocb_unlock_irqrestore(rdp, flags); - wake_nocb_gp(rdp, false); + wake_nocb_gp(rdp); sc->nr_to_scan -= _count; count += _count; if (sc->nr_to_scan <= 0) @@ -1657,7 +1631,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) { } -static bool wake_nocb_gp(struct rcu_data *rdp, bool force) +static bool wake_nocb_gp(struct rcu_data *rdp) { return false; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index dbe2d02be824..95ad967adcf3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -487,8 +487,8 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) union rcu_special special; rdp = this_cpu_ptr(&rcu_data); - if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING) - rdp->defer_qs_iw_pending = DEFER_QS_IDLE; + if (rdp->defer_qs_pending == DEFER_QS_PENDING) + rdp->defer_qs_pending = DEFER_QS_IDLE; /* * If RCU core is waiting for this CPU to exit its critical section, @@ -645,7 +645,7 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) * 5. Deferred QS reporting does not happen. 
*/ if (rcu_preempt_depth() > 0) - WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE); + WRITE_ONCE(rdp->defer_qs_pending, DEFER_QS_IDLE); } /* @@ -747,7 +747,10 @@ static void rcu_read_unlock_special(struct task_struct *t) // Using softirq, safe to awaken, and either the // wakeup is free or there is either an expedited // GP in flight or a potential need to deboost. - raise_softirq_irqoff(RCU_SOFTIRQ); + if (rdp->defer_qs_pending != DEFER_QS_PENDING) { + rdp->defer_qs_pending = DEFER_QS_PENDING; + raise_softirq_irqoff(RCU_SOFTIRQ); + } } else { // Enabling BH or preempt does reschedule, so... // Also if no expediting and no possible deboosting, @@ -755,11 +758,11 @@ static void rcu_read_unlock_special(struct task_struct *t) // tick enabled. set_need_resched_current(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && + needs_exp && rdp->defer_qs_pending != DEFER_QS_PENDING && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - rdp->defer_qs_iw_pending = DEFER_QS_PENDING; + rdp->defer_qs_pending = DEFER_QS_PENDING; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } diff --git a/kernel/resource.c b/kernel/resource.c index e4e9bac12e6e..31341bdd7707 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -48,6 +48,14 @@ struct resource iomem_resource = { }; EXPORT_SYMBOL(iomem_resource); +struct resource soft_reserve_resource = { + .name = "Soft Reserved", + .start = 0, + .end = -1, + .desc = IORES_DESC_SOFT_RESERVED, + .flags = IORESOURCE_MEM, +}; + static DEFINE_RWLOCK(resource_lock); /* @@ -82,7 +90,7 @@ static struct resource *next_resource(struct resource *p, bool skip_children, #ifdef CONFIG_PROC_FS -enum { MAX_IORES_LEVEL = 5 }; +enum { MAX_IORES_LEVEL = 8 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) @@ -321,13 +329,14 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long } /** - * find_next_iomem_res - Finds the lowest iomem resource that covers part of - * [@start..@end]. + * find_next_res - Finds the lowest resource that covers part of + * [@start..@end]. * * If a resource is found, returns 0 and @*res is overwritten with the part * of the resource that's within [@start..@end]; if none is found, returns * -ENODEV. Returns -EINVAL for invalid parameters. * + * @parent: resource tree root to search * @start: start address of the resource searched for * @end: end address of same resource * @flags: flags which the resource must have @@ -337,9 +346,9 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long * The caller must specify @start, @end, @flags, and @desc * (which may be IORES_DESC_NONE). 
*/ -static int find_next_iomem_res(resource_size_t start, resource_size_t end, - unsigned long flags, unsigned long desc, - struct resource *res) +static int find_next_res(struct resource *parent, resource_size_t start, + resource_size_t end, unsigned long flags, + unsigned long desc, struct resource *res) { /* Skip children until we find a top level range that matches */ bool skip_children = true; @@ -353,7 +362,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for_each_resource(&iomem_resource, p, skip_children) { + for_each_resource(parent, p, skip_children) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -390,16 +399,23 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, return p ? 0 : -ENODEV; } -static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, - unsigned long flags, unsigned long desc, - void *arg, - int (*func)(struct resource *, void *)) +static int find_next_iomem_res(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + struct resource *res) +{ + return find_next_res(&iomem_resource, start, end, flags, desc, res); +} + +static int walk_res_desc(struct resource *parent, resource_size_t start, + resource_size_t end, unsigned long flags, + unsigned long desc, void *arg, + int (*func)(struct resource *, void *)) { struct resource res; int ret = -EINVAL; while (start < end && - !find_next_iomem_res(start, end, flags, desc, &res)) { + !find_next_res(parent, start, end, flags, desc, &res)) { ret = (*func)(&res, arg); if (ret) break; @@ -410,6 +426,15 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, return ret; } +static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + void *arg, + int (*func)(struct resource *, void *)) +{ + return walk_res_desc(&iomem_resource, start, end, flags, desc, arg, func); +} + + /** * walk_iomem_res_desc - Walks through iomem resources and calls func() * with matching resource ranges. @@ -435,6 +460,18 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, EXPORT_SYMBOL_GPL(walk_iomem_res_desc); /* + * In support of device drivers claiming Soft Reserved resources, walk the Soft + * Reserved resource deferral tree. + */ +int walk_soft_reserve_res(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + return walk_res_desc(&soft_reserve_resource, start, end, IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED, arg, func); +} +EXPORT_SYMBOL_GPL(walk_soft_reserve_res); + +/* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * Now, this function is only for System RAM, it deals with full ranges and @@ -656,6 +693,18 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags, } EXPORT_SYMBOL_GPL(region_intersects); +/* + * Check if the provided range is registered in the Soft Reserved resource + * deferral tree for driver consideration. 
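The resource.c refactoring above generalizes find_next_iomem_res()/__walk_iomem_res_desc() into find_next_res()/walk_res_desc() so the same iteration can also run over the new soft_reserve_resource tree. A hedged sketch of how a driver might use the exported walk_soft_reserve_res() helper; the callback and the pr_info() are illustrative and not part of this patch:

#include <linux/ioport.h>
#include <linux/printk.h>

/* Hypothetical callback: invoked once per matching Soft Reserved range. */
static int example_soft_reserved_cb(struct resource *res, void *arg)
{
        pr_info("soft-reserved candidate: %pR\n", res);
        /* A real driver would claim or further probe the range here. */
        return 0;
}

static void example_scan_soft_reserved(void)
{
        /* Walk every deferred Soft Reserved range in the address space. */
        walk_soft_reserve_res(0, -1ULL, NULL, example_soft_reserved_cb);
}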
+ */ +int region_intersects_soft_reserve(resource_size_t start, size_t size) +{ + guard(read_lock)(&resource_lock); + return __region_intersects(&soft_reserve_resource, start, size, + IORESOURCE_MEM, IORES_DESC_SOFT_RESERVED); +} +EXPORT_SYMBOL_GPL(region_intersects_soft_reserve); + void __weak arch_remove_reservations(struct resource *avail) { } diff --git a/kernel/rseq.c b/kernel/rseq.c index 395d8b002350..b0973d19f366 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -71,6 +71,9 @@ #define RSEQ_BUILD_SLOW_PATH #include <linux/debugfs.h> +#include <linux/hrtimer.h> +#include <linux/percpu.h> +#include <linux/prctl.h> #include <linux/ratelimit.h> #include <linux/rseq_entry.h> #include <linux/sched.h> @@ -120,7 +123,6 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, } #endif /* CONFIG_TRACEPOINTS */ -#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_RSEQ_STATS DEFINE_PER_CPU(struct rseq_stats, rseq_stats); @@ -138,6 +140,13 @@ static int rseq_stats_show(struct seq_file *m, void *p) stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu)); + stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu)); + stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu)); + stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu)); + stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu)); + } } seq_printf(m, "exit: %16lu\n", stats.exit); @@ -148,6 +157,13 @@ static int rseq_stats_show(struct seq_file *m, void *p) seq_printf(m, "cs: %16lu\n", stats.cs); seq_printf(m, "clear: %16lu\n", stats.clear); seq_printf(m, "fixup: %16lu\n", stats.fixup); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + seq_printf(m, "sgrant: %16lu\n", stats.s_granted); + seq_printf(m, "sexpir: %16lu\n", stats.s_expired); + seq_printf(m, "srevok: %16lu\n", stats.s_revoked); + seq_printf(m, "syield: %16lu\n", stats.s_yielded); + seq_printf(m, "sabort: %16lu\n", stats.s_aborted); + } return 0; } @@ -205,16 +221,19 @@ static const struct file_operations debug_ops = { .release = single_release, }; +static void rseq_slice_ext_init(struct dentry *root_dir); + static int __init rseq_debugfs_init(void) { struct dentry *root_dir = debugfs_create_dir("rseq", NULL); debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); rseq_stats_init(root_dir); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + rseq_slice_ext_init(root_dir); return 0; } __initcall(rseq_debugfs_init); -#endif /* CONFIG_DEBUG_FS */ static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) { @@ -389,6 +408,8 @@ static bool rseq_reset_ids(void) */ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { + u32 rseqfl = 0; + if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; @@ -405,7 +426,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 return 0; } - if (unlikely(flags)) + if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))) return -EINVAL; if (current->rseq.usrptr) { @@ -440,6 +461,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (rseq_slice_extension_enabled() && + (flags & 
RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + } + scoped_user_write_access(rseq, efault) { /* * If the rseq_cs pointer is non-NULL on registration, clear it to @@ -449,11 +477,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * clearing the fields. Don't bother reading it, just reset it. */ unsafe_put_user(0UL, &rseq->rseq_cs, efault); + unsafe_put_user(rseqfl, &rseq->flags, efault); /* Initialize IDs in user space */ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); unsafe_put_user(0U, &rseq->node_id, efault); unsafe_put_user(0U, &rseq->mm_cid, efault); + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); } /* @@ -464,6 +494,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 current->rseq.len = rseq_len; current->rseq.sig = sig; +#ifdef CONFIG_RSEQ_SLICE_EXTENSION + current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED); +#endif + /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields @@ -476,3 +510,328 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 efault: return -EFAULT; } + +#ifdef CONFIG_RSEQ_SLICE_EXTENSION +struct slice_timer { + struct hrtimer timer; + void *cookie; +}; + +static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC; +static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; +unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min; +static DEFINE_PER_CPU(struct slice_timer, slice_timer); +DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); + +/* + * When the timer expires and the task is still in user space, the return + * from interrupt will revoke the grant and schedule. If the task already + * entered the kernel via a syscall and the timer fires before the syscall + * work was able to cancel it, then depending on the preemption model this + * will either reschedule on return from interrupt or in the syscall work + * below. + */ +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr) +{ + struct slice_timer *st = container_of(tmr, struct slice_timer, timer); + + /* + * Validate that the task which armed the timer is still on the + * CPU. It could have been scheduled out without canceling the + * timer. + */ + if (st->cookie == current && current->rseq.slice.state.granted) { + rseq_stat_inc(rseq_stats.s_expired); + set_need_resched_current(); + } + return HRTIMER_NORESTART; +} + +bool __rseq_arm_slice_extension_timer(void) +{ + struct slice_timer *st = this_cpu_ptr(&slice_timer); + struct task_struct *curr = current; + + lockdep_assert_irqs_disabled(); + + /* + * This check prevents a task, which got a time slice extension + * granted, from exceeding the maximum scheduling latency when the + * grant expired before going out to user space. Don't bother to + * clear the grant here, it will be cleaned up automatically before + * going out to user space after being scheduled back in. + */ + if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) { + set_need_resched_current(); + return true; + } + + /* + * Store the task pointer as a cookie for comparison in the timer + * function. This is safe as the timer is CPU local and cannot be + * in the expiry function at this point. 
+ */ + st->cookie = curr; + hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* Arm the syscall entry work */ + set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); + return false; +} + +static void rseq_cancel_slice_extension_timer(void) +{ + struct slice_timer *st = this_cpu_ptr(&slice_timer); + + /* + * st->cookie can be safely read as preemption is disabled and the + * timer is CPU local. + * + * As this is most probably the first expiring timer, the cancel is + * expensive as it has to reprogram the hardware, but that's less + * expensive than going through a full hrtimer_interrupt() cycle + * for nothing. + * + * hrtimer_try_to_cancel() is sufficient here as the timer is CPU + * local and once the hrtimer code disabled interrupts the timer + * callback cannot be running. + */ + if (st->cookie == current) + hrtimer_try_to_cancel(&st->timer); +} + +static inline void rseq_slice_set_need_resched(struct task_struct *curr) +{ + /* + * The interrupt guard is required to prevent inconsistent state in + * this case: + * + * set_tsk_need_resched() + * --> Interrupt + * wakeup() + * set_tsk_need_resched() + * set_preempt_need_resched() + * schedule_on_return() + * clear_tsk_need_resched() + * clear_preempt_need_resched() + * set_preempt_need_resched() <- Inconsistent state + * + * This is safe vs. a remote set of TIF_NEED_RESCHED because that + * only sets the already set bit and does not create inconsistent + * state. + */ + scoped_guard(irq) + set_need_resched_current(); +} + +static void rseq_slice_validate_ctrl(u32 expected) +{ + u32 __user *sctrl = ¤t->rseq.usrptr->slice_ctrl.all; + u32 uval; + + if (get_user(uval, sctrl) || uval != expected) + force_sig(SIGSEGV); +} + +/* + * Invoked from syscall entry if a time slice extension was granted and the + * kernel did not clear it before user space left the critical section. + * + * While the recommended way to relinquish the CPU side effect free is + * rseq_slice_yield(2), any syscall within a granted slice terminates the + * grant and immediately reschedules if required. This supports onion layer + * applications, where the code requesting the grant cannot control the + * code within the critical section. + */ +void rseq_syscall_enter_work(long syscall) +{ + struct task_struct *curr = current; + struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted }; + + clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); + + if (static_branch_unlikely(&rseq_debug_enabled)) + rseq_slice_validate_ctrl(ctrl.all); + + /* + * The kernel might have raced, revoked the grant and updated + * userspace, but kept the SLICE work set. + */ + if (!ctrl.granted) + return; + + /* + * Required to stabilize the per CPU timer pointer and to make + * set_tsk_need_resched() correct on PREEMPT[RT] kernels. + * + * Leaving the scope will reschedule on preemption models FULL, + * LAZY and RT if necessary. + */ + scoped_guard(preempt) { + rseq_cancel_slice_extension_timer(); + /* + * Now that preemption is disabled, quickly check whether + * the task was already rescheduled before arriving here. 
+ */ + if (!curr->rseq.event.sched_switch) { + rseq_slice_set_need_resched(curr); + + if (syscall == __NR_rseq_slice_yield) { + rseq_stat_inc(rseq_stats.s_yielded); + /* Update the yielded state for syscall return */ + curr->rseq.slice.yielded = 1; + } else { + rseq_stat_inc(rseq_stats.s_aborted); + } + } + } + /* Reschedule on NONE/VOLUNTARY preemption models */ + cond_resched(); + + /* Clear the grant in kernel state and user space */ + curr->rseq.slice.state.granted = false; + if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all)) + force_sig(SIGSEGV); +} + +int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) +{ + switch (arg2) { + case PR_RSEQ_SLICE_EXTENSION_GET: + if (arg3) + return -EINVAL; + return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0; + + case PR_RSEQ_SLICE_EXTENSION_SET: { + u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE); + + if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE) + return -EINVAL; + if (!rseq_slice_extension_enabled()) + return -ENOTSUPP; + if (!current->rseq.usrptr) + return -ENXIO; + + /* No change? */ + if (enable == !!current->rseq.slice.state.enabled) + return 0; + + if (get_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + if (current->rseq.slice.state.enabled) + valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if ((rflags & valid) != valid) + goto die; + + rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (enable) + rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if (put_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + current->rseq.slice.state.enabled = enable; + return 0; + } + default: + return -EINVAL; + } +die: + force_sig(SIGSEGV); + return -EFAULT; +} + +/** + * sys_rseq_slice_yield - yield the current processor side effect free if a + * task granted with a time slice extension is done with + * the critical work before being forced out. + * + * Return: 1 if the task successfully yielded the CPU within the granted slice. + * 0 if the slice extension was either never granted or was revoked by + * going over the granted extension, using a syscall other than this one + * or being scheduled out earlier due to a subsequent interrupt. + * + * The syscall does not schedule because the syscall entry work immediately + * relinquishes the CPU and schedules if required. 
+ */ +SYSCALL_DEFINE0(rseq_slice_yield) +{ + int yielded = !!current->rseq.slice.yielded; + + current->rseq.slice.yielded = 0; + return yielded; +} + +static int rseq_slice_ext_show(struct seq_file *m, void *p) +{ + seq_printf(m, "%d\n", rseq_slice_ext_nsecs); + return 0; +} + +static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned int nsecs; + + if (kstrtouint_from_user(ubuf, count, 10, &nsecs)) + return -EINVAL; + + if (nsecs < rseq_slice_ext_nsecs_min) + return -ERANGE; + + if (nsecs > rseq_slice_ext_nsecs_max) + return -ERANGE; + + rseq_slice_ext_nsecs = nsecs; + + return count; +} + +static int rseq_slice_ext_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_slice_ext_show, inode->i_private); +} + +static const struct file_operations slice_ext_ops = { + .open = rseq_slice_ext_open, + .read = seq_read, + .write = rseq_slice_ext_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static void rseq_slice_ext_init(struct dentry *root_dir) +{ + debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops); +} + +static int __init rseq_slice_cmdline(char *str) +{ + bool on; + + if (kstrtobool(str, &on)) + return 0; + + if (!on) + static_branch_disable(&rseq_slice_extension_key); + return 1; +} +__setup("rseq_slice_ext=", rseq_slice_cmdline); + +static int __init rseq_slice_init(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD); + } + return 0; +} +device_initcall(rseq_slice_init); +#else +static void rseq_slice_ext_init(struct dentry *root_dir) { } +#endif /* CONFIG_RSEQ_SLICE_EXTENSION */ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8ae86371ddcd..b1f1a367034f 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 +CONTEXT_ANALYSIS_core.o := y +CONTEXT_ANALYSIS_fair.o := y + # The compilers are complaining about unused variables inside an if(0) scope # block. This is daft, shut them up. ccflags-y += $(call cc-disable-warning, unused-but-set-variable) diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index f5e6dd6a6b3a..2ae4fbf13431 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -173,6 +173,7 @@ notrace static void __sched_clock_work(struct work_struct *work) scd->tick_gtod, __gtod_offset, scd->tick_raw, __sched_clock_offset); + disable_sched_clock_irqtime(); static_branch_disable(&__sched_clock_stable); } @@ -238,6 +239,8 @@ static int __init sched_clock_init_late(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); + else + disable_sched_clock_irqtime(); /* disable if clock unstable. 
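The hunks above wire up the whole user-visible surface of the slice extension: the registration flag, the prctl() control, rseq_slice_yield(2), the slice_ext_nsec debugfs knob and the rseq_slice_ext= boot parameter. A minimal user-space sketch of how a thread might opt in and relinquish the CPU; it is not part of the patch. The top-level PR_RSEQ_SLICE_EXTENSION option name and __NR_rseq_slice_yield are assumptions taken to come from this series' uapi headers (only the sub-commands and flag names are visible in this hunk), and the sketch assumes glibc has already registered struct rseq for the thread.

#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/*
	 * Opt the current thread in to time slice extensions. Fails with
	 * ENOTSUPP when the feature is compiled out or disabled via the
	 * rseq_slice_ext= boot parameter, and with ENXIO when no rseq
	 * area is registered for the thread.
	 */
	if (prctl(PR_RSEQ_SLICE_EXTENSION,		/* option name assumed */
		  PR_RSEQ_SLICE_EXTENSION_SET,
		  PR_RSEQ_SLICE_EXT_ENABLE, 0, 0)) {
		perror("prctl(PR_RSEQ_SLICE_EXTENSION)");
		return 1;
	}

	/* ... short critical section which requested an extension ... */

	/*
	 * Relinquish the CPU side effect free. Returns 1 when a granted
	 * extension was yielded in time, 0 otherwise. Any other syscall
	 * issued inside a granted slice also terminates the grant, it is
	 * just not side effect free.
	 */
#ifdef __NR_rseq_slice_yield
	printf("yielded=%ld\n", syscall(__NR_rseq_slice_yield));
#endif
	return 0;
}

The grant length itself is tunable through the slice_ext_nsec file in the rseq debugfs directory, clamped to the 5 to 50 microsecond range set up above, and rseq_slice_ext=0 on the kernel command line disables the static key entirely.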
*/ return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 045f83ad261e..759777694c78 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -119,6 +119,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_entry_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_exit_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_set_need_resched_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); @@ -396,6 +399,8 @@ static atomic_t sched_core_count; static struct cpumask sched_core_mask; static void sched_core_lock(int cpu, unsigned long *flags) + __context_unsafe(/* acquires multiple */) + __acquires(&runqueues.__lock) /* overapproximation */ { const struct cpumask *smt_mask = cpu_smt_mask(cpu); int t, i = 0; @@ -406,6 +411,8 @@ static void sched_core_lock(int cpu, unsigned long *flags) } static void sched_core_unlock(int cpu, unsigned long *flags) + __context_unsafe(/* releases multiple */) + __releases(&runqueues.__lock) /* overapproximation */ { const struct cpumask *smt_mask = cpu_smt_mask(cpu); int t; @@ -630,6 +637,7 @@ EXPORT_SYMBOL(__trace_set_current_state); */ void raw_spin_rq_lock_nested(struct rq *rq, int subclass) + __context_unsafe() { raw_spinlock_t *lock; @@ -655,6 +663,7 @@ void raw_spin_rq_lock_nested(struct rq *rq, int subclass) } bool raw_spin_rq_trylock(struct rq *rq) + __context_unsafe() { raw_spinlock_t *lock; bool ret; @@ -696,15 +705,16 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) raw_spin_rq_lock(rq1); if (__rq_lockp(rq1) != __rq_lockp(rq2)) raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); + else + __acquire_ctx_lock(__rq_lockp(rq2)); /* fake acquire */ double_rq_clock_clear_update(rq1, rq2); } /* - * __task_rq_lock - lock the rq @p resides on. + * ___task_rq_lock - lock the rq @p resides on. */ -struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(rq->lock) +struct rq *___task_rq_lock(struct task_struct *p, struct rq_flags *rf) { struct rq *rq; @@ -727,9 +737,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) /* * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. */ -struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(p->pi_lock) - __acquires(rq->lock) +struct rq *_task_rq_lock(struct task_struct *p, struct rq_flags *rf) { struct rq *rq; @@ -770,6 +778,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * RQ-clock updating methods: */ +/* Use CONFIG_PARAVIRT as this will avoid more #ifdef in arch code. */ +#ifdef CONFIG_PARAVIRT +struct static_key paravirt_steal_rq_enabled; +#endif + static void update_rq_clock_task(struct rq *rq, s64 delta) { /* @@ -1136,6 +1149,7 @@ void __trace_set_need_resched(struct task_struct *curr, int tif) { trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif); } +EXPORT_SYMBOL_GPL(__trace_set_need_resched); void resched_curr(struct rq *rq) { @@ -2090,7 +2104,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p, flags); - rq->queue_mask |= p->sched_class->queue_mask; p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2123,7 +2136,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) * and mark the task ->sched_delayed. 
*/ uclamp_rq_dec(rq, p); - rq->queue_mask |= p->sched_class->queue_mask; return p->sched_class->dequeue_task(rq, p, flags); } @@ -2174,10 +2186,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; - if (p->sched_class == donor->sched_class) - donor->sched_class->wakeup_preempt(rq, p, flags); - else if (sched_class_above(p->sched_class, donor->sched_class)) + if (p->sched_class == rq->next_class) { + rq->next_class->wakeup_preempt(rq, p, flags); + + } else if (sched_class_above(p->sched_class, rq->next_class)) { + rq->next_class->wakeup_preempt(rq, p, flags); resched_curr(rq); + rq->next_class = p->sched_class; + } /* * A queue event has occurred, and we're going to schedule. In @@ -2431,6 +2447,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) */ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int new_cpu) + __must_hold(__rq_lockp(rq)) { lockdep_assert_rq_held(rq); @@ -2477,6 +2494,7 @@ struct set_affinity_pending { */ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int dest_cpu) + __must_hold(__rq_lockp(rq)) { /* Affinity changed (again). */ if (!is_cpu_allowed(p, dest_cpu)) @@ -2513,6 +2531,12 @@ static int migration_cpu_stop(void *data) */ flush_smp_call_function_queue(); + /* + * We may change the underlying rq, but the locks held will + * appropriately be "transferred" when switching. + */ + context_unsafe_alias(rq); + raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); @@ -2624,6 +2648,8 @@ int push_cpu_stop(void *arg) if (!lowest_rq) goto out_unlock; + lockdep_assert_rq_held(lowest_rq); + // XXX validate p is still the highest prio task if (task_rq(p) == rq) { move_queued_task_locked(rq, lowest_rq, p); @@ -2834,8 +2860,7 @@ void release_user_cpus_ptr(struct task_struct *p) */ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, int dest_cpu, unsigned int flags) - __releases(rq->lock) - __releases(p->pi_lock) + __releases(__rq_lockp(rq), &p->pi_lock) { struct set_affinity_pending my_pending = { }, *pending = NULL; bool stop_pending, complete = false; @@ -2990,8 +3015,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, struct affinity_context *ctx, struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) - __releases(p->pi_lock) + __releases(__rq_lockp(rq), &p->pi_lock) { const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; @@ -3607,6 +3631,18 @@ static inline void ttwu_do_wakeup(struct task_struct *p) trace_sched_wakeup(p); } +void update_rq_avg_idle(struct rq *rq) +{ + u64 delta = rq_clock(rq) - rq->idle_stamp; + u64 max = 2*rq->max_idle_balance_cost; + + update_avg(&rq->avg_idle, delta); + + if (rq->avg_idle > max) + rq->avg_idle = max; + rq->idle_stamp = 0; +} + static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) @@ -3642,18 +3678,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, p->sched_class->task_woken(rq, p); rq_repin_lock(rq, rf); } - - if (rq->idle_stamp) { - u64 delta = rq_clock(rq) - rq->idle_stamp; - u64 max = 2*rq->max_idle_balance_cost; - - update_avg(&rq->avg_idle, delta); - - if (rq->avg_idle > max) - rq->avg_idle = max; - - rq->idle_stamp = 0; - } } /* @@ -4273,29 +4297,30 @@ static bool __task_needs_rq_lock(struct task_struct *p) */ int task_call_func(struct task_struct *p, task_call_f func, void *arg) 
{ - struct rq *rq = NULL; struct rq_flags rf; int ret; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - if (__task_needs_rq_lock(p)) - rq = __task_rq_lock(p, &rf); + if (__task_needs_rq_lock(p)) { + struct rq *rq = __task_rq_lock(p, &rf); - /* - * At this point the task is pinned; either: - * - blocked and we're holding off wakeups (pi->lock) - * - woken, and we're holding off enqueue (rq->lock) - * - queued, and we're holding off schedule (rq->lock) - * - running, and we're holding off de-schedule (rq->lock) - * - * The called function (@func) can use: task_curr(), p->on_rq and - * p->__state to differentiate between these states. - */ - ret = func(p, arg); + /* + * At this point the task is pinned; either: + * - blocked and we're holding off wakeups (pi->lock) + * - woken, and we're holding off enqueue (rq->lock) + * - queued, and we're holding off schedule (rq->lock) + * - running, and we're holding off de-schedule (rq->lock) + * + * The called function (@func) can use: task_curr(), p->on_rq and + * p->__state to differentiate between these states. + */ + ret = func(p, arg); - if (rq) __task_rq_unlock(rq, p, &rf); + } else { + ret = func(p, arg); + } raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; @@ -4972,6 +4997,8 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head) static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) + __releases(__rq_lockp(rq)) + __acquires(__rq_lockp(this_rq())) { /* * Since the runqueue lock will be released by the next @@ -4985,9 +5012,15 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf /* this is a valid case when another task releases the spinlock */ rq_lockp(rq)->owner = next; #endif + /* + * Model the rq reference switcheroo. + */ + __release(__rq_lockp(rq)); + __acquire(__rq_lockp(this_rq())); } static inline void finish_lock_switch(struct rq *rq) + __releases(__rq_lockp(rq)) { /* * If we are tracking spinlock dependencies then we have to @@ -5043,6 +5076,7 @@ static inline void kmap_local_sched_in(void) static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) + __must_hold(__rq_lockp(rq)) { kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); @@ -5073,7 +5107,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * because prev may have moved to another CPU. */ static struct rq *finish_task_switch(struct task_struct *prev) - __releases(rq->lock) + __releases(__rq_lockp(this_rq())) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; @@ -5169,7 +5203,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) * @prev: the thread we just switched away from. 
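The task_call_func() restructuring above keeps the callback strictly inside the scope that holds the stabilizing locks. A kernel-style sketch of a caller, relying only on the contract spelled out in the comment (the callback runs with @p pinned and must not sleep or take rq locks itself); the function and callback names here are illustrative, not existing kernel symbols.

static int probe_task(struct task_struct *p, void *arg)
{
	unsigned int *state = arg;

	/* @p cannot be woken, enqueued, or scheduled in/out while this runs. */
	*state = READ_ONCE(p->__state);
	return task_curr(p);
}

static bool task_is_on_cpu_now(struct task_struct *p, unsigned int *state)
{
	/* Returns the callback's result; *state carries the sampled state. */
	return task_call_func(p, probe_task, state) != 0;
}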
*/ asmlinkage __visible void schedule_tail(struct task_struct *prev) - __releases(rq->lock) + __releases(__rq_lockp(this_rq())) { /* * New tasks start with FORK_PREEMPT_COUNT, see there and @@ -5201,6 +5235,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) + __releases(__rq_lockp(rq)) { prepare_task_switch(rq, prev, next); @@ -5869,6 +5904,7 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, */ static inline struct task_struct * __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + __must_hold(__rq_lockp(rq)) { const struct sched_class *class; struct task_struct *p; @@ -5969,6 +6005,7 @@ static void queue_core_balance(struct rq *rq); static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + __must_hold(__rq_lockp(rq)) { struct task_struct *next, *p, *max; const struct cpumask *smt_mask; @@ -6277,6 +6314,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd) } static void sched_core_balance(struct rq *rq) + __must_hold(__rq_lockp(rq)) { struct sched_domain *sd; int cpu = cpu_of(rq); @@ -6422,6 +6460,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu) {} static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + __must_hold(__rq_lockp(rq)) { return __pick_next_task(rq, prev, rf); } @@ -6808,6 +6847,7 @@ static void __sched notrace __schedule(int sched_mode) pick_again: next = pick_next_task(rq, rq->donor, &rf); rq_set_donor(rq, next); + rq->next_class = next->sched_class; if (unlikely(task_is_blocked(next))) { next = find_proxy_task(rq, next, &rf); if (!next) @@ -7552,7 +7592,7 @@ int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { -# ifndef CONFIG_PREEMPT_RT +# if !(defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ARCH_HAS_PREEMPT_LAZY)) if (!strcmp(str, "none")) return preempt_dynamic_none; @@ -8045,6 +8085,12 @@ static int __balance_push_cpu_stop(void *arg) int cpu; scoped_guard (raw_spinlock_irq, &p->pi_lock) { + /* + * We may change the underlying rq, but the locks held will + * appropriately be "transferred" when switching. + */ + context_unsafe_alias(rq); + cpu = select_fallback_rq(rq->cpu, p); rq_lock(rq, &rf); @@ -8068,6 +8114,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, push_work); * effective when the hotplug motion is down. 
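With the sched_dynamic_mode() change above, architectures that select ARCH_HAS_PREEMPT_LAZY no longer accept "none" or "voluntary" as runtime preemption models, mirroring the PREEMPT_RT behaviour. A small sketch of poking the existing preempt-dynamic debugfs interface; the /sys/kernel/debug/sched/preempt path and its output format come from the current PREEMPT_DYNAMIC code and are assumed unchanged by this series.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[128];
	int fd = open("/sys/kernel/debug/sched/preempt", O_RDWR);
	ssize_t n;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		/* e.g. "(full) lazy" on an ARCH_HAS_PREEMPT_LAZY kernel */
		printf("available: %s", buf);
	}

	/* Writing "none" or "voluntary" now fails on lazy-capable arches. */
	if (write(fd, "lazy", 4) < 0)
		perror("write");

	close(fd);
	return 0;
}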
*/ static void balance_push(struct rq *rq) + __must_hold(__rq_lockp(rq)) { struct task_struct *push_task = rq->curr; @@ -8477,6 +8524,9 @@ int sched_cpu_dying(unsigned int cpu) dump_rq_tasks(rq, KERN_WARNING); } dl_server_stop(&rq->fair_server); +#ifdef CONFIG_SCHED_CLASS_EXT + dl_server_stop(&rq->ext_server); +#endif rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); @@ -8652,6 +8702,8 @@ void __init sched_init(void) rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif + rq->next_class = &idle_sched_class; + rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = SCHED_CAPACITY_SCALE; @@ -8680,6 +8732,9 @@ void __init sched_init(void) hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); fair_server_init(rq); +#ifdef CONFIG_SCHED_CLASS_EXT + ext_server_init(rq); +#endif #ifdef CONFIG_SCHED_CORE rq->core = rq; @@ -9111,6 +9166,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; bool resched = false; + bool queued = false; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); @@ -9122,10 +9178,13 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) scx_cgroup_move_task(tsk); if (scope->running) resched = true; + queued = scope->queued; } if (resched) resched_curr(rq); + else if (queued) + wakeup_preempt(rq, tsk, 0); __balance_callbacks(rq, &rq_guard.rf); } @@ -10269,7 +10328,8 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) * Serialization rules: * * mm::mm_cid::mutex: Serializes fork() and exit() and therefore - * protects mm::mm_cid::users. + * protects mm::mm_cid::users and mode switch + * transitions * * mm::mm_cid::lock: Serializes mm_update_max_cids() and * mm_update_cpus_allowed(). Nests in mm_cid::mutex @@ -10285,14 +10345,70 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) * * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the - * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode, - * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the - * task needs to drop the CID into the pool when scheduling out. Both bits - * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is - * actually handed over to user space in the RSEQ memory. + * MM_CID_ONCPU bit set. + * + * During the transition of ownership mode, the MM_CID_TRANSIT bit is set + * on the CIDs. When this bit is set the tasks drop the CID back into the + * pool when scheduling out. + * + * Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the + * CID is actually handed over to user space in the RSEQ memory. * * Mode switching: * + * The ownership mode is per process and stored in mm:mm_cid::mode with the + * following possible states: + * + * 0: Per task ownership + * 0 | MM_CID_TRANSIT: Transition from per CPU to per task + * MM_CID_ONCPU: Per CPU ownership + * MM_CID_ONCPU | MM_CID_TRANSIT: Transition from per task to per CPU + * + * All transitions of ownership mode happen in two phases: + * + * 1) mm:mm_cid::mode has the MM_CID_TRANSIT bit set. This is OR'ed on the + * CIDs and denotes that the CID is only temporarily owned by a + * task. When the task schedules out it drops the CID back into the + * pool if this bit is set. + * + * 2) The initiating context walks the per CPU space or the tasks to fixup + * or drop the CIDs and after completion it clears MM_CID_TRANSIT in + * mm:mm_cid::mode. 
After that point the CIDs are strictly task or CPU + * owned again. + * + * This two phase transition is required to prevent CID space exhaustion + * during the transition as a direct transfer of ownership would fail: + * + * - On task to CPU mode switch if a task is scheduled in on one CPU and + * then migrated to another CPU before the fixup freed enough per task + * CIDs. + * + * - On CPU to task mode switch if two tasks are scheduled in on the same + * CPU before the fixup freed per CPU CIDs. + * + * Both scenarios can result in a live lock because sched_in() is invoked + * with runqueue lock held and loops in search of a CID and the fixup + * thread can't make progress freeing them up because it is stuck on the + * same runqueue lock. + * + * While MM_CID_TRANSIT is active during the transition phase the MM_CID + * bitmap can be contended, but that's a temporary contention bound to the + * transition period. After that everything goes back into steady state and + * nothing except fork() and exit() will touch the bitmap. This is an + * acceptable tradeoff as it completely avoids complex serialization, + * memory barriers and atomic operations for the common case. + * + * Aside of that this mechanism also ensures RT compability: + * + * - The task which runs the fixup is fully preemptible except for the + * short runqueue lock held sections. + * + * - The transient impact of the bitmap contention is only problematic + * when there is a thundering herd scenario of tasks scheduling in and + * out concurrently. There is not much which can be done about that + * except for avoiding mode switching by a proper overall system + * configuration. + * * Switching to per CPU mode happens when the user count becomes greater * than the maximum number of CIDs, which is calculated by: * @@ -10306,12 +10422,13 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) * * At the point of switching to per CPU mode the new user is not yet * visible in the system, so the task which initiated the fork() runs the - * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and - * either transfers each tasks owned CID to the CPU the task runs on or - * drops it into the CID pool if a task is not on a CPU at that point in - * time. Tasks which schedule in before the task walk reaches them do the - * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes - * it's guaranteed that no task related to that MM owns a CID anymore. + * fixup function. mm_cid_fixup_tasks_to_cpu() walks the thread list and + * either marks each task owned CID with MM_CID_TRANSIT if the task is + * running on a CPU or drops it into the CID pool if a task is not on a + * CPU. Tasks which schedule in before the task walk reaches them do the + * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() + * completes it is guaranteed that no task related to that MM owns a CID + * anymore. * * Switching back to task mode happens when the user count goes below the * threshold which was recorded on the per CPU mode switch: @@ -10327,28 +10444,11 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) * run either in the deferred update function in context of a workqueue or * by a task which forks a new one or by a task which exits. Whatever * happens first. mm_cid_fixup_cpus_to_task() walks through the possible - * CPUs and either transfers the CPU owned CIDs to a related task which - * runs on the CPU or drops it into the pool. 
Tasks which schedule in on a - * CPU which the walk did not cover yet do the handover themself. - * - * This transition from CPU to per task ownership happens in two phases: - * - * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task - * CID and denotes that the CID is only temporarily owned by the - * task. When it schedules out the task drops the CID back into the - * pool if this bit is set. - * - * 2) The initiating context walks the per CPU space and after completion - * clears mm:mm_cid.transit. So after that point the CIDs are strictly - * task owned again. - * - * This two phase transition is required to prevent CID space exhaustion - * during the transition as a direct transfer of ownership would fail if - * two tasks are scheduled in on the same CPU before the fixup freed per - * CPU CIDs. - * - * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID - * related to that MM is owned by a CPU anymore. + * CPUs and either marks the CPU owned CIDs with MM_CID_TRANSIT if a + * related task is running on the CPU or drops it into the pool. Tasks + * which are scheduled in before the fixup covered them do the handover + * themself. When mm_cid_fixup_cpus_to_tasks() completes it is guaranteed + * that no CID related to that MM is owned by a CPU anymore. */ /* @@ -10379,6 +10479,7 @@ static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc) static bool mm_update_max_cids(struct mm_struct *mm) { struct mm_mm_cid *mc = &mm->mm_cid; + bool percpu = cid_on_cpu(mc->mode); lockdep_assert_held(&mm->mm_cid.lock); @@ -10387,7 +10488,7 @@ static bool mm_update_max_cids(struct mm_struct *mm) __mm_update_max_cids(mc); /* Check whether owner mode must be changed */ - if (!mc->percpu) { + if (!percpu) { /* Enable per CPU mode when the number of users is above max_cids */ if (mc->users > mc->max_cids) mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); @@ -10398,12 +10499,17 @@ static bool mm_update_max_cids(struct mm_struct *mm) } /* Mode change required? */ - if (!!mc->percpu == !!mc->pcpu_thrs) + if (percpu == !!mc->pcpu_thrs) return false; - /* When switching back to per TASK mode, set the transition flag */ - if (!mc->pcpu_thrs) - WRITE_ONCE(mc->transit, MM_CID_TRANSIT); - WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs); + + /* Flip the mode and set the transition flag to bridge the transfer */ + WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU)); + /* + * Order the store against the subsequent fixups so that + * acquire(rq::lock) cannot be reordered by the CPU before the + * store. + */ + smp_mb(); return true; } @@ -10428,7 +10534,7 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu WRITE_ONCE(mc->nr_cpus_allowed, weight); __mm_update_max_cids(mc); - if (!mc->percpu) + if (!cid_on_cpu(mc->mode)) return; /* Adjust the threshold to the wider set */ @@ -10446,6 +10552,16 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu irq_work_queue(&mc->irq_work); } +static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode) +{ + /* + * Ensure that the store removing the TRANSIT bit cannot be + * reordered by the CPU before the fixups have been completed. 
+ */ + smp_mb(); + WRITE_ONCE(mm->mm_cid.mode, mode); +} + static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp) { if (cid_on_cpu(t->mm_cid.cid)) { @@ -10489,14 +10605,13 @@ static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) } } } - /* Clear the transition bit */ - WRITE_ONCE(mm->mm_cid.transit, 0); + mm_cid_complete_transit(mm, 0); } -static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) +static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) { if (cid_on_task(t->mm_cid.cid)) { - t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid); + t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid); pcp->cid = t->mm_cid.cid; } } @@ -10509,18 +10624,17 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm if (!t->mm_cid.active) return false; if (cid_on_task(t->mm_cid.cid)) { - /* If running on the CPU, transfer the CID, otherwise drop it */ + /* If running on the CPU, put the CID in transit mode, otherwise drop it */ if (task_rq(t)->curr == t) - mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); + mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); else mm_unset_cid_on_task(t); } return true; } -static void mm_cid_fixup_tasks_to_cpus(void) +static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm) { - struct mm_struct *mm = current->mm; struct task_struct *p, *t; unsigned int users; @@ -10558,6 +10672,14 @@ static void mm_cid_fixup_tasks_to_cpus(void) } } +static void mm_cid_fixup_tasks_to_cpus(void) +{ + struct mm_struct *mm = current->mm; + + mm_cid_do_fixup_tasks_to_cpus(mm); + mm_cid_complete_transit(mm, MM_CID_ONCPU); +} + static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) { t->mm_cid.active = 1; @@ -10586,17 +10708,17 @@ void sched_mm_cid_fork(struct task_struct *t) } if (!sched_mm_cid_add_user(t, mm)) { - if (!mm->mm_cid.percpu) + if (!cid_on_cpu(mm->mm_cid.mode)) t->mm_cid.cid = mm_get_cid(mm); return; } /* Handle the mode change and transfer current's CID */ - percpu = !!mm->mm_cid.percpu; + percpu = cid_on_cpu(mm->mm_cid.mode); if (!percpu) mm_cid_transit_to_task(current, pcp); else - mm_cid_transfer_to_cpu(current, pcp); + mm_cid_transit_to_cpu(current, pcp); } if (percpu) { @@ -10631,7 +10753,7 @@ static bool __sched_mm_cid_exit(struct task_struct *t) * affinity change increased the number of allowed CPUs and the * deferred fixup did not run yet. */ - if (WARN_ON_ONCE(mm->mm_cid.percpu)) + if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode))) return false; /* * A failed fork(2) cleanup never gets here, so @current must have @@ -10664,8 +10786,13 @@ void sched_mm_cid_exit(struct task_struct *t) scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { if (!__sched_mm_cid_exit(t)) return; - /* Mode change required. Transfer currents CID */ - mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu)); + /* + * Mode change. The task has the CID unset + * already and dealt with an eventually set + * TRANSIT bit. If the CID is owned by the CPU + * then drop it. 
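To make the two-phase ownership transfer described above easier to follow, here is a self-contained toy model of the CID encoding: the ONCPU and TRANSIT bits ride on the CID value, TRANSIT marks temporary ownership during a mode switch, and the user-visible CID is always the value with both bits stripped. The bit positions chosen here are illustrative assumptions; this hunk only shows the flag names, not their values.

#include <assert.h>
#include <stdio.h>

#define MM_CID_TRANSIT	0x40000000u	/* assumed bit position */
#define MM_CID_ONCPU	0x80000000u	/* assumed bit position */
#define MM_CID_MASK	(MM_CID_TRANSIT | MM_CID_ONCPU)

/* What user space sees in the rseq area: ownership bits stripped. */
static unsigned int task_cid(unsigned int cid)
{
	return cid & ~MM_CID_MASK;
}

static int cid_on_cpu(unsigned int cid)    { return !!(cid & MM_CID_ONCPU); }
static int cid_in_transit(unsigned int cid) { return !!(cid & MM_CID_TRANSIT); }

int main(void)
{
	unsigned int cid = 3;					/* task owned */
	unsigned int mode = MM_CID_ONCPU | MM_CID_TRANSIT;	/* task -> CPU, phase 1 */

	/* Phase 1: the CID is only temporarily owned, dropped on schedule out. */
	cid |= MM_CID_TRANSIT;
	assert(cid_in_transit(cid) && task_cid(cid) == 3);

	/* Phase 2: the fixup walk finished, mode settles to plain per-CPU ownership. */
	mode = MM_CID_ONCPU;
	printf("mode=%#x user visible cid=%u on_cpu=%d\n",
	       mode, task_cid(cid), cid_on_cpu(cid));
	return 0;
}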
+ */ + mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); } mm_cid_fixup_cpus_to_tasks(mm); return; @@ -10722,7 +10849,7 @@ static void mm_cid_work_fn(struct work_struct *work) if (!mm_update_max_cids(mm)) return; /* Affinity changes can only switch back to task mode */ - if (WARN_ON_ONCE(mm->mm_cid.percpu)) + if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode))) return; } mm_cid_fixup_cpus_to_tasks(mm); @@ -10743,8 +10870,7 @@ static void mm_cid_irq_work(struct irq_work *work) void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { mm->mm_cid.max_cids = 0; - mm->mm_cid.percpu = 0; - mm->mm_cid.transit = 0; + mm->mm_cid.mode = 0; mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; mm->mm_cid.users = 0; mm->mm_cid.pcpu_thrs = 0; @@ -10780,13 +10906,12 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags |= DEQUEUE_NOCLOCK; } - if (flags & DEQUEUE_CLASS) { - if (p->sched_class->switching_from) - p->sched_class->switching_from(rq, p); - } + if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); *ctx = (struct sched_change_ctx){ .p = p, + .class = p->sched_class, .flags = flags, .queued = task_on_rq_queued(p), .running = task_current_donor(rq, p), @@ -10817,6 +10942,11 @@ void sched_change_end(struct sched_change_ctx *ctx) lockdep_assert_rq_held(rq); + /* + * Changing class without *QUEUE_CLASS is bad. + */ + WARN_ON_ONCE(p->sched_class != ctx->class && !(ctx->flags & ENQUEUE_CLASS)); + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) p->sched_class->switching_to(rq, p); @@ -10828,6 +10958,25 @@ void sched_change_end(struct sched_change_ctx *ctx) if (ctx->flags & ENQUEUE_CLASS) { if (p->sched_class->switched_to) p->sched_class->switched_to(rq, p); + + if (ctx->running) { + /* + * If this was a class promotion; let the old class + * know it got preempted. Note that none of the + * switch*_from() methods know the new class and none + * of the switch*_to() methods know the old class. + */ + if (sched_class_above(p->sched_class, ctx->class)) { + rq->next_class->wakeup_preempt(rq, p, 0); + rq->next_class = p->sched_class; + } + /* + * If this was a degradation in class; make sure to + * reschedule. + */ + if (sched_class_above(ctx->class, p->sched_class)) + resched_curr(rq); + } } else { p->sched_class->prio_changed(rq, p, ctx->prio); } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 0ab5f9d4bc59..cfc40181f66e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -682,7 +682,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) "sugov:%d", cpumask_first(policy->related_cpus)); if (IS_ERR(thread)) { - pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + pr_err("failed to create sugov thread: %pe\n", thread); return PTR_ERR(thread); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 4f97896887ec..fbf31db0d2f3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -12,6 +12,8 @@ #ifdef CONFIG_IRQ_TIME_ACCOUNTING +DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); + /* * There are no locks covering percpu hardirq/softirq time. 
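The cputime hunk that follows converts sched_clock_irqtime from a plain int into a static key, so the hot accounting path costs a runtime-patched branch instead of a load and compare. A minimal kernel-style sketch of that pattern with illustrative names; the real irqtime_enabled() wrapper is assumed to be a static_branch_likely() helper in the scheduler headers, which is not part of this hunk.

#include <linux/jump_label.h>
#include <linux/types.h>

DEFINE_STATIC_KEY_FALSE(example_key);

static inline bool example_enabled(void)
{
	return static_branch_likely(&example_key);
}

static void hot_path(u64 delta)
{
	if (!example_enabled())
		return;		/* compiles to a single patched branch */
	/* ... account delta ... */
}

static void slow_path_toggle(bool on)
{
	if (on)
		static_branch_enable(&example_key);
	else if (example_enabled())
		static_branch_disable(&example_key);
}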
* They are only modified in vtime_account, on corresponding CPU @@ -25,16 +27,15 @@ */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); -int sched_clock_irqtime; - void enable_sched_clock_irqtime(void) { - sched_clock_irqtime = 1; + static_branch_enable(&sched_clock_irqtime); } void disable_sched_clock_irqtime(void) { - sched_clock_irqtime = 0; + if (irqtime_enabled()) + static_branch_disable(&sched_clock_irqtime); } static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, @@ -251,6 +252,19 @@ void __account_forceidle_time(struct task_struct *p, u64 delta) * ticks are not redelivered later. Due to that, this function may on * occasion account more time than the calling functions think elapsed. */ +#ifdef CONFIG_PARAVIRT +struct static_key paravirt_steal_enabled; + +#ifdef CONFIG_HAVE_PV_STEAL_CLOCK_GEN +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); +#endif +#endif + static __always_inline u64 steal_account_process_time(u64 maxtime) { #ifdef CONFIG_PARAVIRT diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c509f2e7d69d..d08b00429323 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1034,6 +1034,12 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) return; } + /* + * When [4] D->A is followed by [1] A->B, dl_defer_running + * needs to be cleared, otherwise it will fail to properly + * start the zero-laxity timer. + */ + dl_se->dl_defer_running = 0; replenish_dl_new_period(dl_se, rq); } else if (dl_server(dl_se) && dl_se->dl_defer) { /* @@ -1443,8 +1449,8 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 dl_se->dl_defer_idle = 0; /* - * The fair server can consume its runtime while throttled (not queued/ - * running as regular CFS). + * The DL server can consume its runtime while throttled (not + * queued / running as regular CFS). * * If the server consumes its entire runtime in this state. The server * is not required for the current period. Thus, reset the server by @@ -1529,10 +1535,10 @@ throttle: } /* - * The fair server (sole dl_server) does not account for real-time - * workload because it is running fair work. + * The dl_server does not account for real-time workload because it + * is running fair work. */ - if (dl_se == &rq->fair_server) + if (dl_se->dl_server) return; #ifdef CONFIG_RT_GROUP_SCHED @@ -1567,9 +1573,9 @@ throttle: * In the non-defer mode, the idle time is not accounted, as the * server provides a guarantee. * - * If the dl_server is in defer mode, the idle time is also considered - * as time available for the fair server, avoiding a penalty for the - * rt scheduler that did not consumed that time. + * If the dl_server is in defer mode, the idle time is also considered as + * time available for the dl_server, avoiding a penalty for the rt + * scheduler that did not consumed that time. 
*/ void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec) { @@ -1655,6 +1661,12 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) * dl_server_active = 1; * enqueue_dl_entity() * update_dl_entity(WAKEUP) + * if (dl_time_before() || dl_entity_overflow()) + * dl_defer_running = 0; + * replenish_dl_new_period(); + * // fwd period + * dl_throttled = 1; + * dl_defer_armed = 1; * if (!dl_defer_running) * dl_defer_armed = 1; * dl_throttled = 1; @@ -1787,7 +1799,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) struct rq *rq = dl_se->rq; dl_se->dl_defer_idle = 0; - if (!dl_server(dl_se) || dl_se->dl_server_active) + if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime) return; /* @@ -1848,6 +1860,18 @@ void sched_init_dl_servers(void) dl_se->dl_server = 1; dl_se->dl_defer = 1; setup_new_dl_entity(dl_se); + +#ifdef CONFIG_SCHED_CLASS_EXT + dl_se = &rq->ext_server; + + WARN_ON(dl_server(dl_se)); + + dl_server_apply_params(dl_se, runtime, period, 1); + + dl_se->dl_server = 1; + dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); +#endif } } @@ -1874,7 +1898,6 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio int cpu = cpu_of(rq); struct dl_bw *dl_b; unsigned long cap; - int retval = 0; int cpus; dl_b = dl_bw_of(cpu); @@ -1906,7 +1929,7 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); - return retval; + return 0; } /* @@ -2503,9 +2526,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * Only called when both the current and waking task are -deadline * tasks. */ -static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, - int flags) +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { + /* + * Can only get preempted by stop-class, and those should be + * few and short lived, doesn't really make sense to push + * anything away for that. + */ + if (p->sched_class != &dl_sched_class) + return; + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; @@ -3179,6 +3209,36 @@ void dl_add_task_root_domain(struct task_struct *p) raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); } +static void dl_server_add_bw(struct root_domain *rd, int cpu) +{ + struct sched_dl_entity *dl_se; + + dl_se = &cpu_rq(cpu)->fair_server; + if (dl_server(dl_se) && cpu_active(cpu)) + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); + +#ifdef CONFIG_SCHED_CLASS_EXT + dl_se = &cpu_rq(cpu)->ext_server; + if (dl_server(dl_se) && cpu_active(cpu)) + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); +#endif +} + +static u64 dl_server_read_bw(int cpu) +{ + u64 dl_bw = 0; + + if (cpu_rq(cpu)->fair_server.dl_server) + dl_bw += cpu_rq(cpu)->fair_server.dl_bw; + +#ifdef CONFIG_SCHED_CLASS_EXT + if (cpu_rq(cpu)->ext_server.dl_server) + dl_bw += cpu_rq(cpu)->ext_server.dl_bw; +#endif + + return dl_bw; +} + void dl_clear_root_domain(struct root_domain *rd) { int i; @@ -3197,12 +3257,8 @@ void dl_clear_root_domain(struct root_domain *rd) * dl_servers are not tasks. Since dl_add_task_root_domain ignores * them, we need to account for them here explicitly. 
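The new dl_server_add_bw()/dl_server_read_bw() helpers below simply sum the fixed-point bandwidth of the fair and ext servers per CPU. A quick arithmetic sketch of that bookkeeping: the to_ratio() formula and BW_SHIFT value follow the existing deadline code and are assumptions as far as this hunk goes, and the 5 ms / 100 ms parameters are example numbers, not the server defaults.

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20			/* assumed to match the kernel's fixed point */
#define BW_UNIT  (1ULL << BW_SHIFT)

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << BW_SHIFT) / period;
}

int main(void)
{
	/* Example parameters: 5 ms runtime every 100 ms for each server. */
	uint64_t fair_bw = to_ratio(100 * 1000 * 1000ULL, 5 * 1000 * 1000ULL);
	uint64_t ext_bw  = to_ratio(100 * 1000 * 1000ULL, 5 * 1000 * 1000ULL);
	uint64_t total   = fair_bw + ext_bw;	/* what dl_server_read_bw() would report */

	printf("fair=%llu ext=%llu total=%llu (%.1f%% of BW_UNIT)\n",
	       (unsigned long long)fair_bw, (unsigned long long)ext_bw,
	       (unsigned long long)total, 100.0 * total / BW_UNIT);
	return 0;
}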
*/ - for_each_cpu(i, rd->span) { - struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; - - if (dl_server(dl_se) && cpu_active(i)) - __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i)); - } + for_each_cpu(i, rd->span) + dl_server_add_bw(rd, i); } void dl_clear_root_domain_cpu(int cpu) @@ -3348,9 +3404,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu) #endif DEFINE_SCHED_CLASS(dl) = { - - .queue_mask = 8, - .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -3644,6 +3697,9 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se) dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; dl_se->dl_server = 0; + dl_se->dl_defer = 0; + dl_se->dl_defer_running = 0; + dl_se->dl_defer_armed = 0; #ifdef CONFIG_RT_MUTEXES dl_se->pi_se = dl_se; @@ -3701,7 +3757,7 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) unsigned long flags, cap; struct dl_bw *dl_b; bool overflow = 0; - u64 fair_server_bw = 0; + u64 dl_server_bw = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); @@ -3734,27 +3790,26 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) cap -= arch_scale_cpu_capacity(cpu); /* - * cpu is going offline and NORMAL tasks will be moved away - * from it. We can thus discount dl_server bandwidth - * contribution as it won't need to be servicing tasks after - * the cpu is off. + * cpu is going offline and NORMAL and EXT tasks will be + * moved away from it. We can thus discount dl_server + * bandwidth contribution as it won't need to be servicing + * tasks after the cpu is off. */ - if (cpu_rq(cpu)->fair_server.dl_server) - fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw; + dl_server_bw = dl_server_read_bw(cpu); /* * Not much to check if no DEADLINE bandwidth is present. * dl_servers we can discount, as tasks will be moved out the * offlined CPUs anyway. */ - if (dl_b->total_bw - fair_server_bw > 0) { + if (dl_b->total_bw - dl_server_bw > 0) { /* * Leaving at least one CPU for DEADLINE tasks seems a * wise thing to do. As said above, cpu is not offline * yet, so account for that. 
*/ if (dl_bw_cpus(cpu) - 1) - overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); + overflow = __dl_overflow(dl_b, cap, dl_server_bw, 0); else overflow = 1; } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 41caa22e0680..b24f40f05019 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -172,18 +172,12 @@ static const struct file_operations sched_feat_fops = { static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[16]; unsigned int scaling; + int ret; - if (cnt > 15) - cnt = 15; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - if (kstrtouint(buf, 10, &scaling)) - return -EINVAL; + ret = kstrtouint_from_user(ubuf, cnt, 10, &scaling); + if (ret) + return ret; if (scaling >= SCHED_TUNABLESCALING_END) return -EINVAL; @@ -243,7 +237,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { - int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + int i = (IS_ENABLED(CONFIG_PREEMPT_RT) || IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY)) * 2; int j; /* Count entries in NULL terminated preempt_modes */ @@ -336,17 +330,19 @@ enum dl_param { DL_PERIOD, }; -static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ -static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ +static unsigned long dl_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ +static unsigned long dl_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ -static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos, enum dl_param param) +static ssize_t sched_server_write_common(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, enum dl_param param, + void *server) { long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server; + u64 old_runtime, runtime, period; struct rq *rq = cpu_rq(cpu); - u64 runtime, period; + int retval = 0; size_t err; - int retval; u64 value; err = kstrtoull_from_user(ubuf, cnt, 10, &value); @@ -354,8 +350,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return err; scoped_guard (rq_lock_irqsave, rq) { - runtime = rq->fair_server.dl_runtime; - period = rq->fair_server.dl_period; + old_runtime = runtime = dl_se->dl_runtime; + period = dl_se->dl_period; switch (param) { case DL_RUNTIME: @@ -371,60 +367,68 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu } if (runtime > period || - period > fair_server_period_max || - period < fair_server_period_min) { + period > dl_server_period_max || + period < dl_server_period_min) { return -EINVAL; } update_rq_clock(rq); - dl_server_stop(&rq->fair_server); - - retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); - if (retval) - cnt = retval; + dl_server_stop(dl_se); + retval = dl_server_apply_params(dl_se, runtime, period, 0); + dl_server_start(dl_se); - if (!runtime) - printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", - cpu_of(rq)); + if (retval < 0) + return retval; + } - if (rq->cfs.h_nr_queued) - dl_server_start(&rq->fair_server); + if (!!old_runtime ^ !!runtime) { + pr_info("%s server %sabled on CPU %d%s.\n", + server == &rq->fair_server ? "Fair" : "Ext", + runtime ? "en" : "dis", + cpu_of(rq), + runtime ? 
"" : ", system may malfunction due to starvation"); } *ppos += cnt; return cnt; } -static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) +static size_t sched_server_show_common(struct seq_file *m, void *v, enum dl_param param, + void *server) { - unsigned long cpu = (unsigned long) m->private; - struct rq *rq = cpu_rq(cpu); + struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server; u64 value; switch (param) { case DL_RUNTIME: - value = rq->fair_server.dl_runtime; + value = dl_se->dl_runtime; break; case DL_PERIOD: - value = rq->fair_server.dl_period; + value = dl_se->dl_period; break; } seq_printf(m, "%llu\n", value); return 0; - } static ssize_t sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME, + &rq->fair_server); } static int sched_fair_server_runtime_show(struct seq_file *m, void *v) { - return sched_fair_server_show(m, v, DL_RUNTIME); + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_RUNTIME, &rq->fair_server); } static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) @@ -440,16 +444,57 @@ static const struct file_operations fair_server_runtime_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_CLASS_EXT +static ssize_t +sched_ext_server_runtime_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME, + &rq->ext_server); +} + +static int sched_ext_server_runtime_show(struct seq_file *m, void *v) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_RUNTIME, &rq->ext_server); +} + +static int sched_ext_server_runtime_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_ext_server_runtime_show, inode->i_private); +} + +static const struct file_operations ext_server_runtime_fops = { + .open = sched_ext_server_runtime_open, + .write = sched_ext_server_runtime_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static ssize_t sched_fair_server_period_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, + &rq->fair_server); } static int sched_fair_server_period_show(struct seq_file *m, void *v) { - return sched_fair_server_show(m, v, DL_PERIOD); + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server); } static int sched_fair_server_period_open(struct inode *inode, struct file *filp) @@ -465,6 +510,40 @@ static const struct file_operations fair_server_period_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_CLASS_EXT +static ssize_t +sched_ext_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, 
loff_t *ppos) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, + &rq->ext_server); +} + +static int sched_ext_server_period_show(struct seq_file *m, void *v) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server); +} + +static int sched_ext_server_period_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_ext_server_period_show, inode->i_private); +} + +static const struct file_operations ext_server_period_fops = { + .open = sched_ext_server_period_open, + .write = sched_ext_server_period_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static struct dentry *debugfs_sched; static void debugfs_fair_server_init(void) @@ -488,6 +567,29 @@ static void debugfs_fair_server_init(void) } } +#ifdef CONFIG_SCHED_CLASS_EXT +static void debugfs_ext_server_init(void) +{ + struct dentry *d_ext; + unsigned long cpu; + + d_ext = debugfs_create_dir("ext_server", debugfs_sched); + if (!d_ext) + return; + + for_each_possible_cpu(cpu) { + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%lu", cpu); + d_cpu = debugfs_create_dir(buf, d_ext); + + debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops); + debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops); + } +} +#endif /* CONFIG_SCHED_CLASS_EXT */ + static __init int sched_init_debug(void) { struct dentry __maybe_unused *numa; @@ -526,6 +628,9 @@ static __init int sched_init_debug(void) debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); debugfs_fair_server_init(); +#ifdef CONFIG_SCHED_CLASS_EXT + debugfs_ext_server_init(); +#endif return 0; } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index afe28c04d5aa..c18e81e8ef51 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -194,6 +194,7 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond #include <trace/events/sched_ext.h> static void process_ddsp_deferred_locals(struct rq *rq); +static bool task_dead_and_done(struct task_struct *p); static u32 reenq_local(struct rq *rq); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, @@ -958,6 +959,8 @@ static void update_curr_scx(struct rq *rq) if (!curr->scx.slice) touch_core_sched(rq, curr); } + + dl_server_update(&rq->ext_server, delta_exec); } static bool scx_dsq_priq_less(struct rb_node *node_a, @@ -1501,6 +1504,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags if (enq_flags & SCX_ENQ_WAKEUP) touch_core_sched(rq, p); + /* Start dl_server if this is the first task being enqueued */ + if (rq->scx.nr_running == 1) + dl_server_start(&rq->ext_server); + do_enqueue_task(rq, p, enq_flags, sticky_cpu); out: rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; @@ -2453,7 +2460,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) /* see kick_cpus_irq_workfn() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); - rq_modified_clear(rq); + rq->next_class = &ext_sched_class; rq_unpin_lock(rq, rf); balance_one(rq, prev); @@ -2468,7 +2475,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) * If @force_scx is true, always try to pick a SCHED_EXT task, 
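With debugfs_ext_server_init() above, the ext server exposes per-CPU runtime and period files next to the existing fair_server ones. A user-space sketch of adjusting the cpu0 runtime; the path assumes the usual debugfs mount point and the standard "sched" parent directory, neither of which is created in this hunk.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define EXT_RUNTIME "/sys/kernel/debug/sched/ext_server/cpu0/runtime"

int main(void)
{
	char buf[32];
	int fd = open(EXT_RUNTIME, O_RDWR);
	ssize_t n;

	if (fd < 0) {
		perror(EXT_RUNTIME);
		return 1;
	}

	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("cpu0 ext server runtime: %s", buf);	/* nanoseconds */
	}

	/*
	 * 20 ms as an example value. sched_server_write_common() rejects a
	 * runtime larger than the period, clamps the period to
	 * [100us, ~4.2s], and a runtime of 0 disables the server on that CPU.
	 */
	if (write(fd, "20000000", 8) < 0)
		perror("write");

	close(fd);
	return 0;
}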
* regardless of any higher-priority sched classes activity. */ - if (!force_scx && rq_modified_above(rq, &ext_sched_class)) + if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class)) return RETRY_TASK; keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; @@ -2512,6 +2519,33 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) return do_pick_task_scx(rq, rf, false); } +/* + * Select the next task to run from the ext scheduling class. + * + * Use do_pick_task_scx() directly with @force_scx enabled, since the + * dl_server must always select a sched_ext task. + */ +static struct task_struct * +ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) +{ + if (!scx_enabled()) + return NULL; + + return do_pick_task_scx(dl_se->rq, rf, true); +} + +/* + * Initialize the ext server deadline entity. + */ +void ext_server_init(struct rq *rq) +{ + struct sched_dl_entity *dl_se = &rq->ext_server; + + init_dl_entity(dl_se); + + dl_server_init(dl_se, rq, ext_server_pick_task); +} + #ifdef CONFIG_SCHED_CORE /** * scx_prio_less - Task ordering for core-sched @@ -2619,6 +2653,9 @@ static void set_cpus_allowed_scx(struct task_struct *p, set_cpus_allowed_common(p, ac); + if (task_dead_and_done(p)) + return; + /* * The effective cpumask is stored in @p->cpus_ptr which may temporarily * differ from the configured one in @p->cpus_mask. Always tell the bpf @@ -3034,10 +3071,45 @@ void scx_cancel_fork(struct task_struct *p) percpu_up_read(&scx_fork_rwsem); } +/** + * task_dead_and_done - Is a task dead and done running? + * @p: target task + * + * Once sched_ext_dead() removes the dead task from scx_tasks and exits it, the + * task no longer exists from SCX's POV. However, certain sched_class ops may be + * invoked on these dead tasks leading to failures - e.g. sched_setscheduler() + * may try to switch a task which finished sched_ext_dead() back into SCX + * triggering invalid SCX task state transitions and worse. + * + * Once a task has finished the final switch, sched_ext_dead() is the only thing + * that needs to happen on the task. Use this test to short-circuit sched_class + * operations which may be called on dead tasks. + */ +static bool task_dead_and_done(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); + + /* + * In do_task_dead(), a dying task sets %TASK_DEAD with preemption + * disabled and __schedule(). If @p has %TASK_DEAD set and off CPU, @p + * won't ever run again. + */ + return unlikely(READ_ONCE(p->__state) == TASK_DEAD) && + !task_on_cpu(rq, p); +} + void sched_ext_dead(struct task_struct *p) { unsigned long flags; + /* + * By the time control reaches here, @p has %TASK_DEAD set, switched out + * for the last time and then dropped the rq lock - task_dead_and_done() + * should be returning %true nullifying the straggling sched_class ops. + * Remove from scx_tasks and exit @p. 
+ */ raw_spin_lock_irqsave(&scx_tasks_lock, flags); list_del_init(&p->scx.tasks_node); raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); @@ -3063,6 +3135,9 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p, lockdep_assert_rq_held(task_rq(p)); + if (task_dead_and_done(p)) + return; + p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); if (SCX_HAS_OP(sch, set_weight)) SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, @@ -3077,6 +3152,9 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) { struct scx_sched *sch = scx_root; + if (task_dead_and_done(p)) + return; + scx_enable_task(p); /* @@ -3090,10 +3168,14 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) static void switched_from_scx(struct rq *rq, struct task_struct *p) { + if (task_dead_and_done(p)) + return; + scx_disable_task(p); } -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {} + static void switched_to_scx(struct rq *rq, struct task_struct *p) {} int scx_check_setscheduler(struct task_struct *p, int policy) @@ -3354,8 +3436,6 @@ static void scx_cgroup_unlock(void) {} * their current sched_class. Call them directly from sched core instead. */ DEFINE_SCHED_CLASS(ext) = { - .queue_mask = 1, - .enqueue_task = enqueue_task_scx, .dequeue_task = dequeue_task_scx, .yield_task = yield_task_scx, @@ -3440,8 +3520,8 @@ static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) * operations inside scheduler locks. */ dsq->id = SCX_DSQ_INVALID; - llist_add(&dsq->free_node, &dsqs_to_free); - irq_work_queue(&free_dsq_irq_work); + if (llist_add(&dsq->free_node, &dsqs_to_free)) + irq_work_queue(&free_dsq_irq_work); out_unlock_dsq: raw_spin_unlock_irqrestore(&dsq->lock, flags); @@ -7227,9 +7307,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_exit_bstr) +BTF_ID_FLAGS(func, scx_bpf_error_bstr) +BTF_ID_FLAGS(func, scx_bpf_dump_bstr) BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) @@ -7248,7 +7328,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif BTF_ID_FLAGS(func, scx_bpf_now) -BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_events) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e71302282671..1e22b7fadd70 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -524,10 +524,48 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ +extern void __BUILD_BUG_vruntime_cmp(void); + +/* Use __builtin_strcmp() because of __HAVE_ARCH_STRCMP: */ + +#define vruntime_cmp(A, CMP_STR, B) ({ \ + int __res = 0; \ + \ + if (!__builtin_strcmp(CMP_STR, "<")) { \ + __res = ((s64)((A)-(B)) < 0); \ + } else if (!__builtin_strcmp(CMP_STR, "<=")) { \ + __res = 
((s64)((A)-(B)) <= 0); \ + } else if (!__builtin_strcmp(CMP_STR, ">")) { \ + __res = ((s64)((A)-(B)) > 0); \ + } else if (!__builtin_strcmp(CMP_STR, ">=")) { \ + __res = ((s64)((A)-(B)) >= 0); \ + } else { \ + /* Unknown operator throws linker error: */ \ + __BUILD_BUG_vruntime_cmp(); \ + } \ + \ + __res; \ +}) + +extern void __BUILD_BUG_vruntime_op(void); + +#define vruntime_op(A, OP_STR, B) ({ \ + s64 __res = 0; \ + \ + if (!__builtin_strcmp(OP_STR, "-")) { \ + __res = (s64)((A)-(B)); \ + } else { \ + /* Unknown operator throws linker error: */ \ + __BUILD_BUG_vruntime_op(); \ + } \ + \ + __res; \ +}) + + static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - max_vruntime); - if (delta > 0) + if (vruntime_cmp(vruntime, ">", max_vruntime)) max_vruntime = vruntime; return max_vruntime; @@ -535,8 +573,7 @@ static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - min_vruntime); - if (delta < 0) + if (vruntime_cmp(vruntime, "<", min_vruntime)) min_vruntime = vruntime; return min_vruntime; @@ -549,12 +586,12 @@ static inline bool entity_before(const struct sched_entity *a, * Tiebreak on vruntime seems unnecessary since it can * hardly happen. */ - return (s64)(a->deadline - b->deadline) < 0; + return vruntime_cmp(a->deadline, "<", b->deadline); } static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return (s64)(se->vruntime - cfs_rq->zero_vruntime); + return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime); } #define __node_2_se(node) \ @@ -576,7 +613,7 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * \Sum lag_i = 0 * \Sum w_i * (V - v_i) = 0 - * \Sum w_i * V - w_i * v_i = 0 + * \Sum (w_i * V - w_i * v_i) = 0 * * From which we can solve an expression for V in v_i (which we have in * se->vruntime): @@ -607,11 +644,11 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * Which we track using: * * v0 := cfs_rq->zero_vruntime - * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime - * \Sum w_i := cfs_rq->avg_load + * \Sum (v_i - v0) * w_i := cfs_rq->sum_w_vruntime + * \Sum w_i := cfs_rq->sum_weight * * Since zero_vruntime closely tracks the per-task service, these - * deltas: (v_i - v), will be in the order of the maximal (virtual) lag + * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag * induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. @@ -619,32 +656,32 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * As measured, the max (key * weight) value was ~44 bits for a kernel build. 
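The vruntime_cmp() and vruntime_op() helpers above wrap the one idiom that makes u64 virtual-time arithmetic safe: never compare the raw values, always compare the signed difference, which stays correct across a counter wrap as long as the two timestamps are less than 2^63 apart. A minimal user-space sketch of the idiom (the function and variable names here are illustrative, not kernel code):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Wrap-safe "a happens before b" for free-running u64 virtual times. */
static bool vtime_before(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        /* Ordinary case: no wrap involved. */
        assert(vtime_before(100, 200));

        /*
         * Wrap case: a sits just before the u64 wrap, b just after it.
         * A raw "a < b" comparison would order them the wrong way
         * around; the signed difference still says a is before b.
         */
        uint64_t a = UINT64_MAX - 10;
        uint64_t b = 5;

        assert(a > b);                  /* raw compare is misleading   */
        assert(vtime_before(a, b));     /* signed difference is not    */
        return 0;
}

Centralizing the idiom in one macro also lets an unknown operator string degrade into a link error via __BUILD_BUG_vruntime_cmp(), so a typo cannot silently fall back to an unsigned compare.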
*/ static void -avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long weight = scale_load_down(se->load.weight); s64 key = entity_key(cfs_rq, se); - cfs_rq->avg_vruntime += key * weight; - cfs_rq->avg_load += weight; + cfs_rq->sum_w_vruntime += key * weight; + cfs_rq->sum_weight += weight; } static void -avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long weight = scale_load_down(se->load.weight); s64 key = entity_key(cfs_rq, se); - cfs_rq->avg_vruntime -= key * weight; - cfs_rq->avg_load -= weight; + cfs_rq->sum_w_vruntime -= key * weight; + cfs_rq->sum_weight -= weight; } static inline -void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) { /* - * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load + * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight */ - cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; + cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta; } /* @@ -654,8 +691,8 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; + s64 avg = cfs_rq->sum_w_vruntime; + long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { unsigned long weight = scale_load_down(curr->load.weight); @@ -722,8 +759,8 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; + s64 avg = cfs_rq->sum_w_vruntime; + long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { unsigned long weight = scale_load_down(curr->load.weight); @@ -732,7 +769,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; + return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load; } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -743,9 +780,9 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) static void update_zero_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = avg_vruntime(cfs_rq); - s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); + s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); - avg_vruntime_update(cfs_rq, delta); + sum_w_vruntime_update(cfs_rq, delta); cfs_rq->zero_vruntime = vruntime; } @@ -770,13 +807,12 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } -#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) - static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) { if (node) { struct sched_entity *rse = __node_2_se(node); - if (vruntime_gt(min_vruntime, se, rse)) + + if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime)) se->min_vruntime = rse->min_vruntime; } } @@ -819,7 +855,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - avg_vruntime_add(cfs_rq, se); + sum_w_vruntime_add(cfs_rq, se); update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = 
se->slice; @@ -831,7 +867,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); - avg_vruntime_sub(cfs_rq, se); + sum_w_vruntime_sub(cfs_rq, se); update_zero_vruntime(cfs_rq); } @@ -887,7 +923,7 @@ static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_enti static inline bool protect_slice(struct sched_entity *se) { - return ((s64)(se->vprot - se->vruntime) > 0); + return vruntime_cmp(se->vruntime, "<", se->vprot); } static inline void cancel_protect_slice(struct sched_entity *se) @@ -1024,7 +1060,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); */ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if ((s64)(se->vruntime - se->deadline) < 0) + if (vruntime_cmp(se->vruntime, "<", se->deadline)) return false; /* @@ -1513,7 +1549,7 @@ static unsigned int task_scan_start(struct task_struct *p) /* Scale the maximum scan period with the amount of shared memory. */ rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); @@ -1580,7 +1616,7 @@ pid_t task_numa_group_id(struct task_struct *p) pid_t gid = 0; rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); if (ng) gid = ng->gid; rcu_read_unlock(); @@ -2239,7 +2275,7 @@ static bool task_numa_compare(struct task_numa_env *env, return false; rcu_read_lock(); - cur = rcu_dereference(dst_rq->curr); + cur = rcu_dereference_all(dst_rq->curr); if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) || !cur->mm)) cur = NULL; @@ -2284,7 +2320,7 @@ static bool task_numa_compare(struct task_numa_env *env, * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ - cur_ng = rcu_dereference(cur->numa_group); + cur_ng = rcu_dereference_all(cur->numa_group); if (cur_ng == p_ng) { /* * Do not swap within a group or between tasks that have @@ -2458,11 +2494,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, maymove = !load_too_imbalanced(src_load, dst_load, env); } - for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { - /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) - continue; - + /* Skip CPUs if the source task cannot migrate */ + for_each_cpu_and(cpu, cpumask_of_node(env->dst_nid), env->p->cpus_ptr) { env->dst_cpu = cpu; if (task_numa_compare(env, taskimp, groupimp, maymove)) break; @@ -2499,7 +2532,7 @@ static int task_numa_migrate(struct task_struct *p) * to satisfy here. 
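The task_numa_find_cpu() change above swaps the test-and-continue inside the loop for for_each_cpu_and(), walking the intersection of the destination node's CPUs and the task's allowed mask directly; a later sched_balance_find_dst_group() hunk applies the same idea with cpumask_weight_and() instead of building a temporary mask. A small user-space analogue on single-word masks (illustrative only, not the kernel cpumask API):

#include <stdint.h>
#include <stdio.h>

/* Visit every bit set in both masks, lowest first. */
static void visit_cpus_and(uint64_t a, uint64_t b)
{
        uint64_t both = a & b;

        while (both) {
                int cpu = __builtin_ctzll(both);        /* lowest set bit */

                printf("candidate cpu%d\n", cpu);
                both &= both - 1;                       /* clear that bit */
        }
}

/* Number of bits set in both masks, without a temporary mask. */
static unsigned int weight_and(uint64_t a, uint64_t b)
{
        return (unsigned int)__builtin_popcountll(a & b);
}

int main(void)
{
        uint64_t node_cpus = 0xffULL;   /* CPUs 0-7 belong to the node  */
        uint64_t allowed   = 0xaaULL;   /* task may run on CPUs 1,3,5,7 */

        visit_cpus_and(node_cpus, allowed);
        printf("weight = %u\n", weight_and(node_cpus, allowed));
        return 0;
}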
*/ rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); + sd = rcu_dereference_all(per_cpu(sd_numa, env.src_cpu)); if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; env.imb_numa_nr = sd->imb_numa_nr; @@ -2860,6 +2893,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) } static void task_numa_placement(struct task_struct *p) + __context_unsafe(/* conditional locking */) { int seq, nid, max_nid = NUMA_NO_NODE; unsigned long max_faults = 0; @@ -3022,7 +3056,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!cpupid_match_pid(tsk, cpupid)) goto no_join; - grp = rcu_dereference(tsk->numa_group); + grp = rcu_dereference_all(tsk->numa_group); if (!grp) goto no_join; @@ -3693,7 +3727,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ #define add_positive(_ptr, _val) do { \ typeof(_ptr) ptr = (_ptr); \ - typeof(_val) val = (_val); \ + __signed_scalar_typeof(*ptr) val = (_val); \ typeof(*ptr) res, var = READ_ONCE(*ptr); \ \ res = var + val; \ @@ -3705,23 +3739,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } while (0) /* - * Unsigned subtract and clamp on underflow. - * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. - */ -#define sub_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(*ptr) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - res = var - val; \ - if (res > var) \ - res = 0; \ - WRITE_ONCE(*ptr, res); \ -} while (0) - -/* * Remove and clamp on negative, from a local variable. * * A variant of sub_positive(), which does not use explicit load-store @@ -3732,21 +3749,39 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) *ptr -= min_t(typeof(*ptr), *ptr, _val); \ } while (0) + +/* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * + * Check that util_sum is still above its lower bound for the new + * util_avg. 
Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ +#define __update_sa(sa, name, delta_avg, delta_sum) do { \ + add_positive(&(sa)->name##_avg, delta_avg); \ + add_positive(&(sa)->name##_sum, delta_sum); \ + (sa)->name##_sum = max_t(typeof((sa)->name##_sum), \ + (sa)->name##_sum, \ + (sa)->name##_avg * PELT_MIN_DIVIDER); \ +} while (0) + static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->avg.load_avg += se->avg.load_avg; - cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; + __update_sa(&cfs_rq->avg, load, se->avg.load_avg, + se_weight(se) * se->avg.load_sum); } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, - cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, load, -se->avg.load_avg, + se_weight(se) * -se->avg.load_sum); } static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -4242,7 +4277,6 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); - /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; new_sum = se->avg.util_avg * divider; @@ -4250,12 +4284,7 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta_avg); - add_positive(&cfs_rq->avg.util_sum, delta_sum); - - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, - cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, util, delta_avg, delta_sum); } static inline void @@ -4281,11 +4310,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta_avg); - add_positive(&cfs_rq->avg.runnable_sum, delta_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, - cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, runnable, delta_avg, delta_sum); } static inline void @@ -4349,11 +4374,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; - add_positive(&cfs_rq->avg.load_avg, delta_avg); - add_positive(&cfs_rq->avg.load_sum, delta_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, - cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, load, delta_avg, delta_sum); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -4450,7 +4471,7 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se) rq = rq_of(cfs_rq); rcu_read_lock(); - is_idle = is_idle_task(rcu_dereference(rq->curr)); + is_idle = is_idle_task(rcu_dereference_all(rq->curr)); rcu_read_unlock(); /* @@ -4552,33 +4573,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) raw_spin_unlock(&cfs_rq->removed.lock); r = removed_load; - sub_positive(&sa->load_avg, r); - sub_positive(&sa->load_sum, r * 
divider); - /* See sa->util_sum below */ - sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); + __update_sa(sa, load, -r, -r*divider); r = removed_util; - sub_positive(&sa->util_avg, r); - sub_positive(&sa->util_sum, r * divider); - /* - * Because of rounding, se->util_sum might ends up being +1 more than - * cfs->util_sum. Although this is not a problem by itself, detaching - * a lot of tasks with the rounding problem between 2 updates of - * util_avg (~1ms) can make cfs->util_sum becoming null whereas - * cfs_util_avg is not. - * Check that util_sum is still above its lower bound for the new - * util_avg. Given that period_contrib might have moved since the last - * sync, we are only sure that util_sum must be above or equal to - * util_avg * minimum possible divider - */ - sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); + __update_sa(sa, util, -r, -r*divider); r = removed_runnable; - sub_positive(&sa->runnable_avg, r); - sub_positive(&sa->runnable_sum, r * divider); - /* See sa->util_sum above */ - sa->runnable_sum = max_t(u32, sa->runnable_sum, - sa->runnable_avg * PELT_MIN_DIVIDER); + __update_sa(sa, runnable, -r, -r*divider); /* * removed_runnable is the unweighted version of removed_load so we @@ -4663,17 +4664,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { dequeue_load_avg(cfs_rq, se); - sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, - cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); - - sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, - cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, util, -se->avg.util_avg, -se->avg.util_sum); + __update_sa(&cfs_rq->avg, runnable, -se->avg.runnable_avg, -se->avg.runnable_sum); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -4781,7 +4773,8 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf); +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) + __must_hold(__rq_lockp(this_rq)); static inline unsigned long task_util(struct task_struct *p) { @@ -5175,7 +5168,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * vl_i = (W + w_i)*vl'_i / W */ - load = cfs_rq->avg_load; + load = cfs_rq->sum_weight; if (curr && curr->on_rq) load += scale_load_down(curr->load.weight); @@ -6188,6 +6181,7 @@ next: * used to track this state. 
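The __update_sa() helper above folds the repeated add_positive() pairs plus the PELT_MIN_DIVIDER clamp into one place: both the *_avg and *_sum members take a signed delta, underflow is clamped at zero, and the sum is then floored at avg * PELT_MIN_DIVIDER so detach rounding can never leave a zero sum next to a non-zero average. A stand-alone sketch of that pattern (simplified types; MIN_DIVIDER is an illustrative stand-in for PELT_MIN_DIVIDER):

#include <stdint.h>
#include <stdio.h>

#define MIN_DIVIDER     1024    /* illustrative stand-in */

struct sig_avg {
        uint64_t avg;
        uint64_t sum;
};

/* Add a possibly negative delta, clamping the result at zero. */
static void add_positive(uint64_t *ptr, int64_t delta)
{
        uint64_t res = *ptr + (uint64_t)delta;

        /* Underflow shows up as res > *ptr when the delta is negative. */
        if (delta < 0 && res > *ptr)
                res = 0;
        *ptr = res;
}

static void update_sa(struct sig_avg *sa, int64_t d_avg, int64_t d_sum)
{
        add_positive(&sa->avg, d_avg);
        add_positive(&sa->sum, d_sum);

        /* Never let rounding leave sum below avg * minimum divider. */
        if (sa->sum < sa->avg * MIN_DIVIDER)
                sa->sum = sa->avg * MIN_DIVIDER;
}

int main(void)
{
        struct sig_avg sa = { .avg = 3, .sum = 3 * MIN_DIVIDER + 1 };

        /* Detach takes more sum than avg would allow: sum is re-floored. */
        update_sa(&sa, -1, -(int64_t)(2 * MIN_DIVIDER));
        printf("avg=%llu sum=%llu\n",
               (unsigned long long)sa.avg, (unsigned long long)sa.sum);
        return 0;
}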
*/ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) + __must_hold(&cfs_b->lock) { int throttled; @@ -7147,8 +7141,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); static struct { cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - int has_blocked; /* Idle CPUS has blocked load */ + int has_blocked_load; /* Idle CPUS has blocked load */ int needs_update; /* Newly idle CPUs need their next_balance collated */ unsigned long next_balance; /* in jiffy units */ unsigned long next_blocked; /* Next update of blocked load in jiffies */ @@ -7506,7 +7499,7 @@ static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) WRITE_ONCE(sds->has_idle_cores, val); } @@ -7515,7 +7508,7 @@ static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) return READ_ONCE(sds->has_idle_cores); @@ -7644,7 +7637,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); if (sched_feat(SIS_UTIL)) { - sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target)); if (sd_share) { /* because !--nr is the condition to stop scan */ nr = READ_ONCE(sd_share->nr_idle_scan) + 1; @@ -7850,7 +7843,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * sd_asym_cpucapacity rather than sd_llc. */ if (sched_asym_cpucap_active()) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive * cpuset defines a symmetric island (i.e. one unique @@ -7865,7 +7858,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } } - sd = rcu_dereference(per_cpu(sd_llc, target)); + sd = rcu_dereference_all(per_cpu(sd_llc, target)); if (!sd) return target; @@ -8334,7 +8327,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct energy_env eenv; rcu_read_lock(); - pd = rcu_dereference(rd->pd); + pd = rcu_dereference_all(rd->pd); if (!pd) goto unlock; @@ -8342,7 +8335,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * Energy-aware wake-up happens on the lowest sched_domain starting * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. 
*/ - sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); + sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity)); while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) @@ -8365,9 +8358,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) int max_spare_cap_cpu = -1; int fits, max_fits = -1; - cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); - - if (cpumask_empty(cpus)) + if (!cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask)) continue; /* Account external pressure for the energy estimation */ @@ -8744,7 +8735,7 @@ preempt_sync(struct rq *rq, int wake_flags, /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) +static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags) { enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; @@ -8752,6 +8743,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; + /* + * XXX Getting preempted by higher class, try and find idle CPU? + */ + if (p->sched_class != &fair_sched_class) + return; + if (unlikely(se == pse)) return; @@ -8828,16 +8825,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if ((wake_flags & WF_FORK) || pse->sched_delayed) return; - /* - * If @p potentially is completing work required by current then - * consider preemption. - * - * Reschedule if waker is no longer eligible. */ - if (in_task() && !entity_eligible(cfs_rq, se)) { - preempt_action = PREEMPT_WAKEUP_RESCHED; - goto preempt; - } - /* Prefer picking wakee soon if appropriate. 
*/ if (sched_feat(NEXT_BUDDY) && set_preempt_buddy(cfs_rq, wake_flags, pse, se)) { @@ -8919,6 +8906,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + __must_hold(__rq_lockp(rq)) { struct sched_entity *se; struct task_struct *p; @@ -8995,12 +8983,6 @@ idle: goto again; } - /* - * rq is about to be idle, check if we need to update the - * lost_idle_time of clock_pelt - */ - update_idle_rq_clock_pelt(rq); - return NULL; } @@ -9349,7 +9331,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) */ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - struct numa_group *numa_group = rcu_dereference(p->numa_group); + struct numa_group *numa_group = rcu_dereference_all(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; @@ -9778,7 +9760,7 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_NO_HZ_COMMON -static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) +static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { if (cfs_rq->avg.load_avg) return true; @@ -9811,16 +9793,16 @@ static inline void update_blocked_load_tick(struct rq *rq) WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); } -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) { - if (!has_blocked) + if (!has_blocked_load) rq->has_blocked_load = 0; } #else /* !CONFIG_NO_HZ_COMMON: */ -static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } +static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} +static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) {} #endif /* !CONFIG_NO_HZ_COMMON */ static bool __update_blocked_others(struct rq *rq, bool *done) @@ -9877,7 +9859,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) list_del_leaf_cfs_rq(cfs_rq); /* Don't need periodic decay once load/util_avg are null */ - if (cfs_rq_has_blocked(cfs_rq)) + if (cfs_rq_has_blocked_load(cfs_rq)) *done = false; } @@ -9937,7 +9919,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) bool decayed; decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); - if (cfs_rq_has_blocked(cfs_rq)) + if (cfs_rq_has_blocked_load(cfs_rq)) *done = false; return decayed; @@ -9949,23 +9931,27 @@ static unsigned long task_h_load(struct task_struct *p) } #endif /* !CONFIG_FAIR_GROUP_SCHED */ -static void sched_balance_update_blocked_averages(int cpu) +static void __sched_balance_update_blocked_averages(struct rq *rq) { bool decayed = false, done = true; - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; - rq_lock_irqsave(rq, &rf); update_blocked_load_tick(rq); - update_rq_clock(rq); decayed |= __update_blocked_others(rq, &done); decayed |= __update_blocked_fair(rq, &done); - update_blocked_load_status(rq, !done); + update_has_blocked_load_status(rq, !done); if (decayed) cpufreq_update_util(rq, 0); - rq_unlock_irqrestore(rq, &rf); +} + +static void sched_balance_update_blocked_averages(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + guard(rq_lock_irqsave)(rq); + update_rq_clock(rq); + 
__sched_balance_update_blocked_averages(rq); } /********** Helpers for sched_balance_find_src_group ************************/ @@ -10975,10 +10961,9 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int * take care of it. */ if (p->nr_cpus_allowed != NR_CPUS) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); - - cpumask_and(cpus, sched_group_span(local), p->cpus_ptr); - imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr); + unsigned int w = cpumask_weight_and(p->cpus_ptr, + sched_group_span(local)); + imb_numa_nr = min(w, sd->imb_numa_nr); } imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus); @@ -11025,7 +11010,7 @@ static void update_idle_cpu_scan(struct lb_env *env, if (env->sd->span_weight != llc_weight) return; - sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu)); if (!sd_share) return; @@ -11375,7 +11360,7 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) goto force_balance; if (!is_rd_overutilized(env->dst_rq->rd) && - rcu_dereference(env->dst_rq->rd->pd)) + rcu_dereference_all(env->dst_rq->rd->pd)) goto out_balanced; /* ASYM feature bypasses nice load balance check */ @@ -12443,20 +12428,29 @@ static void nohz_balancer_kick(struct rq *rq) */ nohz_balance_exit_idle(rq); - /* - * None are in tickless mode and hence no need for NOHZ idle load - * balancing: - */ - if (likely(!atomic_read(&nohz.nr_cpus))) - return; - - if (READ_ONCE(nohz.has_blocked) && + if (READ_ONCE(nohz.has_blocked_load) && time_after(now, READ_ONCE(nohz.next_blocked))) flags = NOHZ_STATS_KICK; + /* + * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0 + * Skip the read if time is not due. + * + * If none are in tickless mode, there maybe a narrow window + * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called. + * But idle load balancing is not done as find_new_ilb fails. + * That's very rare. So read nohz.nr_cpus only if time is due. 
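The reworked sched_balance_update_blocked_averages() above takes the runqueue lock with guard(rq_lock_irqsave)(), so the lock and IRQ state are restored automatically on every return path. The scope-based release rests on the compiler's cleanup attribute; here is a stand-alone pthread sketch of that mechanism (the scoped_mutex_guard() macro is made up for illustration and is not the kernel's cleanup.h API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int counter;

/* Cleanup callback: receives a pointer to the guarded variable. */
static void mutex_unlocker(pthread_mutex_t **m)
{
        pthread_mutex_unlock(*m);
}

/*
 * Take @m now and release it automatically whenever the enclosing
 * scope is left, including early returns. One guard per scope.
 */
#define scoped_mutex_guard(m) \
        pthread_mutex_t *__scope_guard \
                __attribute__((cleanup(mutex_unlocker))) = (m); \
        pthread_mutex_lock(__scope_guard)

static int bump(void)
{
        scoped_mutex_guard(&lock);

        counter++;
        if (counter & 1)
                return -1;      /* early return: unlock still runs */

        return 0;               /* normal return: unlock still runs */
}

int main(void)
{
        bump();
        printf("counter=%d\n", counter);        /* prints counter=1 */
        return 0;
}

The kernel's guard() helpers are built on the same attribute, which is why converting a function lets the explicit unlock on each exit path disappear.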
+ */ if (time_before(now, nohz.next_balance)) goto out; + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing + */ + if (unlikely(cpumask_empty(nohz.idle_cpus_mask))) + return; + if (rq->nr_running >= 2) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto out; @@ -12464,7 +12458,7 @@ static void nohz_balancer_kick(struct rq *rq) rcu_read_lock(); - sd = rcu_dereference(rq->sd); + sd = rcu_dereference_all(rq->sd); if (sd) { /* * If there's a runnable CFS task and the current CPU has reduced @@ -12476,7 +12470,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); + sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu)); if (sd) { /* * When ASYM_PACKING; see if there's a more preferred CPU @@ -12494,7 +12488,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu)); if (sd) { /* * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU @@ -12515,7 +12509,7 @@ static void nohz_balancer_kick(struct rq *rq) goto unlock; } - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) { /* * If there is an imbalance between LLC domains (IOW we could @@ -12547,7 +12541,7 @@ static void set_cpu_sd_state_busy(int cpu) struct sched_domain *sd; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); + sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || !sd->nohz_idle) goto unlock; @@ -12567,7 +12561,6 @@ void nohz_balance_exit_idle(struct rq *rq) rq->nohz_tick_stopped = 0; cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); set_cpu_sd_state_busy(rq->cpu); } @@ -12577,7 +12570,7 @@ static void set_cpu_sd_state_idle(int cpu) struct sched_domain *sd; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); + sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || sd->nohz_idle) goto unlock; @@ -12611,9 +12604,9 @@ void nohz_balance_enter_idle(int cpu) /* * The tick is still stopped but load could have been added in the - * meantime. We set the nohz.has_blocked flag to trig a check of the + * meantime. We set the nohz.has_blocked_load flag to trig a check of the * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear - * of nohz.has_blocked can only happen after checking the new load + * of nohz.has_blocked_load can only happen after checking the new load */ if (rq->nohz_tick_stopped) goto out; @@ -12625,11 +12618,10 @@ void nohz_balance_enter_idle(int cpu) rq->nohz_tick_stopped = 1; cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); /* * Ensures that if nohz_idle_balance() fails to observe our - * @idle_cpus_mask store, it must observe the @has_blocked + * @idle_cpus_mask store, it must observe the @has_blocked_load * and @needs_update stores. */ smp_mb__after_atomic(); @@ -12642,7 +12634,7 @@ out: * Each time a cpu enter idle, we assume that it has blocked load and * enable the periodic update of the load of idle CPUs */ - WRITE_ONCE(nohz.has_blocked, 1); + WRITE_ONCE(nohz.has_blocked_load, 1); } static bool update_nohz_stats(struct rq *rq) @@ -12683,8 +12675,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) /* * We assume there will be no idle load after this update and clear - * the has_blocked flag. If a cpu enters idle in the mean time, it will - * set the has_blocked flag and trigger another update of idle load. 
+ * the has_blocked_load flag. If a cpu enters idle in the mean time, it will + * set the has_blocked_load flag and trigger another update of idle load. * Because a cpu that becomes idle, is added to idle_cpus_mask before * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. @@ -12692,12 +12684,12 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) * Same applies to idle_cpus_mask vs needs_update. */ if (flags & NOHZ_STATS_KICK) - WRITE_ONCE(nohz.has_blocked, 0); + WRITE_ONCE(nohz.has_blocked_load, 0); if (flags & NOHZ_NEXT_KICK) WRITE_ONCE(nohz.needs_update, 0); /* - * Ensures that if we miss the CPU, we must see the has_blocked + * Ensures that if we miss the CPU, we must see the has_blocked_load * store from nohz_balance_enter_idle(). */ smp_mb(); @@ -12764,7 +12756,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) abort: /* There is still blocked load, enable periodic update */ if (has_blocked_load) - WRITE_ONCE(nohz.has_blocked, 1); + WRITE_ONCE(nohz.has_blocked_load, 1); } /* @@ -12826,7 +12818,7 @@ static void nohz_newidle_balance(struct rq *this_rq) return; /* Don't need to update blocked load of idle CPUs*/ - if (!READ_ONCE(nohz.has_blocked) || + if (!READ_ONCE(nohz.has_blocked_load) || time_before(jiffies, READ_ONCE(nohz.next_blocked))) return; @@ -12858,6 +12850,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * > 0 - success, new (fair) tasks present */ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) + __must_hold(__rq_lockp(this_rq)) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -12896,29 +12889,28 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) */ rq_unpin_lock(this_rq, rf); - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (!sd) { - rcu_read_unlock(); + sd = rcu_dereference_sched_domain(this_rq->sd); + if (!sd) goto out; - } if (!get_rd_overloaded(this_rq->rd) || this_rq->avg_idle < sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); - rcu_read_unlock(); goto out; } - rcu_read_unlock(); - - rq_modified_clear(this_rq); - raw_spin_rq_unlock(this_rq); + /* + * Include sched_balance_update_blocked_averages() in the cost + * calculation because it can be quite costly -- this ensures we skip + * it when avg_idle gets to be very low. + */ t0 = sched_clock_cpu(this_cpu); - sched_balance_update_blocked_averages(this_cpu); + __sched_balance_update_blocked_averages(this_rq); + + this_rq->next_class = &fair_sched_class; + raw_spin_rq_unlock(this_rq); - rcu_read_lock(); for_each_domain(this_cpu, sd) { u64 domain_cost; @@ -12968,7 +12960,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (pulled_task || !continue_balancing) break; } - rcu_read_unlock(); raw_spin_rq_lock(this_rq); @@ -12984,7 +12975,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) pulled_task = 1; /* If a higher prio class was modified, restart the pick */ - if (rq_modified_above(this_rq, &fair_sched_class)) + if (sched_class_above(this_rq->next_class, &fair_sched_class)) pulled_task = -1; out: @@ -13335,8 +13326,8 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, * zero_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). 
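The cfs_prio_less() comparison described above (and rewritten just below in terms of vruntime_op()) has to compare entities that live on different runqueues, each measured against its own zero_vruntime_fi. Rewriting (va - za) < (vb - zb) as (va - vb) + (zb - za) < 0 keeps every subtraction in the wrap-safe signed-difference form. A tiny worked sketch with made-up numbers:

#include <assert.h>
#include <stdint.h>

/*
 * Compare virtual service across two queues: entity A ran (va - za)
 * past its queue's zero point, entity B ran (vb - zb) past its own.
 * The rearranged form keeps both subtractions wrap safe.
 */
static int64_t cross_queue_delta(uint64_t va, uint64_t za,
                                 uint64_t vb, uint64_t zb)
{
        /* (va - za) - (vb - zb) == (va - vb) + (zb - za) */
        return (int64_t)(va - vb) + (int64_t)(zb - za);
}

int main(void)
{
        /*
         * Queue A: zero point 1000, entity at 1300 -> 300 of service.
         * Queue B: zero point 5000, entity at 5200 -> 200 of service.
         * Positive delta: the A entity is further along than the B one.
         */
        assert(cross_queue_delta(1300, 1000, 5200, 5000) == 100);
        return 0;
}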
*/ - delta = (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); + delta = vruntime_op(sea->vruntime, "-", seb->vruntime) + + vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi); return delta > 0; } @@ -13374,6 +13365,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } + if (queued) { + if (!need_resched()) + hrtick_start_fair(rq, curr); + return; + } + if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); @@ -13882,15 +13879,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * All the scheduling class methods: */ DEFINE_SCHED_CLASS(fair) = { - - .queue_mask = 2, - .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .wakeup_preempt = check_preempt_wakeup_fair, + .wakeup_preempt = wakeup_preempt_fair, .pick_task = pick_task_fair, .pick_next_task = pick_next_task_fair, @@ -13950,7 +13944,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) struct numa_group *ng; rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 980d92bab8ab..136a6584be79 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -29,7 +29,7 @@ SCHED_FEAT(PREEMPT_SHORT, true) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. */ -SCHED_FEAT(NEXT_BUDDY, true) +SCHED_FEAT(NEXT_BUDDY, false) /* * Allow completely ignoring cfs_rq->next; which can be set from various diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c174afe1dd17..3681b6ad9276 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -460,6 +460,7 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct t { update_curr_idle(rq); scx_update_idle(rq, false, true); + update_rq_avg_idle(rq); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) @@ -468,6 +469,12 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir scx_update_idle(rq, true, true); schedstat_inc(rq->sched_goidle); next->se.exec_start = rq_clock_task(rq); + + /* + * rq is about to be idle, check if we need to update the + * lost_idle_time of clock_pelt + */ + update_idle_rq_clock_pelt(rq); } struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) @@ -530,15 +537,15 @@ static void update_curr_idle(struct rq *rq) se->exec_start = now; dl_server_update_idle(&rq->fair_server, delta_exec); +#ifdef CONFIG_SCHED_CLASS_EXT + dl_server_update_idle(&rq->ext_server, delta_exec); +#endif } /* * Simple, special scheduling class for the per-CPU idle tasks: */ DEFINE_SCHED_CLASS(idle) = { - - .queue_mask = 0, - /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 3ad0d6df6a0a..3b725d39c06e 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -8,9 +8,11 @@ * */ #include <linux/sched/isolation.h> +#include <linux/pci.h> #include "sched.h" enum hk_flags { + HK_FLAG_DOMAIN_BOOT = BIT(HK_TYPE_DOMAIN_BOOT), HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), 
HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE), @@ -20,7 +22,7 @@ DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); EXPORT_SYMBOL_GPL(housekeeping_overridden); struct housekeeping { - cpumask_var_t cpumasks[HK_TYPE_MAX]; + struct cpumask __rcu *cpumasks[HK_TYPE_MAX]; unsigned long flags; }; @@ -28,21 +30,62 @@ static struct housekeeping housekeeping; bool housekeeping_enabled(enum hk_type type) { - return !!(housekeeping.flags & BIT(type)); + return !!(READ_ONCE(housekeeping.flags) & BIT(type)); } EXPORT_SYMBOL_GPL(housekeeping_enabled); +static bool housekeeping_dereference_check(enum hk_type type) +{ + if (IS_ENABLED(CONFIG_LOCKDEP) && type == HK_TYPE_DOMAIN) { + /* Cpuset isn't even writable yet? */ + if (system_state <= SYSTEM_SCHEDULING) + return true; + + /* CPU hotplug write locked, so cpuset partition can't be overwritten */ + if (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_write_held()) + return true; + + /* Cpuset lock held, partitions not writable */ + if (IS_ENABLED(CONFIG_CPUSETS) && lockdep_is_cpuset_held()) + return true; + + return false; + } + + return true; +} + +static inline struct cpumask *housekeeping_cpumask_dereference(enum hk_type type) +{ + return rcu_dereference_all_check(housekeeping.cpumasks[type], + housekeeping_dereference_check(type)); +} + +const struct cpumask *housekeeping_cpumask(enum hk_type type) +{ + const struct cpumask *mask = NULL; + + if (static_branch_unlikely(&housekeeping_overridden)) { + if (READ_ONCE(housekeeping.flags) & BIT(type)) + mask = housekeeping_cpumask_dereference(type); + } + if (!mask) + mask = cpu_possible_mask; + return mask; +} +EXPORT_SYMBOL_GPL(housekeeping_cpumask); + int housekeeping_any_cpu(enum hk_type type) { int cpu; if (static_branch_unlikely(&housekeeping_overridden)) { if (housekeeping.flags & BIT(type)) { - cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id()); + cpu = sched_numa_find_closest(housekeeping_cpumask(type), smp_processor_id()); if (cpu < nr_cpu_ids) return cpu; - cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask); + cpu = cpumask_any_and_distribute(housekeeping_cpumask(type), cpu_online_mask); if (likely(cpu < nr_cpu_ids)) return cpu; /* @@ -58,32 +101,69 @@ int housekeeping_any_cpu(enum hk_type type) } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); -const struct cpumask *housekeeping_cpumask(enum hk_type type) -{ - if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping.flags & BIT(type)) - return housekeeping.cpumasks[type]; - return cpu_possible_mask; -} -EXPORT_SYMBOL_GPL(housekeeping_cpumask); - void housekeeping_affine(struct task_struct *t, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) if (housekeeping.flags & BIT(type)) - set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]); + set_cpus_allowed_ptr(t, housekeeping_cpumask(type)); } EXPORT_SYMBOL_GPL(housekeeping_affine); bool housekeeping_test_cpu(int cpu, enum hk_type type) { - if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping.flags & BIT(type)) - return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]); + if (static_branch_unlikely(&housekeeping_overridden) && + READ_ONCE(housekeeping.flags) & BIT(type)) + return cpumask_test_cpu(cpu, housekeeping_cpumask(type)); return true; } EXPORT_SYMBOL_GPL(housekeeping_test_cpu); +int housekeeping_update(struct cpumask *isol_mask) +{ + struct cpumask *trial, *old = NULL; + int err; + + lockdep_assert_cpus_held(); + + trial = kmalloc(cpumask_size(), GFP_KERNEL); + if (!trial) + 
return -ENOMEM; + + cpumask_andnot(trial, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT), isol_mask); + if (!cpumask_intersects(trial, cpu_online_mask)) { + kfree(trial); + return -EINVAL; + } + + if (!housekeeping.flags) + static_branch_enable_cpuslocked(&housekeeping_overridden); + + if (housekeeping.flags & HK_FLAG_DOMAIN) + old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN); + else + WRITE_ONCE(housekeeping.flags, housekeeping.flags | HK_FLAG_DOMAIN); + rcu_assign_pointer(housekeeping.cpumasks[HK_TYPE_DOMAIN], trial); + + synchronize_rcu(); + + pci_probe_flush_workqueue(); + mem_cgroup_flush_workqueue(); + vmstat_flush_workqueue(); + + err = workqueue_unbound_housekeeping_update(housekeeping_cpumask(HK_TYPE_DOMAIN)); + WARN_ON_ONCE(err < 0); + + err = tmigr_isolated_exclude_cpumask(isol_mask); + WARN_ON_ONCE(err < 0); + + err = kthreads_update_housekeeping(); + WARN_ON_ONCE(err < 0); + + kfree(old); + + return 0; +} + void __init housekeeping_init(void) { enum hk_type type; @@ -95,20 +175,33 @@ void __init housekeeping_init(void) if (housekeeping.flags & HK_FLAG_KERNEL_NOISE) sched_tick_offload_init(); - + /* + * Realloc with a proper allocator so that any cpumask update + * can indifferently free the old version with kfree(). + */ for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) { + struct cpumask *omask, *nmask = kmalloc(cpumask_size(), GFP_KERNEL); + + if (WARN_ON_ONCE(!nmask)) + return; + + omask = rcu_dereference(housekeeping.cpumasks[type]); + /* We need at least one CPU to handle housekeeping work */ - WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type])); + WARN_ON_ONCE(cpumask_empty(omask)); + cpumask_copy(nmask, omask); + RCU_INIT_POINTER(housekeeping.cpumasks[type], nmask); + memblock_free(omask, cpumask_size()); } } static void __init housekeeping_setup_type(enum hk_type type, cpumask_var_t housekeeping_staging) { + struct cpumask *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES); - alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]); - cpumask_copy(housekeeping.cpumasks[type], - housekeeping_staging); + cpumask_copy(mask, housekeeping_staging); + RCU_INIT_POINTER(housekeeping.cpumasks[type], mask); } static int __init housekeeping_setup(char *str, unsigned long flags) @@ -161,7 +254,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) { if (!cpumask_equal(housekeeping_staging, - housekeeping.cpumasks[type])) { + housekeeping_cpumask(type))) { pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); goto free_housekeeping_staging; } @@ -182,7 +275,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN); first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 
0 : cpumask_first_and_and(cpu_present_mask, - housekeeping_staging, housekeeping.cpumasks[type]); + housekeeping_staging, housekeeping_cpumask(type)); if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) { pr_warn("Housekeeping: must include one present CPU " "neither in nohz_full= nor in isolcpus=domain, " @@ -239,7 +332,7 @@ static int __init housekeeping_isolcpus_setup(char *str) if (!strncmp(str, "domain,", 7)) { str += 7; - flags |= HK_FLAG_DOMAIN; + flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT; continue; } @@ -269,7 +362,7 @@ static int __init housekeeping_isolcpus_setup(char *str) /* Default behaviour for isolcpus without flags */ if (!flags) - flags |= HK_FLAG_DOMAIN; + flags |= HK_FLAG_DOMAIN | HK_FLAG_DOMAIN_BOOT; return housekeeping_setup(str, flags); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f1867fe8e5c5..a7680477fa6f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; + /* + * XXX If we're preempted by DL, queue a push? + */ + if (p->sched_class != &rt_sched_class) + return; + if (p->prio < donor->prio) { resched_curr(rq); return; @@ -2100,6 +2106,7 @@ static void push_rt_tasks(struct rq *rq) */ static int rto_next_cpu(struct root_domain *rd) { + int this_cpu = smp_processor_id(); int next; int cpu; @@ -2123,6 +2130,10 @@ static int rto_next_cpu(struct root_domain *rd) rd->rto_cpu = cpu; + /* Do not send IPI to self */ + if (cpu == this_cpu) + continue; + if (cpu < nr_cpu_ids) return cpu; @@ -2568,9 +2579,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) #endif /* CONFIG_SCHED_CORE */ DEFINE_SCHED_CLASS(rt) = { - - .queue_mask = 4, - .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 93fce4bbff5e..b82fb70a9d54 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -30,6 +30,7 @@ #include <linux/context_tracking.h> #include <linux/cpufreq.h> #include <linux/cpumask_api.h> +#include <linux/cpuset.h> #include <linux/ctype.h> #include <linux/file.h> #include <linux/fs_api.h> @@ -42,6 +43,8 @@ #include <linux/ktime_api.h> #include <linux/lockdep_api.h> #include <linux/lockdep.h> +#include <linux/memblock.h> +#include <linux/memcontrol.h> #include <linux/minmax.h> #include <linux/mm.h> #include <linux/module.h> @@ -65,6 +68,7 @@ #include <linux/types.h> #include <linux/u64_stats_sync_api.h> #include <linux/uaccess.h> +#include <linux/vmstat.h> #include <linux/wait_api.h> #include <linux/wait_bit.h> #include <linux/workqueue_api.h> @@ -82,9 +86,8 @@ struct rt_rq; struct sched_group; struct cpuidle_state; -#ifdef CONFIG_PARAVIRT +#if defined(CONFIG_PARAVIRT) && !defined(CONFIG_HAVE_PV_STEAL_CLOCK_GEN) # include <asm/paravirt.h> -# include <asm/paravirt_api_clock.h> #endif #include <asm/barrier.h> @@ -414,6 +417,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, extern void sched_init_dl_servers(void); extern void fair_server_init(struct rq *rq); +extern void ext_server_init(struct rq *rq); extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); extern int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init); @@ -670,16 +674,16 @@ struct balance_callback { void (*func)(struct rq *rq); }; -/* CFS-related fields in a runqueue */ +/* Fair scheduling SCHED_{NORMAL,BATCH,IDLE} related fields in a runqueue: */ 
struct cfs_rq { struct load_weight load; unsigned int nr_queued; - unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int h_nr_idle; /* SCHED_IDLE */ + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ - s64 avg_vruntime; - u64 avg_load; + s64 sum_w_vruntime; + u64 sum_weight; u64 zero_vruntime; #ifdef CONFIG_SCHED_CORE @@ -690,7 +694,7 @@ struct cfs_rq { struct rb_root_cached tasks_timeline; /* - * 'curr' points to currently running entity on this cfs_rq. + * 'curr' points to the currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; @@ -726,9 +730,7 @@ struct cfs_rq { unsigned long h_load; u64 last_h_load_update; struct sched_entity *h_load_next; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ /* @@ -741,19 +743,19 @@ struct cfs_rq { */ int on_list; struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + struct task_group *tg; /* Group that "owns" this runqueue */ /* Locally cached copy of our task_group's idle value */ int idle; -#ifdef CONFIG_CFS_BANDWIDTH +# ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; u64 throttled_pelt_idle; -#ifndef CONFIG_64BIT +# ifndef CONFIG_64BIT u64 throttled_pelt_idle_copy; -#endif +# endif u64 throttled_clock; u64 throttled_clock_pelt; u64 throttled_clock_pelt_time; @@ -765,7 +767,7 @@ struct cfs_rq { struct list_head throttled_list; struct list_head throttled_csd_list; struct list_head throttled_limbo_list; -#endif /* CONFIG_CFS_BANDWIDTH */ +# endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -1117,28 +1119,50 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); * acquire operations must be ordered by ascending &runqueue. */ struct rq { - /* runqueue lock: */ - raw_spinlock_t __lock; - - /* Per class runqueue modification mask; bits in class order. */ - unsigned int queue_mask; + /* + * The following members are loaded together, without holding the + * rq->lock, in an extremely hot loop in update_sg_lb_stats() + * (called from pick_next_task()). To reduce cache pollution from + * this operation, they are placed together on this dedicated cache + * line. Even though some of them are frequently modified, they are + * loaded much more frequently than they are stored. + */ unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; - unsigned int numa_migrate_on; #endif + unsigned int ttwu_pending; + unsigned long cpu_capacity; +#ifdef CONFIG_SCHED_PROXY_EXEC + struct task_struct __rcu *donor; /* Scheduling context */ + struct task_struct __rcu *curr; /* Execution context */ +#else + union { + struct task_struct __rcu *donor; /* Scheduler context */ + struct task_struct __rcu *curr; /* Execution context */ + }; +#endif + struct task_struct *idle; + /* padding left here deliberately */ + + /* + * The next cacheline holds the (hot) runqueue lock, as well as + * some other less performance-critical fields. 
+ */ + u64 nr_switches ____cacheline_aligned; + + /* runqueue lock: */ + raw_spinlock_t __lock; + #ifdef CONFIG_NO_HZ_COMMON - unsigned long last_blocked_load_update_tick; - unsigned int has_blocked_load; - call_single_data_t nohz_csd; unsigned int nohz_tick_stopped; atomic_t nohz_flags; + unsigned int has_blocked_load; + unsigned long last_blocked_load_update_tick; + call_single_data_t nohz_csd; #endif /* CONFIG_NO_HZ_COMMON */ - unsigned int ttwu_pending; - u64 nr_switches; - #ifdef CONFIG_UCLAMP_TASK /* Utilization clamp values based on CPU's RUNNABLE tasks */ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; @@ -1151,6 +1175,7 @@ struct rq { struct dl_rq dl; #ifdef CONFIG_SCHED_CLASS_EXT struct scx_rq scx; + struct sched_dl_entity ext_server; #endif struct sched_dl_entity fair_server; @@ -1161,6 +1186,9 @@ struct rq { struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_NUMA_BALANCING + unsigned int numa_migrate_on; +#endif /* * This is part of a global counter where only the total sum * over all CPUs matters. A task can increase this counter on @@ -1169,36 +1197,29 @@ struct rq { */ unsigned long nr_uninterruptible; -#ifdef CONFIG_SCHED_PROXY_EXEC - struct task_struct __rcu *donor; /* Scheduling context */ - struct task_struct __rcu *curr; /* Execution context */ -#else - union { - struct task_struct __rcu *donor; /* Scheduler context */ - struct task_struct __rcu *curr; /* Execution context */ - }; -#endif struct sched_dl_entity *dl_server; - struct task_struct *idle; struct task_struct *stop; + const struct sched_class *next_class; unsigned long next_balance; struct mm_struct *prev_mm; - unsigned int clock_update_flags; - u64 clock; - /* Ensure that all clocks are in the same cache line */ + /* + * The following fields of clock data are frequently referenced + * and updated together, and should go on their own cache line. 
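The struct rq reshuffle above groups the fields that are read locklessly in the hot update_sg_lb_stats() loop onto their own cache line and moves the frequently written lock and nr_switches onto the next one, so remote readers are not invalidated by every lock operation on the owning CPU. A stand-alone sketch of the same layout idea (the 64-byte line size and field names are illustrative):

#include <pthread.h>
#include <stdalign.h>
#include <stddef.h>
#include <stdint.h>

#define CACHELINE       64      /* assumed line size, for illustration */

struct runqueue_like {
        /* Read-mostly statistics, polled locklessly by other CPUs. */
        alignas(CACHELINE) uint32_t nr_running;
        uint32_t ttwu_pending;
        unsigned long cpu_capacity;

        /* Frequently written state starts on its own line. */
        alignas(CACHELINE) uint64_t nr_switches;
        pthread_mutex_t lock;
};

int main(void)
{
        /* The read-mostly group and the lock never share a line. */
        _Static_assert(alignof(struct runqueue_like) == CACHELINE,
                       "expected cacheline alignment");
        _Static_assert(offsetof(struct runqueue_like, nr_switches) == CACHELINE,
                       "hot lock group starts on the second line");
        return 0;
}

The kernel version uses ____cacheline_aligned for the same effect and adds __no_randomize_layout so structure layout randomization cannot undo the grouping.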
+ */ u64 clock_task ____cacheline_aligned; u64 clock_pelt; + u64 clock; unsigned long lost_idle_time; + unsigned int clock_update_flags; u64 clock_pelt_idle; u64 clock_idle; + #ifndef CONFIG_64BIT u64 clock_pelt_idle_copy; u64 clock_idle_copy; #endif - atomic_t nr_iowait; - u64 last_seen_need_resched_ns; int ticks_without_resched; @@ -1209,8 +1230,6 @@ struct rq { struct root_domain *rd; struct sched_domain __rcu *sd; - unsigned long cpu_capacity; - struct balance_callback *balance_callback; unsigned char nohz_idle_balance; @@ -1320,7 +1339,9 @@ struct rq { call_single_data_t cfsb_csd; struct list_head cfsb_csd_list; #endif -}; + + atomic_t nr_iowait; +} __no_randomize_layout; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1358,8 +1379,13 @@ static inline u32 sched_rng(void) return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); } +static __always_inline struct rq *__this_rq(void) +{ + return this_cpu_ptr(&runqueues); +} + #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() this_cpu_ptr(&runqueues) +#define this_rq() __this_rq() #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) @@ -1426,6 +1452,7 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq) } static inline raw_spinlock_t *__rq_lockp(struct rq *rq) + __returns_ctx_lock(rq_lockp(rq)) /* alias them */ { if (rq->core_enabled) return &rq->core->__lock; @@ -1525,6 +1552,7 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq) } static inline raw_spinlock_t *__rq_lockp(struct rq *rq) + __returns_ctx_lock(rq_lockp(rq)) /* alias them */ { return &rq->__lock; } @@ -1567,32 +1595,42 @@ static inline bool rt_group_sched_enabled(void) #endif /* !CONFIG_RT_GROUP_SCHED */ static inline void lockdep_assert_rq_held(struct rq *rq) + __assumes_ctx_lock(__rq_lockp(rq)) { lockdep_assert_held(__rq_lockp(rq)); } -extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); -extern bool raw_spin_rq_trylock(struct rq *rq); -extern void raw_spin_rq_unlock(struct rq *rq); +extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass) + __acquires(__rq_lockp(rq)); + +extern bool raw_spin_rq_trylock(struct rq *rq) + __cond_acquires(true, __rq_lockp(rq)); + +extern void raw_spin_rq_unlock(struct rq *rq) + __releases(__rq_lockp(rq)); static inline void raw_spin_rq_lock(struct rq *rq) + __acquires(__rq_lockp(rq)) { raw_spin_rq_lock_nested(rq, 0); } static inline void raw_spin_rq_lock_irq(struct rq *rq) + __acquires(__rq_lockp(rq)) { local_irq_disable(); raw_spin_rq_lock(rq); } static inline void raw_spin_rq_unlock_irq(struct rq *rq) + __releases(__rq_lockp(rq)) { raw_spin_rq_unlock(rq); local_irq_enable(); } static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) + __acquires(__rq_lockp(rq)) { unsigned long flags; @@ -1603,6 +1641,7 @@ static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) } static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags) + __releases(__rq_lockp(rq)) { raw_spin_rq_unlock(rq); local_irq_restore(flags); @@ -1676,6 +1715,7 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) #endif /* !CONFIG_FAIR_GROUP_SCHED */ +extern void update_rq_avg_idle(struct rq *rq); extern void update_rq_clock(struct rq *rq); /* @@ -1851,18 +1891,16 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags |= rf->clock_update_flags; } -extern -struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(rq->lock); +#define __task_rq_lock(...) 
__acquire_ret(___task_rq_lock(__VA_ARGS__), __rq_lockp(__ret)) +extern struct rq *___task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires_ret; -extern -struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(p->pi_lock) - __acquires(rq->lock); +#define task_rq_lock(...) __acquire_ret(_task_rq_lock(__VA_ARGS__), __rq_lockp(__ret)) +extern struct rq *_task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(&p->pi_lock) __acquires_ret; static inline void __task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - __releases(rq->lock) + __releases(__rq_lockp(rq)) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock(rq); @@ -1870,8 +1908,7 @@ __task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - __releases(rq->lock) - __releases(p->pi_lock) + __releases(__rq_lockp(rq), &p->pi_lock) { __task_rq_unlock(rq, p, rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); @@ -1881,6 +1918,8 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, _T->rq = task_rq_lock(_T->lock, &_T->rf), task_rq_unlock(_T->rq, _T->lock, &_T->rf), struct rq *rq; struct rq_flags rf) +DECLARE_LOCK_GUARD_1_ATTRS(task_rq_lock, __acquires(_T->pi_lock), __releases((*(struct task_struct **)_T)->pi_lock)) +#define class_task_rq_lock_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(task_rq_lock, _T) DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct, _T->rq = __task_rq_lock(_T->lock, &_T->rf), @@ -1888,42 +1927,42 @@ DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct, struct rq *rq; struct rq_flags rf) static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) + __acquires(__rq_lockp(rq)) { raw_spin_rq_lock_irqsave(rq, rf->flags); rq_pin_lock(rq, rf); } static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) + __acquires(__rq_lockp(rq)) { raw_spin_rq_lock_irq(rq); rq_pin_lock(rq, rf); } static inline void rq_lock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) + __acquires(__rq_lockp(rq)) { raw_spin_rq_lock(rq); rq_pin_lock(rq, rf); } static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) + __releases(__rq_lockp(rq)) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock_irqrestore(rq, rf->flags); } static inline void rq_unlock_irq(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) + __releases(__rq_lockp(rq)) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock_irq(rq); } static inline void rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) + __releases(__rq_lockp(rq)) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock(rq); @@ -1934,18 +1973,27 @@ DEFINE_LOCK_GUARD_1(rq_lock, struct rq, rq_unlock(_T->lock, &_T->rf), struct rq_flags rf) +DECLARE_LOCK_GUARD_1_ATTRS(rq_lock, __acquires(__rq_lockp(_T)), __releases(__rq_lockp(*(struct rq **)_T))); +#define class_rq_lock_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rq_lock, _T) + DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq, rq_lock_irq(_T->lock, &_T->rf), rq_unlock_irq(_T->lock, &_T->rf), struct rq_flags rf) +DECLARE_LOCK_GUARD_1_ATTRS(rq_lock_irq, __acquires(__rq_lockp(_T)), __releases(__rq_lockp(*(struct rq **)_T))); +#define class_rq_lock_irq_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rq_lock_irq, _T) + DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq, rq_lock_irqsave(_T->lock, &_T->rf), rq_unlock_irqrestore(_T->lock, &_T->rf), struct rq_flags rf) -static inline struct rq *this_rq_lock_irq(struct 
rq_flags *rf) - __acquires(rq->lock) +DECLARE_LOCK_GUARD_1_ATTRS(rq_lock_irqsave, __acquires(__rq_lockp(_T)), __releases(__rq_lockp(*(struct rq **)_T))); +#define class_rq_lock_irqsave_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(rq_lock_irqsave, _T) + +#define this_rq_lock_irq(...) __acquire_ret(_this_rq_lock_irq(__VA_ARGS__), __rq_lockp(__ret)) +static inline struct rq *_this_rq_lock_irq(struct rq_flags *rf) __acquires_ret { struct rq *rq; @@ -2032,8 +2080,8 @@ queue_balance_callback(struct rq *rq, rq->balance_callback = head; } -#define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex)) +#define rcu_dereference_sched_domain(p) \ + rcu_dereference_all_check((p), lockdep_is_held(&sched_domains_mutex)) /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -2043,7 +2091,7 @@ queue_balance_callback(struct rq *rq, * preempt-disabled sections. */ #define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + for (__sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) /* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ @@ -2451,15 +2499,6 @@ struct sched_class { #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif - /* - * idle: 0 - * ext: 1 - * fair: 2 - * rt: 4 - * dl: 8 - * stop: 16 - */ - unsigned int queue_mask; /* * move_queued_task/activate_task/enqueue_task: rq->lock @@ -2618,20 +2657,6 @@ struct sched_class { #endif }; -/* - * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. - */ -static inline void rq_modified_clear(struct rq *rq) -{ - rq->queue_mask = 0; -} - -static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) -{ - unsigned int mask = class->queue_mask; - return rq->queue_mask & ~((mask << 1) - 1); -} - static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->donor != prev); @@ -3073,8 +3098,20 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) 
\ __DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ + __no_context_analysis \ { class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ _lock; return _t; } +#define DECLARE_LOCK_GUARD_2_ATTRS(_name, _lock, _unlock1, _unlock2) \ +static inline class_##_name##_t class_##_name##_constructor(lock_##_name##_t *_T1, \ + lock_##_name##_t *_T2) _lock; \ +static __always_inline void __class_##_name##_cleanup_ctx1(class_##_name##_t **_T1) \ + __no_context_analysis _unlock1 { } \ +static __always_inline void __class_##_name##_cleanup_ctx2(class_##_name##_t **_T2) \ + __no_context_analysis _unlock2 { } +#define WITH_LOCK_GUARD_2_ATTRS(_name, _T1, _T2) \ + class_##_name##_constructor(_T1, _T2), \ + *__UNIQUE_ID(unlock1) __cleanup(__class_##_name##_cleanup_ctx1) = (void *)(_T1),\ + *__UNIQUE_ID(unlock2) __cleanup(__class_##_name##_cleanup_ctx2) = (void *)(_T2) static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) { @@ -3102,7 +3139,8 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) return rq1->cpu < rq2->cpu; } -extern void double_rq_lock(struct rq *rq1, struct rq *rq2); +extern void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(__rq_lockp(rq1), __rq_lockp(rq2)); #ifdef CONFIG_PREEMPTION @@ -3115,9 +3153,8 @@ extern void double_rq_lock(struct rq *rq1, struct rq *rq2); * also adds more overhead and therefore may reduce throughput. */ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) + __must_hold(__rq_lockp(this_rq)) + __acquires(__rq_lockp(busiest)) { raw_spin_rq_unlock(this_rq); double_rq_lock(this_rq, busiest); @@ -3134,12 +3171,16 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) * regardless of entry order into the function. */ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) + __must_hold(__rq_lockp(this_rq)) + __acquires(__rq_lockp(busiest)) { - if (__rq_lockp(this_rq) == __rq_lockp(busiest) || - likely(raw_spin_rq_trylock(busiest))) { + if (__rq_lockp(this_rq) == __rq_lockp(busiest)) { + __acquire(__rq_lockp(busiest)); /* already held */ + double_rq_clock_clear_update(this_rq, busiest); + return 0; + } + + if (likely(raw_spin_rq_trylock(busiest))) { double_rq_clock_clear_update(this_rq, busiest); return 0; } @@ -3162,6 +3203,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
*/ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) + __must_hold(__rq_lockp(this_rq)) + __acquires(__rq_lockp(busiest)) { lockdep_assert_irqs_disabled(); @@ -3169,14 +3212,17 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) } static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) + __releases(__rq_lockp(busiest)) { if (__rq_lockp(this_rq) != __rq_lockp(busiest)) raw_spin_rq_unlock(busiest); + else + __release(__rq_lockp(busiest)); /* fake release */ lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_); } static inline void double_lock(spinlock_t *l1, spinlock_t *l2) + __acquires(l1, l2) { if (l1 > l2) swap(l1, l2); @@ -3186,6 +3232,7 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2) } static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) + __acquires(l1, l2) { if (l1 > l2) swap(l1, l2); @@ -3195,6 +3242,7 @@ static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) } static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) + __acquires(l1, l2) { if (l1 > l2) swap(l1, l2); @@ -3204,6 +3252,7 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) } static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2) + __releases(l1, l2) { raw_spin_unlock(l1); raw_spin_unlock(l2); @@ -3213,6 +3262,13 @@ DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t, double_raw_lock(_T->lock, _T->lock2), double_raw_unlock(_T->lock, _T->lock2)) +DECLARE_LOCK_GUARD_2_ATTRS(double_raw_spinlock, + __acquires(_T1, _T2), + __releases(*(raw_spinlock_t **)_T1), + __releases(*(raw_spinlock_t **)_T2)); +#define class_double_raw_spinlock_constructor(_T1, _T2) \ + WITH_LOCK_GUARD_2_ATTRS(double_raw_spinlock, _T1, _T2) + /* * double_rq_unlock - safely unlock two runqueues * @@ -3220,13 +3276,12 @@ DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t, * you need to do so manually after calling. 
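The double_lock()/double_raw_lock() helpers above all rely on the same ordering rule: always take the lower-addressed lock first, so two CPUs locking the same pair can never deadlock against each other. The __release()/__acquire() calls marked "fake release" exist only to keep the lock annotations balanced when both runqueues share one underlying lock (core scheduling). A minimal sketch of the ordering rule, mirroring double_raw_lock():

/* Illustrative copy of the pattern; not a new helper. */
static inline void ordered_double_lock(raw_spinlock_t *a, raw_spinlock_t *b)
{
	if (a > b)
		swap(a, b);
	raw_spin_lock(a);
	raw_spin_lock_nested(b, SINGLE_DEPTH_NESTING);
}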
*/ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) + __releases(__rq_lockp(rq1), __rq_lockp(rq2)) { if (__rq_lockp(rq1) != __rq_lockp(rq2)) raw_spin_rq_unlock(rq2); else - __release(rq2->lock); + __release(__rq_lockp(rq2)); /* fake release */ raw_spin_rq_unlock(rq1); } @@ -3337,11 +3392,11 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -extern int sched_clock_irqtime; +DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime); static inline int irqtime_enabled(void) { - return sched_clock_irqtime; + return static_branch_likely(&sched_clock_irqtime); } /* @@ -3758,8 +3813,10 @@ static __always_inline void mm_unset_cid_on_task(struct task_struct *t) static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp) { /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */ - pcp->cid = cpu_cid_to_cid(pcp->cid); - mm_drop_cid(mm, pcp->cid); + if (cid_on_cpu(pcp->cid)) { + pcp->cid = cpu_cid_to_cid(pcp->cid); + mm_drop_cid(mm, pcp->cid); + } } static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids) @@ -3816,7 +3873,8 @@ static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigne __this_cpu_write(mm->mm_cid.pcpu->cid, cid); } -static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid) +static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid, + unsigned int mode) { unsigned int max_cids, tcid = t->mm_cid.cid; struct mm_struct *mm = t->mm; @@ -3841,12 +3899,17 @@ static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int /* Still nothing, allocate a new one */ if (!cid_on_cpu(cpu_cid)) cpu_cid = cid_to_cpu_cid(mm_get_cid(mm)); + + /* Handle the transition mode flag if required */ + if (mode & MM_CID_TRANSIT) + cpu_cid = cpu_cid_to_cid(cpu_cid) | MM_CID_TRANSIT; } mm_cid_update_pcpu_cid(mm, cpu_cid); mm_cid_update_task_cid(t, cpu_cid); } -static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid) +static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid, + unsigned int mode) { unsigned int max_cids, tcid = t->mm_cid.cid; struct mm_struct *mm = t->mm; @@ -3872,7 +3935,7 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int if (!cid_on_task(tcid)) tcid = mm_get_cid(mm); /* Set the transition mode flag if required */ - tcid |= READ_ONCE(mm->mm_cid.transit); + tcid |= mode & MM_CID_TRANSIT; } mm_cid_update_pcpu_cid(mm, tcid); mm_cid_update_task_cid(t, tcid); @@ -3881,26 +3944,46 @@ static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int static __always_inline void mm_cid_schedin(struct task_struct *next) { struct mm_struct *mm = next->mm; - unsigned int cpu_cid; + unsigned int cpu_cid, mode; if (!next->mm_cid.active) return; cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid); - if (likely(!READ_ONCE(mm->mm_cid.percpu))) - mm_cid_from_task(next, cpu_cid); + mode = READ_ONCE(mm->mm_cid.mode); + if (likely(!cid_on_cpu(mode))) + mm_cid_from_task(next, cpu_cid, mode); else - mm_cid_from_cpu(next, cpu_cid); + mm_cid_from_cpu(next, cpu_cid, mode); } static __always_inline void mm_cid_schedout(struct task_struct *prev) { + struct mm_struct *mm = prev->mm; + unsigned int mode, cid; + /* During mode transitions CIDs are temporary and need to be dropped */ if (likely(!cid_in_transit(prev->mm_cid.cid))) return; - mm_drop_cid(prev->mm, 
cid_from_transit_cid(prev->mm_cid.cid)); - prev->mm_cid.cid = MM_CID_UNSET; + mode = READ_ONCE(mm->mm_cid.mode); + cid = cid_from_transit_cid(prev->mm_cid.cid); + + /* + * If transition mode is done, transfer ownership when the CID is + * within the convergence range to optimize the next schedule in. + */ + if (!cid_in_transit(mode) && cid < READ_ONCE(mm->mm_cid.max_cids)) { + if (cid_on_cpu(mode)) + cid = cid_to_cpu_cid(cid); + + /* Update both so that the next schedule in goes into the fast path */ + mm_cid_update_pcpu_cid(mm, cid); + prev->mm_cid.cid = cid; + } else { + mm_drop_cid(mm, cid); + prev->mm_cid.cid = MM_CID_UNSET; + } } static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) @@ -3924,6 +4007,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s deactivate_task(src_rq, task, 0); set_task_cpu(task, dst_rq->cpu); activate_task(dst_rq, task, 0); + wakeup_preempt(dst_rq, task, 0); } static inline @@ -3993,6 +4077,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head); struct sched_change_ctx { u64 prio; struct task_struct *p; + const struct sched_class *class; int flags; bool queued; bool running; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c903f1a42891..a612cf253c87 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -253,8 +253,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) delta = rq_clock(rq) - t->sched_info.last_queued; t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; - if (delta > t->sched_info.max_run_delay) + if (delta > t->sched_info.max_run_delay) { t->sched_info.max_run_delay = delta; + ktime_get_real_ts64(&t->sched_info.max_run_delay_ts); + } if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) t->sched_info.min_run_delay = delta; rq_sched_info_dequeue(rq, delta); @@ -278,8 +280,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; - if (delta > t->sched_info.max_run_delay) + if (delta > t->sched_info.max_run_delay) { t->sched_info.max_run_delay = delta; + ktime_get_real_ts64(&t->sched_info.max_run_delay_ts); + } if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) t->sched_info.min_run_delay = delta; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4f9192be4b5b..f95798baddeb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq) * Simple, special scheduling class for the per-CPU stop tasks: */ DEFINE_SCHED_CLASS(stop) = { - - .queue_mask = 16, - .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index cf643a5ddedd..ac268da91778 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -508,6 +508,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (rq->fair_server.dl_server) __dl_server_attach_root(&rq->fair_server, rq); +#ifdef CONFIG_SCHED_CLASS_EXT + if (rq->ext_server.dl_server) + __dl_server_attach_root(&rq->ext_server, rq); +#endif + rq_unlock_irqrestore(rq, &rf); if (old_rd) diff --git a/kernel/signal.c b/kernel/signal.c index e42b8bd6922f..d65d0fe24bfb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1355,8 +1355,8 @@ int zap_other_threads(struct task_struct *p) return count; } 
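The stats.h hunks further down in this block add max_run_delay_ts: when a task sees a new worst-case run delay, the wall-clock time of that event is recorded alongside it so the outlier can be correlated with other logs. The generic form of that bookkeeping, with illustrative names:

/* Sketch of the "max plus timestamp" pattern used in sched_info_*(). */
struct max_sample {
	u64			max_ns;
	struct timespec64	when;
};

static inline void max_sample_update(struct max_sample *s, u64 delta_ns)
{
	if (delta_ns > s->max_ns) {
		s->max_ns = delta_ns;
		ktime_get_real_ts64(&s->when);
	}
}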
-struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, - unsigned long *flags) +struct sighand_struct *lock_task_sighand(struct task_struct *tsk, + unsigned long *flags) { struct sighand_struct *sighand; diff --git a/kernel/sys.c b/kernel/sys.c index 8b58eece4e58..35ea9d79a42e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -31,7 +31,6 @@ #include <linux/tty.h> #include <linux/signal.h> #include <linux/cn_proc.h> -#include <linux/getcpu.h> #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> #include <linux/cpu.h> @@ -53,6 +52,7 @@ #include <linux/time_namespace.h> #include <linux/binfmts.h> #include <linux/futex.h> +#include <linux/rseq.h> #include <linux/sched.h> #include <linux/sched/autogroup.h> @@ -2868,6 +2868,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_FUTEX_HASH: error = futex_hash_prctl(arg2, arg3, arg4); break; + case PR_RSEQ_SLICE_EXTENSION: + if (arg4 || arg5) + return -EINVAL; + error = rseq_slice_extension_prctl(arg2, arg3); + break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; @@ -2876,8 +2881,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return error; } -SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, - struct getcpu_cache __user *, unused) +SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, void __user *, unused) { int err = 0; int cpu = raw_smp_processor_id(); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bf5d05c635ff..add3032da16f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -390,6 +390,7 @@ COND_SYSCALL(setuid16); /* restartable sequence */ COND_SYSCALL(rseq); +COND_SYSCALL(rseq_slice_yield); COND_SYSCALL(uretprobe); COND_SYSCALL(uprobe); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index a1890a073196..df7194961658 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -252,7 +252,7 @@ enum wd_read_status { static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) { - int64_t md = 2 * watchdog->uncertainty_margin; + int64_t md = watchdog->uncertainty_margin; unsigned int nretries, max_retries; int64_t wd_delay, wd_seq_delay; u64 wd_end, wd_end2; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0e4bc1ca15ff..860af7a58428 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -50,6 +50,14 @@ #include "tick-internal.h" /* + * The resolution of the clocks. The resolution value is returned in + * the clock_getres() system call to give application programmers an + * idea of the (in)accuracy of timers. Timer values are rounded up to + * this resolution values. 
+ */ +#define HIGH_RES_NSEC 1 + +/* * Masks for selecting the soft and hard context timers from * cpu_base->active */ @@ -806,7 +814,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) struct hrtimer_clock_base *base = timer->base; ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); + WARN_ON_ONCE(hrtimer_get_expires(timer) < 0); /* * CLOCK_REALTIME timer might be requested with an absolute @@ -943,7 +951,7 @@ void clock_was_set(unsigned int bases) cpumask_var_t mask; int cpu; - if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) + if (!hrtimer_hres_active(cpu_base) && !tick_nohz_is_active()) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -1053,7 +1061,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); - if (hrtimer_get_expires_tv64(timer) > now) + if (hrtimer_get_expires(timer) > now) return orun; /* * This (and the ktime_add() below) is the @@ -1742,7 +1750,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, lockdep_assert_held(&cpu_base->lock); - debug_deactivate(timer); + debug_hrtimer_deactivate(timer); base->running = timer; /* @@ -1835,7 +1843,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ - if (basenow < hrtimer_get_softexpires_tv64(timer)) + if (basenow < hrtimer_get_softexpires(timer)) break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 80a8a09a21a0..413e2389f0a5 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -66,14 +66,7 @@ static const struct k_clock clock_realtime, clock_monotonic; #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 
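The hrtimer hunks above touch hrtimer_forward(), which pushes a timer's expiry forward in whole multiples of the interval and returns how many periods were skipped. A typical (generic, not from this patch) periodic callback using the hrtimer_forward_now() wrapper:

static enum hrtimer_restart periodic_cb(struct hrtimer *t)
{
	/* advance the expiry past "now" in whole 10ms periods */
	u64 periods = hrtimer_forward_now(t, ms_to_ktime(10));

	if (periods > 1)
		pr_debug("hrtimer overran %llu period(s)\n", periods - 1);

	return HRTIMER_RESTART;
}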
#endif -static struct k_itimer *__lock_timer(timer_t timer_id); - -#define lock_timer(tid) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \ - __timr; \ -}) - +static struct k_itimer *lock_timer(timer_t timer_id); static inline void unlock_timer(struct k_itimer *timr) { if (likely((timr))) @@ -85,7 +78,7 @@ static inline void unlock_timer(struct k_itimer *timr) #define scoped_timer (scope) -DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id); +DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), lock_timer(id), timer_t id); DEFINE_CLASS_IS_COND_GUARD(lock_timer); static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) @@ -600,7 +593,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -static struct k_itimer *__lock_timer(timer_t timer_id) +static struct k_itimer *lock_timer(timer_t timer_id) { struct k_itimer *timr; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index f39111830ca3..f3aaef695b8c 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -215,7 +215,7 @@ void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) update_clock_read_data(&rd); - if (sched_clock_timer.function != NULL) { + if (ACCESS_PRIVATE(&sched_clock_timer, function) != NULL) { /* update timeout for clock wrap */ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 4e4f7bbe2a64..597d816d22e8 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -156,7 +156,6 @@ static inline void tick_nohz_init(void) { } #endif #ifdef CONFIG_NO_HZ_COMMON -extern unsigned long tick_nohz_active; extern void timers_update_nohz(void); extern u64 get_jiffies_update(unsigned long *basej); # ifdef CONFIG_SMP @@ -171,7 +170,6 @@ extern void timer_expire_remote(unsigned int cpu); # endif #else /* CONFIG_NO_HZ_COMMON */ static inline void timers_update_nohz(void) { } -#define tick_nohz_active (0) #endif DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2f8a7923fa27..f7907fadd63f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -344,6 +344,9 @@ static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); + if (likely(!tracepoint_enabled(tick_stop))) + return !val; + if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; @@ -693,7 +696,7 @@ void __init tick_nohz_init(void) * NO HZ enabled ? */ bool tick_nohz_enabled __read_mostly = true; -unsigned long tick_nohz_active __read_mostly; +static unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -704,6 +707,12 @@ static int __init setup_tick_nohz(char *str) __setup("nohz=", setup_tick_nohz); +bool tick_nohz_is_active(void) +{ + return tick_nohz_active; +} +EXPORT_SYMBOL_GPL(tick_nohz_is_active); + bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c index 2889763165e5..1b99180da288 100644 --- a/kernel/time/time_test.c +++ b/kernel/time/time_test.c @@ -4,7 +4,9 @@ #include <linux/time.h> /* - * Traditional implementation of leap year evaluation. 
+ * Traditional implementation of leap year evaluation, but note that long + * is a signed type and the tests do cover negative year values. So this + * can't use the is_leap_year() helper from rtc.h. */ static bool is_leap(long year) { diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index 3d2a354cfe1c..2e64dbb6302d 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -62,38 +62,3 @@ u64 timecounter_read(struct timecounter *tc) } EXPORT_SYMBOL_GPL(timecounter_read); -/* - * This is like cyclecounter_cyc2ns(), but it is used for computing a - * time previous to the time stored in the cycle counter. - */ -static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, - u64 cycles, u64 mask, u64 frac) -{ - u64 ns = (u64) cycles; - - ns = ((ns * cc->mult) - frac) >> cc->shift; - - return ns; -} - -u64 timecounter_cyc2time(const struct timecounter *tc, - u64 cycle_tstamp) -{ - u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec = tc->nsec, frac = tc->frac; - - /* - * Instead of always treating cycle_tstamp as more recent - * than tc->cycle_last, detect when it is too far in the - * future and treat it as old time stamp instead. - */ - if (delta > tc->cc->mask / 2) { - delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); - } else { - nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); - } - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_cyc2time); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3ec3daa4acab..91fa2003351c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2735,7 +2735,7 @@ static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); result->clock_set = true; } else { - tk_update_leap_state_all(&tk_core); + tk_update_leap_state_all(tkd); } /* Update the multiplier immediately if frequency was set directly */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1f2364126894..7e1e3bde6b8b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -281,7 +281,7 @@ DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); static void timers_update_migration(void) { - if (sysctl_timer_migration && tick_nohz_active) + if (sysctl_timer_migration && tick_nohz_is_active()) static_branch_enable(&timers_migration_enabled); else static_branch_disable(&timers_migration_enabled); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 18dda1aa782d..6da9cd562b20 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -466,9 +466,8 @@ static inline bool tmigr_is_isolated(int cpu) { if (!static_branch_unlikely(&tmigr_exclude_isolated)) return false; - return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) || - cpuset_cpu_is_isolated(cpu)) && - housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE); + return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) && + housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)); } /* @@ -1497,7 +1496,7 @@ static int tmigr_clear_cpu_available(unsigned int cpu) return 0; } -static int tmigr_set_cpu_available(unsigned int cpu) +static int __tmigr_set_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); @@ -1505,9 +1504,6 @@ static int tmigr_set_cpu_available(unsigned int cpu) if (WARN_ON_ONCE(!tmc->tmgroup)) return -EINVAL; - if (tmigr_is_isolated(cpu)) - return 0; - guard(mutex)(&tmigr_available_mutex); cpumask_set_cpu(cpu, tmigr_available_cpumask); @@ -1523,6 +1519,14 
@@ static int tmigr_set_cpu_available(unsigned int cpu) return 0; } +static int tmigr_set_cpu_available(unsigned int cpu) +{ + if (tmigr_is_isolated(cpu)) + return 0; + + return __tmigr_set_cpu_available(cpu); +} + static void tmigr_cpu_isolate(struct work_struct *ignored) { tmigr_clear_cpu_available(smp_processor_id()); @@ -1530,7 +1534,12 @@ static void tmigr_cpu_isolate(struct work_struct *ignored) static void tmigr_cpu_unisolate(struct work_struct *ignored) { - tmigr_set_cpu_available(smp_processor_id()); + /* + * Don't call tmigr_is_isolated() ->housekeeping_cpu() directly because + * the cpuset mutex is correctly held by the workqueue caller but lockdep + * doesn't know that. + */ + __tmigr_set_cpu_available(smp_processor_id()); } /** diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index bfa2ec46e075..d7042a09fe46 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -50,6 +50,9 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS bool +config HAVE_SINGLE_FTRACE_DIRECT_OPS + bool + config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS bool diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d031c8d80be4..c4db5c2e7103 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -793,7 +793,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, return PTR_ERR(bt); } blk_trace_setup_finalize(q, name, 1, bt, &buts2); - strcpy(buts.name, buts2.name); + strscpy(buts.name, buts2.name, BLKTRACE_BDEV_SIZE); mutex_unlock(&q->debugfs_mutex); if (copy_to_user(arg, &buts, sizeof(buts))) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fe28d86f7c35..f7baeb8278ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -830,7 +830,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; - info.si_value.sival_ptr = (void *)(unsigned long)value; + info.si_value.sival_ptr = (void __user __force *)(unsigned long)value; siginfo = &info; } @@ -1022,7 +1022,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .func = bpf_snprintf_btf, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, @@ -1194,7 +1194,7 @@ const struct bpf_func_proto bpf_get_branch_snapshot_proto = { BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value) { /* This helper call is inlined by verifier. */ - u64 nr_args = ((u64 *)ctx)[-1]; + u64 nr_args = ((u64 *)ctx)[-1] & 0xFF; if ((u64) n >= nr_args) return -EINVAL; @@ -1214,7 +1214,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = { BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) { /* This helper call is inlined by verifier. */ - u64 nr_args = ((u64 *)ctx)[-1]; + u64 nr_args = ((u64 *)ctx)[-1] & 0xFF; *value = ((u64 *)ctx)[nr_args]; return 0; @@ -1231,7 +1231,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = { BPF_CALL_1(get_func_arg_cnt, void *, ctx) { /* This helper call is inlined by verifier. 
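The blktrace hunk below this point swaps strcpy() for strscpy(), which bounds the copy to the destination size and always NUL-terminates, returning -E2BIG when the source had to be truncated. A minimal, self-contained illustration of that pattern (function and message are illustrative):

static void copy_trace_name(char *dst, size_t dst_sz, const char *src)
{
	/* strscpy() never overruns dst and always NUL-terminates */
	if (strscpy(dst, src, dst_sz) < 0)
		pr_warn("trace name truncated\n");
}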
*/ - return ((u64 *)ctx)[-1]; + return ((u64 *)ctx)[-1] & 0xFF; } static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { @@ -1286,7 +1286,8 @@ static bool is_kprobe_multi(const struct bpf_prog *prog) static inline bool is_kprobe_session(const struct bpf_prog *prog) { - return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; + return prog->type == BPF_PROG_TYPE_KPROBE && + prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } static inline bool is_uprobe_multi(const struct bpf_prog *prog) @@ -1297,7 +1298,14 @@ static inline bool is_uprobe_multi(const struct bpf_prog *prog) static inline bool is_uprobe_session(const struct bpf_prog *prog) { - return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; + return prog->type == BPF_PROG_TYPE_KPROBE && + prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + +static inline bool is_trace_fsession(const struct bpf_prog *prog) +{ + return prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_FSESSION; } static const struct bpf_func_proto * @@ -1526,7 +1534,7 @@ static const struct bpf_func_proto bpf_read_branch_records_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -1661,7 +1669,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -1734,11 +1742,17 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_d_path: return &bpf_d_path_proto; case BPF_FUNC_get_func_arg: - return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL; + if (bpf_prog_has_trampoline(prog) || + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_func_arg_proto; + return NULL; case BPF_FUNC_get_func_ret: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: - return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; + if (bpf_prog_has_trampoline(prog) || + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_func_arg_cnt_proto; + return NULL; case BPF_FUNC_get_attach_cookie: if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) @@ -2063,7 +2077,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) struct bpf_trace_run_ctx run_ctx; cant_sleep(); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; } @@ -2077,7 +2091,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) bpf_reset_run_ctx(old_run_ctx); out: - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); } #define UNPACK(...) 
__VA_ARGS__ @@ -2564,6 +2578,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); + ftrace_partial_regs_update(fregs, bpf_kprobe_multi_pt_regs_ptr()); rcu_read_unlock(); out: @@ -3316,7 +3331,7 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) __bpf_kfunc_start_defs(); -__bpf_kfunc bool bpf_session_is_return(void) +__bpf_kfunc bool bpf_session_is_return(void *ctx) { struct bpf_session_run_ctx *session_ctx; @@ -3324,7 +3339,7 @@ __bpf_kfunc bool bpf_session_is_return(void) return session_ctx->is_return; } -__bpf_kfunc __u64 *bpf_session_cookie(void) +__bpf_kfunc __u64 *bpf_session_cookie(void *ctx) { struct bpf_session_run_ctx *session_ctx; @@ -3334,34 +3349,39 @@ __bpf_kfunc __u64 *bpf_session_cookie(void) __bpf_kfunc_end_defs(); -BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) +BTF_KFUNCS_START(session_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_session_is_return) BTF_ID_FLAGS(func, bpf_session_cookie) -BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) +BTF_KFUNCS_END(session_kfunc_set_ids) -static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) +static int bpf_session_filter(const struct bpf_prog *prog, u32 kfunc_id) { - if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) + if (!btf_id_set8_contains(&session_kfunc_set_ids, kfunc_id)) return 0; - if (!is_kprobe_session(prog) && !is_uprobe_session(prog)) + if (!is_kprobe_session(prog) && !is_uprobe_session(prog) && !is_trace_fsession(prog)) return -EACCES; return 0; } -static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = { +static const struct btf_kfunc_id_set bpf_session_kfunc_set = { .owner = THIS_MODULE, - .set = &kprobe_multi_kfunc_set_ids, - .filter = bpf_kprobe_multi_filter, + .set = &session_kfunc_set_ids, + .filter = bpf_session_filter, }; -static int __init bpf_kprobe_multi_kfuncs_init(void) +static int __init bpf_trace_kfuncs_init(void) { - return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set); + int err = 0; + + err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_session_kfunc_set); + err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_session_kfunc_set); + + return err; } -late_initcall(bpf_kprobe_multi_kfuncs_init); +late_initcall(bpf_trace_kfuncs_init); typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); @@ -3517,7 +3537,7 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid __bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { - return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } @@ -3531,7 +3551,7 @@ __bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, __bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { - return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } @@ -3545,14 +3565,14 @@ __bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 of __bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void 
__user *unsafe_ptr__ign) { - return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } __bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { - return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } @@ -3560,7 +3580,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { - return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, tsk); } @@ -3568,7 +3588,7 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { - return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, tsk); } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index aa758efc3731..8fb38722fd5c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -68,7 +68,6 @@ }) /* hash bits for specific function selection */ -#define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 #ifdef CONFIG_DYNAMIC_FTRACE @@ -1210,8 +1209,8 @@ static void __add_hash_entry(struct ftrace_hash *hash, hash->count++; } -static struct ftrace_func_entry * -add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +struct ftrace_func_entry * +add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct) { struct ftrace_func_entry *entry; @@ -1220,11 +1219,18 @@ add_hash_entry(struct ftrace_hash *hash, unsigned long ip) return NULL; entry->ip = ip; + entry->direct = direct; __add_hash_entry(hash, entry); return entry; } +static struct ftrace_func_entry * +add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + return add_ftrace_hash_entry_direct(hash, ip, 0); +} + static void free_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) @@ -1283,7 +1289,7 @@ static void clear_ftrace_mod_list(struct list_head *head) mutex_unlock(&ftrace_lock); } -static void free_ftrace_hash(struct ftrace_hash *hash) +void free_ftrace_hash(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) return; @@ -1323,7 +1329,7 @@ void ftrace_free_filter(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(ftrace_free_filter); -static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; int size; @@ -1397,7 +1403,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - if (add_hash_entry(new_hash, entry->ip) == NULL) + if (add_ftrace_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL) goto free_hash; } } @@ -2068,7 +2074,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, */ if (!ops->ops_func) return -EBUSY; - ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); + ret = ops->ops_func(ops, rec->ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); if (ret) 
return ret; } else if (is_ipmodify) { @@ -2624,8 +2630,13 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { - unsigned long addr = READ_ONCE(ops->direct_call); + unsigned long addr; +#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS + addr = ftrace_find_rec_direct(ip); +#else + addr = READ_ONCE(ops->direct_call); +#endif if (!addr) return; @@ -6049,15 +6060,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) if (ftrace_hash_empty(hash)) return -EINVAL; - /* This is a "raw" address, and this should never happen. */ - if (WARN_ON_ONCE(ftrace_is_jmp(addr))) - return -EINVAL; - mutex_lock(&direct_mutex); - if (ops->flags & FTRACE_OPS_FL_JMP) - addr = ftrace_jmp_set(addr); - /* Make sure requested entries are not already registered.. */ size = 1 << hash->size_bits; for (i = 0; i < size; i++) { @@ -6178,13 +6182,6 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) lockdep_assert_held_once(&direct_mutex); - /* This is a "raw" address, and this should never happen. */ - if (WARN_ON_ONCE(ftrace_is_jmp(addr))) - return -EINVAL; - - if (ops->flags & FTRACE_OPS_FL_JMP) - addr = ftrace_jmp_set(addr); - /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; @@ -6289,6 +6286,368 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) return err; } EXPORT_SYMBOL_GPL(modify_ftrace_direct); + +static unsigned long hash_count(struct ftrace_hash *hash) +{ + return hash ? hash->count : 0; +} + +/** + * hash_add - adds two struct ftrace_hash and returns the result + * @a: struct ftrace_hash object + * @b: struct ftrace_hash object + * + * Returns struct ftrace_hash object on success, NULL on error. + */ +static struct ftrace_hash *hash_add(struct ftrace_hash *a, struct ftrace_hash *b) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *add; + int size; + + size = hash_count(a) + hash_count(b); + if (size > 32) + size = 32; + + add = alloc_and_copy_ftrace_hash(fls(size), a); + if (!add) + return NULL; + + size = 1 << b->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &b->buckets[i], hlist) { + if (add_ftrace_hash_entry_direct(add, entry->ip, entry->direct) == NULL) { + free_ftrace_hash(add); + return NULL; + } + } + } + return add; +} + +/** + * update_ftrace_direct_add - Updates @ops by adding direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * + * This is used to add custom direct callers (ip -> addr) to @ops, + * specified in @hash. The @ops will be either registered or updated. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + */ +int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + struct ftrace_hash *old_direct_functions = NULL; + struct ftrace_hash *new_direct_functions; + struct ftrace_hash *old_filter_hash; + struct ftrace_hash *new_filter_hash = NULL; + struct ftrace_func_entry *entry; + int err = -EINVAL; + int size; + bool reg; + + if (!hash_count(hash)) + return -EINVAL; + + mutex_lock(&direct_mutex); + + /* Make sure requested entries are not already registered. 
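The new ftrace direct helpers below (hash_add(), the duplicate check, hash_sub()) all walk an ftrace_hash the same way: 1 << size_bits hlist buckets, each holding ftrace_func_entry objects keyed by ->ip and, for direct ops, carrying ->direct. A sketch of that bucket walk (the function itself is illustrative, not part of the patch):

static void ftrace_hash_walk_sketch(struct ftrace_hash *hash)
{
	struct ftrace_func_entry *entry;
	int size = 1 << hash->size_bits;

	for (int i = 0; i < size; i++) {
		hlist_for_each_entry(entry, &hash->buckets[i], hlist)
			pr_info("ip=%pS direct=%lx\n",
				(void *)entry->ip, entry->direct);
	}
}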
*/ + size = 1 << hash->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (__ftrace_lookup_ip(direct_functions, entry->ip)) + goto out_unlock; + } + } + + old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + + /* If there's nothing in filter_hash we need to register the ops. */ + reg = hash_count(old_filter_hash) == 0; + if (reg) { + if (ops->func || ops->trampoline) + goto out_unlock; + if (ops->flags & FTRACE_OPS_FL_ENABLED) + goto out_unlock; + } + + err = -ENOMEM; + new_filter_hash = hash_add(old_filter_hash, hash); + if (!new_filter_hash) + goto out_unlock; + + new_direct_functions = hash_add(direct_functions, hash); + if (!new_direct_functions) + goto out_unlock; + + old_direct_functions = direct_functions; + rcu_assign_pointer(direct_functions, new_direct_functions); + + if (reg) { + ops->func = call_direct_funcs; + ops->flags |= MULTI_FLAGS; + ops->trampoline = FTRACE_REGS_ADDR; + ops->local_hash.filter_hash = new_filter_hash; + + err = register_ftrace_function_nolock(ops); + if (err) { + /* restore old filter on error */ + ops->local_hash.filter_hash = old_filter_hash; + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; + } else { + new_filter_hash = old_filter_hash; + } + } else { + err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); + /* + * new_filter_hash is dup-ed, so we need to release it anyway, + * old_filter_hash either stays on error or is already released + */ + } + + if (err) { + /* reset direct_functions and free the new one */ + rcu_assign_pointer(direct_functions, old_direct_functions); + old_direct_functions = new_direct_functions; + } + + out_unlock: + mutex_unlock(&direct_mutex); + + if (old_direct_functions && old_direct_functions != EMPTY_HASH) + call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb); + free_ftrace_hash(new_filter_hash); + + return err; +} + +/** + * hash_sub - substracts @b from @a and returns the result + * @a: struct ftrace_hash object + * @b: struct ftrace_hash object + * + * Returns struct ftrace_hash object on success, NULL on error. + */ +static struct ftrace_hash *hash_sub(struct ftrace_hash *a, struct ftrace_hash *b) +{ + struct ftrace_func_entry *entry, *del; + struct ftrace_hash *sub; + int size; + + sub = alloc_and_copy_ftrace_hash(a->size_bits, a); + if (!sub) + return NULL; + + size = 1 << b->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &b->buckets[i], hlist) { + del = __ftrace_lookup_ip(sub, entry->ip); + if (WARN_ON_ONCE(!del)) { + free_ftrace_hash(sub); + return NULL; + } + remove_hash_entry(sub, del); + kfree(del); + } + } + return sub; +} + +/** + * update_ftrace_direct_del - Updates @ops by removing its direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * + * This is used to delete custom direct callers (ip -> addr) in + * @ops specified via @hash. The @ops will be either unregistered + * updated. + * + * Returns: zero on success. 
Non zero on error, which includes: + * -EINVAL - The @hash is empty + * -EINVAL - The @ops is not registered + */ +int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + struct ftrace_hash *old_direct_functions = NULL; + struct ftrace_hash *new_direct_functions; + struct ftrace_hash *new_filter_hash = NULL; + struct ftrace_hash *old_filter_hash; + struct ftrace_func_entry *entry; + struct ftrace_func_entry *del; + unsigned long size; + int err = -EINVAL; + + if (!hash_count(hash)) + return -EINVAL; + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + if (direct_functions == EMPTY_HASH) + return -EINVAL; + + mutex_lock(&direct_mutex); + + old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + + if (!hash_count(old_filter_hash)) + goto out_unlock; + + /* Make sure requested entries are already registered. */ + size = 1 << hash->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + del = __ftrace_lookup_ip(direct_functions, entry->ip); + if (!del || del->direct != entry->direct) + goto out_unlock; + } + } + + err = -ENOMEM; + new_filter_hash = hash_sub(old_filter_hash, hash); + if (!new_filter_hash) + goto out_unlock; + + new_direct_functions = hash_sub(direct_functions, hash); + if (!new_direct_functions) + goto out_unlock; + + /* If there's nothing left, we need to unregister the ops. */ + if (ftrace_hash_empty(new_filter_hash)) { + err = unregister_ftrace_function(ops); + if (!err) { + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; + ftrace_free_filter(ops); + ops->func_hash->filter_hash = NULL; + } + } else { + err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); + /* + * new_filter_hash is dup-ed, so we need to release it anyway, + * old_filter_hash either stays on error or is already released + */ + } + + if (err) { + /* free the new_direct_functions */ + old_direct_functions = new_direct_functions; + } else { + old_direct_functions = direct_functions; + rcu_assign_pointer(direct_functions, new_direct_functions); + } + + out_unlock: + mutex_unlock(&direct_mutex); + + if (old_direct_functions && old_direct_functions != EMPTY_HASH) + call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb); + free_ftrace_hash(new_filter_hash); + + return err; +} + +/** + * update_ftrace_direct_mod - Updates @ops by modifing its direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * @do_direct_lock: If true lock the direct_mutex + * + * This is used to modify custom direct callers (ip -> addr) in + * @ops specified via @hash. + * + * This can be called from within ftrace ops_func callback with + * direct_mutex already locked, in which case @do_direct_lock + * needs to be false. + * + * Returns: zero on success. 
Non zero on error, which includes: + * -EINVAL - The @hash is empty + * -EINVAL - The @ops is not registered + */ +int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock) +{ + struct ftrace_func_entry *entry, *tmp; + static struct ftrace_ops tmp_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB, + }; + struct ftrace_hash *orig_hash; + unsigned long size, i; + int err = -EINVAL; + + if (!hash_count(hash)) + return -EINVAL; + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + if (direct_functions == EMPTY_HASH) + return -EINVAL; + + /* + * We can be called from within ops_func callback with direct_mutex + * already taken. + */ + if (do_direct_lock) + mutex_lock(&direct_mutex); + + orig_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + if (!orig_hash) + goto unlock; + + /* Enable the tmp_ops to have the same functions as the direct ops */ + ftrace_ops_init(&tmp_ops); + tmp_ops.func_hash = ops->func_hash; + + err = register_ftrace_function_nolock(&tmp_ops); + if (err) + goto unlock; + + /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, orig_hash, orig_hash, true); + if (err) + goto out; + + /* + * Now the ftrace_ops_list_func() is called to do the direct callers. + * We can safely change the direct functions attached to each entry. + */ + mutex_lock(&ftrace_lock); + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + tmp = __ftrace_lookup_ip(direct_functions, entry->ip); + if (!tmp) + continue; + tmp->direct = entry->direct; + } + } + + mutex_unlock(&ftrace_lock); + +out: + /* Removing the tmp_ops will add the updated direct callers to the functions */ + unregister_ftrace_function(&tmp_ops); + +unlock: + if (do_direct_lock) + mutex_unlock(&direct_mutex); + return err; +} + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** @@ -7753,7 +8112,8 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) + unsigned long *off, char **modname, + const unsigned char **modbuildid, char *sym) { struct ftrace_mod_map *mod_map; int ret = 0; @@ -7765,6 +8125,8 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, if (ret) { if (modname) *modname = mod_map->mod->name; + if (modbuildid) + *modbuildid = module_buildid(mod_map->mod); break; } } @@ -8709,7 +9071,7 @@ static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) if (!op->ops_func) return -EBUSY; - ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); + ret = op->ops_func(op, ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); if (ret) return ret; } @@ -8756,7 +9118,7 @@ static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) /* The cleanup is optional, ignore any errors */ if (found_op && op->ops_func) - op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); + op->ops_func(op, ip, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); } } mutex_unlock(&direct_mutex); diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c index 5a83b7171432..4b5646a70094 100644 --- a/kernel/trace/rv/monitors/nrp/nrp.c +++ b/kernel/trace/rv/monitors/nrp/nrp.c @@ -6,7 +6,6 @@ 
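Taken together, update_ftrace_direct_add/del/mod() above let a caller manage several ip -> trampoline mappings behind a single ftrace_ops. A hypothetical caller, based only on the signatures shown in these hunks (the bit count passed to alloc_ftrace_hash() is arbitrary, error handling trimmed, and the assumption that the helpers copy the entries they need follows from hash_add() above):

static struct ftrace_ops direct_ops;	/* zero-initialized; the helper fills it in */

static int attach_direct(unsigned long ip1, unsigned long ip2,
			 unsigned long tramp)
{
	struct ftrace_hash *hash = alloc_ftrace_hash(4);
	int err;

	if (!hash)
		return -ENOMEM;

	if (!add_ftrace_hash_entry_direct(hash, ip1, tramp) ||
	    !add_ftrace_hash_entry_direct(hash, ip2, tramp)) {
		free_ftrace_hash(hash);
		return -ENOMEM;
	}

	err = update_ftrace_direct_add(&direct_ops, hash);
	free_ftrace_hash(hash);		/* scratch hash; entries were copied */
	return err;
}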
#include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "nrp" @@ -15,17 +14,16 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_TASK #include "nrp.h" - -static struct rv_monitor rv_nrp; -DECLARE_DA_MON_PER_TASK(nrp, unsigned char); +#include <rv/da_monitor.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/trace/irq_vectors.h> static void handle_vector_irq_entry(void *data, int vector) { - da_handle_event_nrp(current, irq_entry_nrp); + da_handle_event(current, irq_entry_nrp); } static void attach_vector_irq(void) @@ -60,7 +58,7 @@ static void detach_vector_irq(void) { } static void handle_irq_entry(void *data, int irq, struct irqaction *action) { - da_handle_event_nrp(current, irq_entry_nrp); + da_handle_event(current, irq_entry_nrp); } static void handle_sched_need_resched(void *data, struct task_struct *tsk, @@ -72,22 +70,22 @@ static void handle_sched_need_resched(void *data, struct task_struct *tsk, * which may not mirror the system state but makes the monitor simpler, */ if (tif == TIF_NEED_RESCHED) - da_handle_start_event_nrp(tsk, sched_need_resched_nrp); + da_handle_start_event(tsk, sched_need_resched_nrp); } static void handle_schedule_entry(void *data, bool preempt) { if (preempt) - da_handle_event_nrp(current, schedule_entry_preempt_nrp); + da_handle_event(current, schedule_entry_preempt_nrp); else - da_handle_event_nrp(current, schedule_entry_nrp); + da_handle_event(current, schedule_entry_nrp); } static int enable_nrp(void) { int retval; - retval = da_monitor_init_nrp(); + retval = da_monitor_init(); if (retval) return retval; @@ -101,33 +99,33 @@ static int enable_nrp(void) static void disable_nrp(void) { - rv_nrp.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); rv_detach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry); detach_vector_irq(); - da_monitor_destroy_nrp(); + da_monitor_destroy(); } -static struct rv_monitor rv_nrp = { +static struct rv_monitor rv_this = { .name = "nrp", .description = "need resched preempts.", .enable = enable_nrp, .disable = disable_nrp, - .reset = da_monitor_reset_all_nrp, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_nrp(void) { - return rv_register_monitor(&rv_nrp, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_nrp(void) { - rv_unregister_monitor(&rv_nrp); + rv_unregister_monitor(&rv_this); } module_init(register_nrp); diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h index c9f12207cbf6..3270d4c0139f 100644 --- a/kernel/trace/rv/monitors/nrp/nrp.h +++ b/kernel/trace/rv/monitors/nrp/nrp.h @@ -5,22 +5,24 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME nrp + enum states_nrp { - preempt_irq_nrp = 0, + preempt_irq_nrp, any_thread_running_nrp, nested_preempt_nrp, rescheduling_nrp, - state_max_nrp + state_max_nrp, }; #define INVALID_STATE state_max_nrp enum events_nrp { - irq_entry_nrp = 0, + irq_entry_nrp, sched_need_resched_nrp, schedule_entry_nrp, schedule_entry_preempt_nrp, - event_max_nrp + event_max_nrp, }; struct automaton_nrp { @@ -36,38 +38,38 @@ static const struct automaton_nrp automaton_nrp = { "preempt_irq", "any_thread_running", "nested_preempt", - "rescheduling" + "rescheduling", }, .event_names = { "irq_entry", "sched_need_resched", 
"schedule_entry", - "schedule_entry_preempt" + "schedule_entry_preempt", }, .function = { { preempt_irq_nrp, preempt_irq_nrp, nested_preempt_nrp, - nested_preempt_nrp + nested_preempt_nrp, }, { any_thread_running_nrp, rescheduling_nrp, any_thread_running_nrp, - INVALID_STATE + INVALID_STATE, }, { nested_preempt_nrp, preempt_irq_nrp, any_thread_running_nrp, - any_thread_running_nrp + any_thread_running_nrp, }, { preempt_irq_nrp, rescheduling_nrp, any_thread_running_nrp, - any_thread_running_nrp + any_thread_running_nrp, }, }, .initial_state = preempt_irq_nrp, diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c index 50d64e7fb8c4..25a40e90fa40 100644 --- a/kernel/trace/rv/monitors/opid/opid.c +++ b/kernel/trace/rv/monitors/opid/opid.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "opid" @@ -16,17 +15,16 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "opid.h" - -static struct rv_monitor rv_opid; -DECLARE_DA_MON_PER_CPU(opid, unsigned char); +#include <rv/da_monitor.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/trace/irq_vectors.h> static void handle_vector_irq_entry(void *data, int vector) { - da_handle_event_opid(irq_entry_opid); + da_handle_event(irq_entry_opid); } static void attach_vector_irq(void) @@ -61,52 +59,52 @@ static void detach_vector_irq(void) { } static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_opid(irq_disable_opid); + da_handle_event(irq_disable_opid); } static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_opid(irq_enable_opid); + da_handle_event(irq_enable_opid); } static void handle_irq_entry(void *data, int irq, struct irqaction *action) { - da_handle_event_opid(irq_entry_opid); + da_handle_event(irq_entry_opid); } static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_opid(preempt_disable_opid); + da_handle_event(preempt_disable_opid); } static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_opid(preempt_enable_opid); + da_handle_event(preempt_enable_opid); } static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif) { /* The monitor's intitial state is not in_irq */ if (this_cpu_read(hardirq_context)) - da_handle_event_opid(sched_need_resched_opid); + da_handle_event(sched_need_resched_opid); else - da_handle_start_event_opid(sched_need_resched_opid); + da_handle_start_event(sched_need_resched_opid); } static void handle_sched_waking(void *data, struct task_struct *p) { /* The monitor's intitial state is not in_irq */ if (this_cpu_read(hardirq_context)) - da_handle_event_opid(sched_waking_opid); + da_handle_event(sched_waking_opid); else - da_handle_start_event_opid(sched_waking_opid); + da_handle_start_event(sched_waking_opid); } static int enable_opid(void) { int retval; - retval = da_monitor_init_opid(); + retval = da_monitor_init(); if (retval) return retval; @@ -124,7 +122,7 @@ static int enable_opid(void) static void disable_opid(void) { - rv_opid.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("opid", irq_disable, handle_irq_disable); rv_detach_trace_probe("opid", irq_enable, handle_irq_enable); @@ -135,29 +133,29 @@ static void disable_opid(void) rv_detach_trace_probe("opid", sched_waking, handle_sched_waking); 
detach_vector_irq(); - da_monitor_destroy_opid(); + da_monitor_destroy(); } /* * This is the monitor register section. */ -static struct rv_monitor rv_opid = { +static struct rv_monitor rv_this = { .name = "opid", .description = "operations with preemption and irq disabled.", .enable = enable_opid, .disable = disable_opid, - .reset = da_monitor_reset_all_opid, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_opid(void) { - return rv_register_monitor(&rv_opid, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_opid(void) { - rv_unregister_monitor(&rv_opid); + rv_unregister_monitor(&rv_this); } module_init(register_opid); diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h index b4b8c2ff7f64..092992514970 100644 --- a/kernel/trace/rv/monitors/opid/opid.h +++ b/kernel/trace/rv/monitors/opid/opid.h @@ -5,26 +5,28 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME opid + enum states_opid { - disabled_opid = 0, + disabled_opid, enabled_opid, in_irq_opid, irq_disabled_opid, preempt_disabled_opid, - state_max_opid + state_max_opid, }; #define INVALID_STATE state_max_opid enum events_opid { - irq_disable_opid = 0, + irq_disable_opid, irq_enable_opid, irq_entry_opid, preempt_disable_opid, preempt_enable_opid, sched_need_resched_opid, sched_waking_opid, - event_max_opid + event_max_opid, }; struct automaton_opid { @@ -41,7 +43,7 @@ static const struct automaton_opid automaton_opid = { "enabled", "in_irq", "irq_disabled", - "preempt_disabled" + "preempt_disabled", }, .event_names = { "irq_disable", @@ -50,7 +52,7 @@ static const struct automaton_opid automaton_opid = { "preempt_disable", "preempt_enable", "sched_need_resched", - "sched_waking" + "sched_waking", }, .function = { { @@ -60,7 +62,7 @@ static const struct automaton_opid automaton_opid = { INVALID_STATE, irq_disabled_opid, disabled_opid, - disabled_opid + disabled_opid, }, { irq_disabled_opid, @@ -69,7 +71,7 @@ static const struct automaton_opid automaton_opid = { preempt_disabled_opid, enabled_opid, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ -78,7 +80,7 @@ static const struct automaton_opid automaton_opid = { INVALID_STATE, INVALID_STATE, in_irq_opid, - in_irq_opid + in_irq_opid, }, { INVALID_STATE, @@ -87,7 +89,7 @@ static const struct automaton_opid automaton_opid = { disabled_opid, INVALID_STATE, irq_disabled_opid, - INVALID_STATE + INVALID_STATE, }, { disabled_opid, @@ -96,7 +98,7 @@ static const struct automaton_opid automaton_opid = { INVALID_STATE, enabled_opid, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, }, .initial_state = disabled_opid, diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c index fd75fc927d65..17f271231c99 100644 --- a/kernel/trace/rv/monitors/rtapp/rtapp.c +++ b/kernel/trace/rv/monitors/rtapp/rtapp.c @@ -8,8 +8,6 @@ #include "rtapp.h" -struct rv_monitor rv_rtapp; - struct rv_monitor rv_rtapp = { .name = "rtapp", .description = "Collection of monitors for detecting problems with real-time applications", diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c index d04db4b543f9..dd9d96fc6e21 100644 --- a/kernel/trace/rv/monitors/sched/sched.c +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -8,8 +8,6 @@ #include "sched.h" -struct rv_monitor rv_sched; - struct rv_monitor rv_sched = { .name = "sched", .description = "container for several scheduler monitor specifications.", 
diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c index 04c36405e2e3..5a3bd5e16e62 100644 --- a/kernel/trace/rv/monitors/sco/sco.c +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "sco" @@ -14,31 +13,30 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "sco.h" - -static struct rv_monitor rv_sco; -DECLARE_DA_MON_PER_CPU(sco, unsigned char); +#include <rv/da_monitor.h> static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) { - da_handle_start_event_sco(sched_set_state_sco); + da_handle_start_event(sched_set_state_sco); } static void handle_schedule_entry(void *data, bool preempt) { - da_handle_event_sco(schedule_entry_sco); + da_handle_event(schedule_entry_sco); } static void handle_schedule_exit(void *data, bool is_switch) { - da_handle_start_event_sco(schedule_exit_sco); + da_handle_start_event(schedule_exit_sco); } static int enable_sco(void) { int retval; - retval = da_monitor_init_sco(); + retval = da_monitor_init(); if (retval) return retval; @@ -51,32 +49,32 @@ static int enable_sco(void) static void disable_sco(void) { - rv_sco.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); - da_monitor_destroy_sco(); + da_monitor_destroy(); } -static struct rv_monitor rv_sco = { +static struct rv_monitor rv_this = { .name = "sco", .description = "scheduling context operations.", .enable = enable_sco, .disable = disable_sco, - .reset = da_monitor_reset_all_sco, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_sco(void) { - return rv_register_monitor(&rv_sco, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_sco(void) { - rv_unregister_monitor(&rv_sco); + rv_unregister_monitor(&rv_this); } module_init(register_sco); diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h index 7a4c1f2d5ca1..bac3beb51e72 100644 --- a/kernel/trace/rv/monitors/sco/sco.h +++ b/kernel/trace/rv/monitors/sco/sco.h @@ -5,19 +5,21 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME sco + enum states_sco { - thread_context_sco = 0, + thread_context_sco, scheduling_context_sco, - state_max_sco + state_max_sco, }; #define INVALID_STATE state_max_sco enum events_sco { - sched_set_state_sco = 0, + sched_set_state_sco, schedule_entry_sco, schedule_exit_sco, - event_max_sco + event_max_sco, }; struct automaton_sco { @@ -31,12 +33,12 @@ struct automaton_sco { static const struct automaton_sco automaton_sco = { .state_names = { "thread_context", - "scheduling_context" + "scheduling_context", }, .event_names = { "sched_set_state", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { thread_context_sco, scheduling_context_sco, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c index 1e351ba52fee..83b48627dc9f 100644 --- a/kernel/trace/rv/monitors/scpd/scpd.c +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "scpd" @@ -15,36 +14,35 @@ #include <rv_trace.h> 
#include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "scpd.h" - -static struct rv_monitor rv_scpd; -DECLARE_DA_MON_PER_CPU(scpd, unsigned char); +#include <rv/da_monitor.h> static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_scpd(preempt_disable_scpd); + da_handle_event(preempt_disable_scpd); } static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_start_event_scpd(preempt_enable_scpd); + da_handle_start_event(preempt_enable_scpd); } static void handle_schedule_entry(void *data, bool preempt) { - da_handle_event_scpd(schedule_entry_scpd); + da_handle_event(schedule_entry_scpd); } static void handle_schedule_exit(void *data, bool is_switch) { - da_handle_event_scpd(schedule_exit_scpd); + da_handle_event(schedule_exit_scpd); } static int enable_scpd(void) { int retval; - retval = da_monitor_init_scpd(); + retval = da_monitor_init(); if (retval) return retval; @@ -58,33 +56,33 @@ static int enable_scpd(void) static void disable_scpd(void) { - rv_scpd.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable); rv_detach_trace_probe("scpd", preempt_enable, handle_preempt_enable); rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); rv_detach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit); - da_monitor_destroy_scpd(); + da_monitor_destroy(); } -static struct rv_monitor rv_scpd = { +static struct rv_monitor rv_this = { .name = "scpd", .description = "schedule called with preemption disabled.", .enable = enable_scpd, .disable = disable_scpd, - .reset = da_monitor_reset_all_scpd, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_scpd(void) { - return rv_register_monitor(&rv_scpd, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_scpd(void) { - rv_unregister_monitor(&rv_scpd); + rv_unregister_monitor(&rv_this); } module_init(register_scpd); diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h index 295f735a5811..d6329da2671b 100644 --- a/kernel/trace/rv/monitors/scpd/scpd.h +++ b/kernel/trace/rv/monitors/scpd/scpd.h @@ -5,20 +5,22 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME scpd + enum states_scpd { - cant_sched_scpd = 0, + cant_sched_scpd, can_sched_scpd, - state_max_scpd + state_max_scpd, }; #define INVALID_STATE state_max_scpd enum events_scpd { - preempt_disable_scpd = 0, + preempt_disable_scpd, preempt_enable_scpd, schedule_entry_scpd, schedule_exit_scpd, - event_max_scpd + event_max_scpd, }; struct automaton_scpd { @@ -32,13 +34,13 @@ struct automaton_scpd { static const struct automaton_scpd automaton_scpd = { .state_names = { "cant_sched", - "can_sched" + "can_sched", }, .event_names = { "preempt_disable", "preempt_enable", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c index 558950f524a5..b80b73795dec 100644 --- a/kernel/trace/rv/monitors/snep/snep.c +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "snep" @@ -15,36 +14,35 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "snep.h" - 
-static struct rv_monitor rv_snep; -DECLARE_DA_MON_PER_CPU(snep, unsigned char); +#include <rv/da_monitor.h> static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_start_event_snep(preempt_disable_snep); + da_handle_start_event(preempt_disable_snep); } static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_start_event_snep(preempt_enable_snep); + da_handle_start_event(preempt_enable_snep); } static void handle_schedule_entry(void *data, bool preempt) { - da_handle_event_snep(schedule_entry_snep); + da_handle_event(schedule_entry_snep); } static void handle_schedule_exit(void *data, bool is_switch) { - da_handle_start_event_snep(schedule_exit_snep); + da_handle_start_event(schedule_exit_snep); } static int enable_snep(void) { int retval; - retval = da_monitor_init_snep(); + retval = da_monitor_init(); if (retval) return retval; @@ -58,33 +56,33 @@ static int enable_snep(void) static void disable_snep(void) { - rv_snep.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable); rv_detach_trace_probe("snep", preempt_enable, handle_preempt_enable); rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); - da_monitor_destroy_snep(); + da_monitor_destroy(); } -static struct rv_monitor rv_snep = { +static struct rv_monitor rv_this = { .name = "snep", .description = "schedule does not enable preempt.", .enable = enable_snep, .disable = disable_snep, - .reset = da_monitor_reset_all_snep, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_snep(void) { - return rv_register_monitor(&rv_snep, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_snep(void) { - rv_unregister_monitor(&rv_snep); + rv_unregister_monitor(&rv_this); } module_init(register_snep); diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h index 4cd9abb77b7b..357520a5b3d1 100644 --- a/kernel/trace/rv/monitors/snep/snep.h +++ b/kernel/trace/rv/monitors/snep/snep.h @@ -5,20 +5,22 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME snep + enum states_snep { - non_scheduling_context_snep = 0, + non_scheduling_context_snep, scheduling_contex_snep, - state_max_snep + state_max_snep, }; #define INVALID_STATE state_max_snep enum events_snep { - preempt_disable_snep = 0, + preempt_disable_snep, preempt_enable_snep, schedule_entry_snep, schedule_exit_snep, - event_max_snep + event_max_snep, }; struct automaton_snep { @@ -32,26 +34,26 @@ struct automaton_snep { static const struct automaton_snep automaton_snep = { .state_names = { "non_scheduling_context", - "scheduling_contex" + "scheduling_contex", }, .event_names = { "preempt_disable", "preempt_enable", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, INVALID_STATE, INVALID_STATE, - non_scheduling_context_snep + non_scheduling_context_snep, }, }, .initial_state = non_scheduling_context_snep, diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c index 540e686e699f..f168b1a4b12c 100644 --- a/kernel/trace/rv/monitors/snroc/snroc.c +++ b/kernel/trace/rv/monitors/snroc/snroc.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include 
<rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "snroc" @@ -14,14 +13,13 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_TASK #include "snroc.h" - -static struct rv_monitor rv_snroc; -DECLARE_DA_MON_PER_TASK(snroc, unsigned char); +#include <rv/da_monitor.h> static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) { - da_handle_event_snroc(tsk, sched_set_state_snroc); + da_handle_event(tsk, sched_set_state_snroc); } static void handle_sched_switch(void *data, bool preempt, @@ -29,15 +27,15 @@ static void handle_sched_switch(void *data, bool preempt, struct task_struct *next, unsigned int prev_state) { - da_handle_start_event_snroc(prev, sched_switch_out_snroc); - da_handle_event_snroc(next, sched_switch_in_snroc); + da_handle_start_event(prev, sched_switch_out_snroc); + da_handle_event(next, sched_switch_in_snroc); } static int enable_snroc(void) { int retval; - retval = da_monitor_init_snroc(); + retval = da_monitor_init(); if (retval) return retval; @@ -49,31 +47,31 @@ static int enable_snroc(void) static void disable_snroc(void) { - rv_snroc.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch); - da_monitor_destroy_snroc(); + da_monitor_destroy(); } -static struct rv_monitor rv_snroc = { +static struct rv_monitor rv_this = { .name = "snroc", .description = "set non runnable on its own context.", .enable = enable_snroc, .disable = disable_snroc, - .reset = da_monitor_reset_all_snroc, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_snroc(void) { - return rv_register_monitor(&rv_snroc, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_snroc(void) { - rv_unregister_monitor(&rv_snroc); + rv_unregister_monitor(&rv_this); } module_init(register_snroc); diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h index c3650a2b1b10..88b7328ad31a 100644 --- a/kernel/trace/rv/monitors/snroc/snroc.h +++ b/kernel/trace/rv/monitors/snroc/snroc.h @@ -5,19 +5,21 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME snroc + enum states_snroc { - other_context_snroc = 0, + other_context_snroc, own_context_snroc, - state_max_snroc + state_max_snroc, }; #define INVALID_STATE state_max_snroc enum events_snroc { - sched_set_state_snroc = 0, + sched_set_state_snroc, sched_switch_in_snroc, sched_switch_out_snroc, - event_max_snroc + event_max_snroc, }; struct automaton_snroc { @@ -31,12 +33,12 @@ struct automaton_snroc { static const struct automaton_snroc automaton_snroc = { .state_names = { "other_context", - "own_context" + "own_context", }, .event_names = { "sched_set_state", "sched_switch_in", - "sched_switch_out" + "sched_switch_out", }, .function = { { INVALID_STATE, own_context_snroc, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c index 84b8d890d9d4..a91321c890cd 100644 --- a/kernel/trace/rv/monitors/sssw/sssw.c +++ b/kernel/trace/rv/monitors/sssw/sssw.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "sssw" @@ -15,17 +14,16 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_TASK #include "sssw.h" - -static struct rv_monitor rv_sssw; -DECLARE_DA_MON_PER_TASK(sssw, 
unsigned char); +#include <rv/da_monitor.h> static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) { if (state == TASK_RUNNING) - da_handle_start_event_sssw(tsk, sched_set_state_runnable_sssw); + da_handle_start_event(tsk, sched_set_state_runnable_sssw); else - da_handle_event_sssw(tsk, sched_set_state_sleepable_sssw); + da_handle_event(tsk, sched_set_state_sleepable_sssw); } static void handle_sched_switch(void *data, bool preempt, @@ -34,15 +32,15 @@ static void handle_sched_switch(void *data, bool preempt, unsigned int prev_state) { if (preempt) - da_handle_event_sssw(prev, sched_switch_preempt_sssw); + da_handle_event(prev, sched_switch_preempt_sssw); else if (prev_state == TASK_RUNNING) - da_handle_event_sssw(prev, sched_switch_yield_sssw); + da_handle_event(prev, sched_switch_yield_sssw); else if (prev_state == TASK_RTLOCK_WAIT) /* special case of sleeping task with racy conditions */ - da_handle_event_sssw(prev, sched_switch_blocking_sssw); + da_handle_event(prev, sched_switch_blocking_sssw); else - da_handle_event_sssw(prev, sched_switch_suspend_sssw); - da_handle_event_sssw(next, sched_switch_in_sssw); + da_handle_event(prev, sched_switch_suspend_sssw); + da_handle_event(next, sched_switch_in_sssw); } static void handle_sched_wakeup(void *data, struct task_struct *p) @@ -51,21 +49,21 @@ static void handle_sched_wakeup(void *data, struct task_struct *p) * Wakeup can also lead to signal_wakeup although the system is * actually runnable. The monitor can safely start with this event. */ - da_handle_start_event_sssw(p, sched_wakeup_sssw); + da_handle_start_event(p, sched_wakeup_sssw); } static void handle_signal_deliver(void *data, int sig, struct kernel_siginfo *info, struct k_sigaction *ka) { - da_handle_event_sssw(current, signal_deliver_sssw); + da_handle_event(current, signal_deliver_sssw); } static int enable_sssw(void) { int retval; - retval = da_monitor_init_sssw(); + retval = da_monitor_init(); if (retval) return retval; @@ -79,33 +77,33 @@ static int enable_sssw(void) static void disable_sssw(void) { - rv_sssw.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch); rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver); - da_monitor_destroy_sssw(); + da_monitor_destroy(); } -static struct rv_monitor rv_sssw = { +static struct rv_monitor rv_this = { .name = "sssw", .description = "set state sleep and wakeup.", .enable = enable_sssw, .disable = disable_sssw, - .reset = da_monitor_reset_all_sssw, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_sssw(void) { - return rv_register_monitor(&rv_sssw, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_sssw(void) { - rv_unregister_monitor(&rv_sssw); + rv_unregister_monitor(&rv_this); } module_init(register_sssw); diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h index 243d54050c94..1a4b806061c3 100644 --- a/kernel/trace/rv/monitors/sssw/sssw.h +++ b/kernel/trace/rv/monitors/sssw/sssw.h @@ -5,18 +5,20 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME sssw + enum states_sssw { - runnable_sssw = 0, + runnable_sssw, signal_wakeup_sssw, sleepable_sssw, sleeping_sssw, - state_max_sssw + state_max_sssw, }; #define INVALID_STATE state_max_sssw enum events_sssw { - 
sched_set_state_runnable_sssw = 0, + sched_set_state_runnable_sssw, sched_set_state_sleepable_sssw, sched_switch_blocking_sssw, sched_switch_in_sssw, @@ -25,7 +27,7 @@ enum events_sssw { sched_switch_yield_sssw, sched_wakeup_sssw, signal_deliver_sssw, - event_max_sssw + event_max_sssw, }; struct automaton_sssw { @@ -41,7 +43,7 @@ static const struct automaton_sssw automaton_sssw = { "runnable", "signal_wakeup", "sleepable", - "sleeping" + "sleeping", }, .event_names = { "sched_set_state_runnable", @@ -52,7 +54,7 @@ static const struct automaton_sssw automaton_sssw = { "sched_switch_suspend", "sched_switch_yield", "sched_wakeup", - "signal_deliver" + "signal_deliver", }, .function = { { @@ -64,7 +66,7 @@ static const struct automaton_sssw automaton_sssw = { INVALID_STATE, runnable_sssw, runnable_sssw, - runnable_sssw + runnable_sssw, }, { INVALID_STATE, @@ -75,7 +77,7 @@ static const struct automaton_sssw automaton_sssw = { INVALID_STATE, signal_wakeup_sssw, signal_wakeup_sssw, - runnable_sssw + runnable_sssw, }, { runnable_sssw, @@ -86,7 +88,7 @@ static const struct automaton_sssw automaton_sssw = { sleeping_sssw, signal_wakeup_sssw, runnable_sssw, - sleepable_sssw + sleepable_sssw, }, { INVALID_STATE, @@ -97,7 +99,7 @@ static const struct automaton_sssw automaton_sssw = { INVALID_STATE, INVALID_STATE, runnable_sssw, - INVALID_STATE + INVALID_STATE, }, }, .initial_state = runnable_sssw, diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c index c4a9cd67c1d2..ce031cbf202a 100644 --- a/kernel/trace/rv/monitors/sts/sts.c +++ b/kernel/trace/rv/monitors/sts/sts.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "sts" @@ -16,17 +15,16 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "sts.h" - -static struct rv_monitor rv_sts; -DECLARE_DA_MON_PER_CPU(sts, unsigned char); +#include <rv/da_monitor.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/trace/irq_vectors.h> static void handle_vector_irq_entry(void *data, int vector) { - da_handle_event_sts(irq_entry_sts); + da_handle_event(irq_entry_sts); } static void attach_vector_irq(void) @@ -61,17 +59,17 @@ static void detach_vector_irq(void) { } static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_sts(irq_disable_sts); + da_handle_event(irq_disable_sts); } static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_sts(irq_enable_sts); + da_handle_event(irq_enable_sts); } static void handle_irq_entry(void *data, int irq, struct irqaction *action) { - da_handle_event_sts(irq_entry_sts); + da_handle_event(irq_entry_sts); } static void handle_sched_switch(void *data, bool preempt, @@ -79,24 +77,24 @@ static void handle_sched_switch(void *data, bool preempt, struct task_struct *next, unsigned int prev_state) { - da_handle_event_sts(sched_switch_sts); + da_handle_event(sched_switch_sts); } static void handle_schedule_entry(void *data, bool preempt) { - da_handle_event_sts(schedule_entry_sts); + da_handle_event(schedule_entry_sts); } static void handle_schedule_exit(void *data, bool is_switch) { - da_handle_start_event_sts(schedule_exit_sts); + da_handle_start_event(schedule_exit_sts); } static int enable_sts(void) { int retval; - retval = da_monitor_init_sts(); + retval = da_monitor_init(); if (retval) return retval; @@ -113,7 +111,7 @@ static int enable_sts(void) static void 
disable_sts(void) { - rv_sts.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("sts", irq_disable, handle_irq_disable); rv_detach_trace_probe("sts", irq_enable, handle_irq_enable); @@ -123,29 +121,29 @@ static void disable_sts(void) rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); detach_vector_irq(); - da_monitor_destroy_sts(); + da_monitor_destroy(); } /* * This is the monitor register section. */ -static struct rv_monitor rv_sts = { +static struct rv_monitor rv_this = { .name = "sts", .description = "schedule implies task switch.", .enable = enable_sts, .disable = disable_sts, - .reset = da_monitor_reset_all_sts, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_sts(void) { - return rv_register_monitor(&rv_sts, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_sts(void) { - rv_unregister_monitor(&rv_sts); + rv_unregister_monitor(&rv_this); } module_init(register_sts); diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h index 3368b6599a00..6f7b2d9d72e6 100644 --- a/kernel/trace/rv/monitors/sts/sts.h +++ b/kernel/trace/rv/monitors/sts/sts.h @@ -5,27 +5,29 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME sts + enum states_sts { - can_sched_sts = 0, + can_sched_sts, cant_sched_sts, disable_to_switch_sts, enable_to_exit_sts, in_irq_sts, scheduling_sts, switching_sts, - state_max_sts + state_max_sts, }; #define INVALID_STATE state_max_sts enum events_sts { - irq_disable_sts = 0, + irq_disable_sts, irq_enable_sts, irq_entry_sts, sched_switch_sts, schedule_entry_sts, schedule_exit_sts, - event_max_sts + event_max_sts, }; struct automaton_sts { @@ -44,7 +46,7 @@ static const struct automaton_sts automaton_sts = { "enable_to_exit", "in_irq", "scheduling", - "switching" + "switching", }, .event_names = { "irq_disable", @@ -52,7 +54,7 @@ static const struct automaton_sts automaton_sts = { "irq_entry", "sched_switch", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { @@ -61,7 +63,7 @@ static const struct automaton_sts automaton_sts = { INVALID_STATE, INVALID_STATE, scheduling_sts, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ -69,7 +71,7 @@ static const struct automaton_sts automaton_sts = { cant_sched_sts, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ -77,7 +79,7 @@ static const struct automaton_sts automaton_sts = { in_irq_sts, switching_sts, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { enable_to_exit_sts, @@ -85,7 +87,7 @@ static const struct automaton_sts automaton_sts = { enable_to_exit_sts, INVALID_STATE, INVALID_STATE, - can_sched_sts + can_sched_sts, }, { INVALID_STATE, @@ -93,7 +95,7 @@ static const struct automaton_sts automaton_sts = { in_irq_sts, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { disable_to_switch_sts, @@ -101,7 +103,7 @@ static const struct automaton_sts automaton_sts = { INVALID_STATE, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ -109,7 +111,7 @@ static const struct automaton_sts automaton_sts = { INVALID_STATE, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, }, .initial_state = can_sched_sts, diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index 4b4e99615a11..22d77ec42463 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include 
<linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "wip" @@ -14,31 +13,30 @@ #include <trace/events/sched.h> #include <trace/events/preemptirq.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "wip.h" - -static struct rv_monitor rv_wip; -DECLARE_DA_MON_PER_CPU(wip, unsigned char); +#include <rv/da_monitor.h> static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_wip(preempt_disable_wip); + da_handle_event(preempt_disable_wip); } static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_start_event_wip(preempt_enable_wip); + da_handle_start_event(preempt_enable_wip); } static void handle_sched_waking(void *data, struct task_struct *task) { - da_handle_event_wip(sched_waking_wip); + da_handle_event(sched_waking_wip); } static int enable_wip(void) { int retval; - retval = da_monitor_init_wip(); + retval = da_monitor_init(); if (retval) return retval; @@ -51,32 +49,32 @@ static int enable_wip(void) static void disable_wip(void) { - rv_wip.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable); rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable); rv_detach_trace_probe("wip", sched_waking, handle_sched_waking); - da_monitor_destroy_wip(); + da_monitor_destroy(); } -static struct rv_monitor rv_wip = { +static struct rv_monitor rv_this = { .name = "wip", .description = "wakeup in preemptive per-cpu testing monitor.", .enable = enable_wip, .disable = disable_wip, - .reset = da_monitor_reset_all_wip, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_wip(void) { - return rv_register_monitor(&rv_wip, NULL); + return rv_register_monitor(&rv_this, NULL); } static void __exit unregister_wip(void) { - rv_unregister_monitor(&rv_wip); + rv_unregister_monitor(&rv_this); } module_init(register_wip); diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index c7193748bf36..b4c3eea94c86 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -5,19 +5,21 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME wip + enum states_wip { - preemptive_wip = 0, + preemptive_wip, non_preemptive_wip, - state_max_wip + state_max_wip, }; #define INVALID_STATE state_max_wip enum events_wip { - preempt_disable_wip = 0, + preempt_disable_wip, preempt_enable_wip, sched_waking_wip, - event_max_wip + event_max_wip, }; struct automaton_wip { @@ -31,12 +33,12 @@ struct automaton_wip { static const struct automaton_wip automaton_wip = { .state_names = { "preemptive", - "non_preemptive" + "non_preemptive", }, .event_names = { "preempt_disable", "preempt_enable", - "sched_waking" + "sched_waking", }, .function = { { non_preemptive_wip, INVALID_STATE, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 4145bea2729e..579e7e217ee0 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -6,40 +6,38 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "wwnr" #include <rv_trace.h> #include <trace/events/sched.h> +#define RV_MON_TYPE RV_MON_PER_TASK #include "wwnr.h" - -static struct rv_monitor rv_wwnr; -DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); +#include <rv/da_monitor.h> static void handle_switch(void *data, bool preempt, struct task_struct 
*p, struct task_struct *n, unsigned int prev_state) { /* start monitoring only after the first suspension */ if (prev_state == TASK_INTERRUPTIBLE) - da_handle_start_event_wwnr(p, switch_out_wwnr); + da_handle_start_event(p, switch_out_wwnr); else - da_handle_event_wwnr(p, switch_out_wwnr); + da_handle_event(p, switch_out_wwnr); - da_handle_event_wwnr(n, switch_in_wwnr); + da_handle_event(n, switch_in_wwnr); } static void handle_wakeup(void *data, struct task_struct *p) { - da_handle_event_wwnr(p, wakeup_wwnr); + da_handle_event(p, wakeup_wwnr); } static int enable_wwnr(void) { int retval; - retval = da_monitor_init_wwnr(); + retval = da_monitor_init(); if (retval) return retval; @@ -51,31 +49,31 @@ static int enable_wwnr(void) static void disable_wwnr(void) { - rv_wwnr.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("wwnr", sched_switch, handle_switch); rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup); - da_monitor_destroy_wwnr(); + da_monitor_destroy(); } -static struct rv_monitor rv_wwnr = { +static struct rv_monitor rv_this = { .name = "wwnr", .description = "wakeup while not running per-task testing model.", .enable = enable_wwnr, .disable = disable_wwnr, - .reset = da_monitor_reset_all_wwnr, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_wwnr(void) { - return rv_register_monitor(&rv_wwnr, NULL); + return rv_register_monitor(&rv_this, NULL); } static void __exit unregister_wwnr(void) { - rv_unregister_monitor(&rv_wwnr); + rv_unregister_monitor(&rv_this); } module_init(register_wwnr); diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index 0a59d23edf61..a28006512c9b 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -5,19 +5,21 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME wwnr + enum states_wwnr { - not_running_wwnr = 0, + not_running_wwnr, running_wwnr, - state_max_wwnr + state_max_wwnr, }; #define INVALID_STATE state_max_wwnr enum events_wwnr { - switch_in_wwnr = 0, + switch_in_wwnr, switch_out_wwnr, wakeup_wwnr, - event_max_wwnr + event_max_wwnr, }; struct automaton_wwnr { @@ -31,12 +33,12 @@ struct automaton_wwnr { static const struct automaton_wwnr automaton_wwnr = { .state_names = { "not_running", - "running" + "running", }, .event_names = { "switch_in", "switch_out", - "wakeup" + "wakeup", }, .function = { { running_wwnr, INVALID_STATE, not_running_wwnr }, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index baec63134ab6..b1cb30a7b83d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1178,11 +1178,10 @@ EXPORT_SYMBOL_GPL(__trace_array_puts); * __trace_puts - write a constant string into the trace buffer. * @ip: The address of the caller * @str: The constant string to write - * @size: The size of the string. 
*/ -int __trace_puts(unsigned long ip, const char *str, int size) +int __trace_puts(unsigned long ip, const char *str) { - return __trace_array_puts(printk_trace, ip, str, size); + return __trace_array_puts(printk_trace, ip, str, strlen(str)); } EXPORT_SYMBOL_GPL(__trace_puts); @@ -1201,7 +1200,7 @@ int __trace_bputs(unsigned long ip, const char *str) int size = sizeof(struct bputs_entry); if (!printk_binsafe(tr)) - return __trace_puts(ip, str, strlen(str)); + return __trace_puts(ip, str); if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; @@ -6115,10 +6114,10 @@ static int cmp_mod_entry(const void *key, const void *pivot) unsigned long addr = (unsigned long)key; const struct trace_mod_entry *ent = pivot; - if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr) - return 0; - else - return addr - ent->mod_addr; + if (addr < ent[0].mod_addr) + return -1; + + return addr >= ent[1].mod_addr; } /** diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b6d42fe06115..8428c437cb9d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -68,14 +68,17 @@ enum trace_type { #undef __field_fn #define __field_fn(type, item) type item; +#undef __field_packed +#define __field_packed(type, item) type item; + #undef __field_struct #define __field_struct(type, item) __field(type, item) #undef __field_desc #define __field_desc(type, container, item) -#undef __field_packed -#define __field_packed(type, container, item) +#undef __field_desc_packed +#define __field_desc_packed(type, container, item) #undef __array #define __array(type, item, size) type item[size]; @@ -2116,7 +2119,7 @@ extern void tracing_log_err(struct trace_array *tr, * about performance). The internal_trace_puts() is for such * a purpose. */ -#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str) #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index f6a8d29c0d76..54417468fdeb 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -79,8 +79,8 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) - __field_packed( unsigned long, graph_ent, func ) - __field_packed( unsigned long, graph_ent, depth ) + __field_desc_packed(unsigned long, graph_ent, func ) + __field_desc_packed(unsigned long, graph_ent, depth ) __dynamic_array(unsigned long, args ) ), @@ -96,9 +96,9 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry, F_STRUCT( __field_struct( struct fgraph_retaddr_ent, graph_rent ) - __field_packed( unsigned long, graph_rent.ent, func ) - __field_packed( unsigned long, graph_rent.ent, depth ) - __field_packed( unsigned long, graph_rent, retaddr ) + __field_desc_packed( unsigned long, graph_rent.ent, func ) + __field_desc_packed( unsigned long, graph_rent.ent, depth ) + __field_desc_packed( unsigned long, graph_rent, retaddr ) __dynamic_array(unsigned long, args ) ), @@ -123,12 +123,12 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) - __field_packed( unsigned long, ret, func ) - __field_packed( unsigned long, ret, retval ) - __field_packed( unsigned int, ret, depth ) - __field_packed( unsigned int, ret, overrun ) - __field(unsigned long long, calltime ) - __field(unsigned long long, rettime ) + __field_desc_packed( unsigned long, ret, func ) + __field_desc_packed( 
unsigned long, ret, retval ) + __field_desc_packed( unsigned int, ret, depth ) + __field_desc_packed( unsigned int, ret, overrun ) + __field_packed(unsigned long long, calltime) + __field_packed(unsigned long long, rettime ) ), F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u retval: %lx", @@ -146,11 +146,11 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) - __field_packed( unsigned long, ret, func ) - __field_packed( unsigned int, ret, depth ) - __field_packed( unsigned int, ret, overrun ) - __field(unsigned long long, calltime ) - __field(unsigned long long, rettime ) + __field_desc_packed( unsigned long, ret, func ) + __field_desc_packed( unsigned int, ret, depth ) + __field_desc_packed( unsigned int, ret, overrun ) + __field_packed(unsigned long long, calltime ) + __field_packed(unsigned long long, rettime ) ), F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u", diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5e6e70540eef..c97bb2fda5c0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2057,6 +2057,15 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING; else hist_field->fn_num = HIST_FIELD_FN_PSTRING; + } else if (field->filter_type == FILTER_STACKTRACE) { + flags |= HIST_FIELD_FL_STACKTRACE; + + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + + hist_field->fn_num = HIST_FIELD_FN_STACK; } else { hist_field->size = field->size; hist_field->is_signed = field->is_signed; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 4554c458b78c..45c187e77e21 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -130,7 +130,9 @@ static int synth_event_define_fields(struct trace_event_call *call) struct synth_event *event = call->data; unsigned int i, size, n_u64; char *name, *type; + int filter_type; bool is_signed; + bool is_stack; int ret = 0; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { @@ -138,8 +140,12 @@ static int synth_event_define_fields(struct trace_event_call *call) is_signed = event->fields[i]->is_signed; type = event->fields[i]->type; name = event->fields[i]->name; + is_stack = event->fields[i]->is_stack; + + filter_type = is_stack ? 
FILTER_STACKTRACE : FILTER_OTHER; + ret = trace_define_field(call, type, name, offset, size, - is_signed, FILTER_OTHER); + is_signed, filter_type); if (ret) break; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 1698fc22afa0..32a42ef31855 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -42,11 +42,14 @@ static int ftrace_event_register(struct trace_event_call *call, #undef __field_fn #define __field_fn(type, item) type item; +#undef __field_packed +#define __field_packed(type, item) type item; + #undef __field_desc #define __field_desc(type, container, item) type item; -#undef __field_packed -#define __field_packed(type, container, item) type item; +#undef __field_desc_packed +#define __field_desc_packed(type, container, item) type item; #undef __array #define __array(type, item, size) type item[size]; @@ -104,11 +107,14 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __field_fn #define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN) +#undef __field_packed +#define __field_packed(_type, _item) __field_ext_packed(_type, _item, FILTER_OTHER) + #undef __field_desc #define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER) -#undef __field_packed -#define __field_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER) +#undef __field_desc_packed +#define __field_desc_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER) #undef __array #define __array(_type, _item, _len) { \ @@ -146,11 +152,14 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \ #undef __field_fn #define __field_fn(type, item) +#undef __field_packed +#define __field_packed(type, item) + #undef __field_desc #define __field_desc(type, container, item) -#undef __field_packed -#define __field_packed(type, container, item) +#undef __field_desc_packed +#define __field_desc_packed(type, container, item) #undef __array #define __array(type, item, len) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b1e9c9913309..1de6f1573621 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -901,7 +901,7 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr trace_seq_printf(s, "%ps", func); if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { - print_function_args(s, entry->args, (unsigned long)func); + print_function_args(s, FGRAPH_ENTRY_ARGS(entry), (unsigned long)func); trace_seq_putc(s, ';'); } else trace_seq_puts(s, "();"); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 6ea2f6363b90..5c153106e642 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -125,7 +125,7 @@ static void __acct_update_integrals(struct task_struct *tsk, { u64 time, delta; - if (!likely(tsk->mm)) + if (unlikely(!tsk->mm || (tsk->flags & PF_KTHREAD))) return; time = stime + utime; diff --git a/kernel/ucount.c b/kernel/ucount.c index 586af49fc03e..fc4a8f2d3096 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -47,7 +47,7 @@ static int set_permissions(struct ctl_table_header *head, int mode; /* Allow users with CAP_SYS_RESOURCE unrestrained access */ - if (ns_capable(user_ns, CAP_SYS_RESOURCE)) + if (ns_capable_noaudit(user_ns, CAP_SYS_RESOURCE)) mode = (table->mode & S_IRWXU) >> 6; else /* Allow all others at most read-only access */ diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 39e270789444..90ab3c1a205e 100644 --- 
a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -31,6 +31,7 @@ static int unwind_user_next_common(struct unwind_user_state *state, { unsigned long cfa, fp, ra; + /* Get the Canonical Frame Address (CFA) */ if (frame->use_fp) { if (state->fp < state->sp) return -EINVAL; @@ -38,11 +39,9 @@ static int unwind_user_next_common(struct unwind_user_state *state, } else { cfa = state->sp; } - - /* Get the Canonical Frame Address (CFA) */ cfa += frame->cfa_off; - /* stack going in wrong direction? */ + /* Make sure that stack is not going in wrong direction */ if (cfa <= state->sp) return -EINVAL; @@ -50,10 +49,11 @@ static int unwind_user_next_common(struct unwind_user_state *state, if (cfa & (state->ws - 1)) return -EINVAL; - /* Find the Return Address (RA) */ + /* Get the Return Address (RA) */ if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) return -EINVAL; + /* Get the Frame Pointer (FP) */ if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) return -EINVAL; @@ -67,7 +67,6 @@ static int unwind_user_next_common(struct unwind_user_state *state, static int unwind_user_next_fp(struct unwind_user_state *state) { -#ifdef CONFIG_HAVE_UNWIND_USER_FP struct pt_regs *regs = task_pt_regs(current); if (state->topmost && unwind_user_at_function_start(regs)) { @@ -81,9 +80,6 @@ static int unwind_user_next_fp(struct unwind_user_state *state) ARCH_INIT_USER_FP_FRAME(state->ws) }; return unwind_user_next_common(state, &fp_frame); -#else - return -EINVAL; -#endif } static int unwind_user_next(struct unwind_user_state *state) diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index fe9bf8db1922..8d82913223a1 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -36,7 +36,11 @@ struct hwerr_info { time64_t timestamp; }; -static struct hwerr_info hwerr_data[HWERR_RECOV_MAX]; +/* + * The hwerr_data[] array is declared with global scope so that it remains + * accessible to vmcoreinfo even when Link Time Optimization (LTO) is enabled. 
+ */ +struct hwerr_info hwerr_data[HWERR_RECOV_MAX]; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) @@ -137,7 +141,9 @@ EXPORT_SYMBOL_GPL(hwerr_log_error_type); static int __init crash_save_vmcoreinfo_init(void) { - vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + int order; + order = get_order(VMCOREINFO_BYTES); + vmcoreinfo_data = (unsigned char *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); if (!vmcoreinfo_data) { pr_warn("Memory allocation for vmcoreinfo_data failed\n"); return -ENOMEM; @@ -146,7 +152,7 @@ static int __init crash_save_vmcoreinfo_init(void) vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, GFP_KERNEL | __GFP_ZERO); if (!vmcoreinfo_note) { - free_page((unsigned long)vmcoreinfo_data); + free_pages((unsigned long)vmcoreinfo_data, order); vmcoreinfo_data = NULL; pr_warn("Memory allocation for vmcoreinfo_note failed\n"); return -ENOMEM; @@ -238,7 +244,6 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(kallsyms_token_table); VMCOREINFO_SYMBOL(kallsyms_token_index); VMCOREINFO_SYMBOL(kallsyms_offsets); - VMCOREINFO_SYMBOL(kallsyms_relative_base); #endif /* CONFIG_KALLSYMS */ arch_crash_save_vmcoreinfo(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 366122f4a0f8..7d675781bc91 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = - IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC; static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; @@ -550,7 +550,7 @@ static bool need_counting_irqs(void) u8 util; int tail = __this_cpu_read(cpustat_tail); - tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT; + tail = (tail + NUM_SAMPLE_PERIODS - 1) % NUM_SAMPLE_PERIODS; util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]); return util > HARDIRQ_PERCENT_THRESH; } @@ -774,8 +774,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); - int duration; int softlockup_all_cpu_backtrace; + int duration, thresh_count; unsigned long flags; if (!watchdog_enabled) @@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT); - if (softlockup_panic) + thresh_count = duration / get_softlockup_thresh(); + + if (softlockup_panic && thresh_count >= softlockup_panic) panic("softlockup: hung tasks"); } @@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "softlockup_sys_info", diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index d3ca70e3c256..cf05775a96d3 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -118,18 +118,11 @@ static void watchdog_overflow_callback(struct perf_event *event, watchdog_hardlockup_check(smp_processor_id(), regs); } -static int hardlockup_detector_event_create(void) +static struct perf_event *hardlockup_detector_event_create(unsigned int cpu) { - unsigned int cpu; struct perf_event_attr *wd_attr; struct perf_event *evt; - /* - * Preemption is not disabled because memory will be allocated. 
- * Ensure CPU-locality by calling this in per-CPU kthread. - */ - WARN_ON(!is_percpu_thread()); - cpu = raw_smp_processor_id(); wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); @@ -143,14 +136,7 @@ static int hardlockup_detector_event_create(void) watchdog_overflow_callback, NULL); } - if (IS_ERR(evt)) { - pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, - PTR_ERR(evt)); - return PTR_ERR(evt); - } - WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak"); - this_cpu_write(watchdog_ev, evt); - return 0; + return evt; } /** @@ -159,17 +145,26 @@ static int hardlockup_detector_event_create(void) */ void watchdog_hardlockup_enable(unsigned int cpu) { + struct perf_event *evt; + WARN_ON_ONCE(cpu != smp_processor_id()); - if (hardlockup_detector_event_create()) + evt = hardlockup_detector_event_create(cpu); + if (IS_ERR(evt)) { + pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); return; + } /* use original value for check */ if (!atomic_fetch_inc(&watchdog_cpus)) pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); + WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak"); + this_cpu_write(watchdog_ev, evt); + watchdog_init_timestamp(); - perf_event_enable(this_cpu_read(watchdog_ev)); + perf_event_enable(evt); } /** @@ -263,19 +258,30 @@ bool __weak __init arch_perf_nmi_is_available(void) */ int __init watchdog_hardlockup_probe(void) { + struct perf_event *evt; + unsigned int cpu; int ret; if (!arch_perf_nmi_is_available()) return -ENODEV; - ret = hardlockup_detector_event_create(); + if (!hw_nmi_get_sample_period(watchdog_thresh)) + return -EINVAL; - if (ret) { + /* + * Test hardware PMU availability by creating a temporary perf event. + * The event is released immediately. + */ + cpu = raw_smp_processor_id(); + evt = hardlockup_detector_event_create(cpu); + if (IS_ERR(evt)) { pr_info("Perf NMI watchdog permanently disabled\n"); + ret = PTR_ERR(evt); } else { - perf_event_release_kernel(this_cpu_read(watchdog_ev)); - this_cpu_write(watchdog_ev, NULL); + perf_event_release_kernel(evt); + ret = 0; } + return ret; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 253311af47c6..c515cff01828 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -117,6 +117,8 @@ enum wq_internal_consts { MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ + RESCUER_BATCH = 16, /* process items per turn */ + /* * Rescue workers are used only on emergencies and shared by * all cpus. Give MIN_NICE. 
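A note on the watchdog_perf.c refactor above: hardlockup_detector_event_create() now returns either a valid struct perf_event * or an encoded error, and the callers sort out the two cases with IS_ERR()/PTR_ERR(). The sketch below is a minimal userspace re-statement of that error-pointer idiom, assuming the usual encoding of errno values in the top 4095 addresses; the simplified ERR_PTR()/PTR_ERR()/IS_ERR() helpers and the create_event() stand-in are illustrative only, not the kernel's definitions from include/linux/err.h.

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

#define MAX_ERRNO 4095  /* errno values are folded into the top 4095 addresses */

/* Simplified versions of the kernel's error-pointer helpers. */
static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long  PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int   IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct event { int id; };

/* Illustrative stand-in: return a real object on success, an error pointer on failure. */
static struct event *create_event(int want_failure)
{
    struct event *evt;

    if (want_failure)
        return ERR_PTR(-ENODEV);   /* the object and the errno share one return value */

    evt = malloc(sizeof(*evt));
    if (!evt)
        return ERR_PTR(-ENOMEM);
    evt->id = 1;
    return evt;
}

int main(void)
{
    struct event *evt = create_event(1);

    if (IS_ERR(evt))                   /* same shape as the watchdog_hardlockup_enable() caller */
        printf("create failed with %ld\n", PTR_ERR(evt));
    else
        free(evt);
    return 0;
}

Returning the event (or the error) directly is what lets watchdog_hardlockup_enable() and watchdog_hardlockup_probe() decide for themselves whether to log, release, or keep the event, instead of the helper writing to watchdog_ev as a side effect.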
@@ -286,6 +288,7 @@ struct pool_workqueue { struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ + struct work_struct mayday_cursor; /* L: cursor on pool->worklist */ u64 stats[PWQ_NR_STATS]; @@ -1120,6 +1123,12 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, return NULL; } +static void mayday_cursor_func(struct work_struct *work) +{ + /* should not be processed, only for marking position */ + BUG(); +} + /** * move_linked_works - move linked works to a list * @work: start of series of works to be scheduled @@ -1182,6 +1191,16 @@ static bool assign_work(struct work_struct *work, struct worker *worker, lockdep_assert_held(&pool->lock); + /* The cursor work should not be processed */ + if (unlikely(work->func == mayday_cursor_func)) { + /* only worker_thread() can possibly take this branch */ + WARN_ON_ONCE(worker->rescue_wq); + if (nextp) + *nextp = list_next_entry(work, entry); + list_del_init(&work->entry); + return false; + } + /* * A single work shouldn't be executed concurrently by multiple workers. * __queue_work() ensures that @work doesn't jump to a different pool @@ -2976,9 +2995,8 @@ static void idle_cull_fn(struct work_struct *work) reap_dying_workers(&cull_list); } -static void send_mayday(struct work_struct *work) +static void send_mayday(struct pool_workqueue *pwq) { - struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq_mayday_lock); @@ -3016,7 +3034,7 @@ static void pool_mayday_timeout(struct timer_list *t) * rescuers. */ list_for_each_entry(work, &pool->worklist, entry) - send_mayday(work); + send_mayday(get_work_pwq(work)); } raw_spin_unlock(&wq_mayday_lock); @@ -3440,22 +3458,57 @@ sleep: static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer) { struct worker_pool *pool = pwq->pool; + struct work_struct *cursor = &pwq->mayday_cursor; struct work_struct *work, *n; - /* need rescue? */ - if (!pwq->nr_active || !need_to_create_worker(pool)) + /* have work items to rescue? */ + if (!pwq->nr_active) return false; - /* - * Slurp in all works issued via this workqueue and - * process'em. - */ - list_for_each_entry_safe(work, n, &pool->worklist, entry) { - if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) + /* need rescue? */ + if (!need_to_create_worker(pool)) { + /* + * The pool has idle workers and doesn't need the rescuer, so it + * could simply return false here. + * + * However, the memory pressure might not be fully relieved. + * In a PERCPU pool with concurrency enabled, having idle workers + * does not necessarily mean memory pressure is gone; it may + * simply mean regular workers have woken up, completed their + * work, and gone idle again due to concurrency limits. + * + * In this case, those working workers may later sleep again, + * the pool may run out of idle workers, and it will have to + * allocate new ones and wait for the timer to send mayday, + * causing unnecessary delay - especially if memory pressure + * was never resolved throughout. + * + * So keep doing rescue work while memory pressure still appears + * to be on, using (pool->flags & POOL_MANAGER_ACTIVE) as a rough + * indicator, unless there are other PWQs needing help.
+ */ + if (!(pool->flags & POOL_MANAGER_ACTIVE) || + !list_empty(&pwq->wq->maydays)) + return false; + } + + /* search from the start, or from the cursor if available */ + if (list_empty(&cursor->entry)) + work = list_first_entry(&pool->worklist, struct work_struct, entry); + else + work = list_next_entry(cursor, entry); + + /* find the next work item to rescue */ + list_for_each_entry_safe_from(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) { pwq->stats[PWQ_STAT_RESCUED]++; + /* park the cursor for the next search */ + list_move_tail(&cursor->entry, &n->entry); + return true; + } } - return !list_empty(&rescuer->scheduled); + return false; } /** @@ -3512,6 +3565,7 @@ repeat: struct pool_workqueue *pwq = list_first_entry(&wq->maydays, struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; + unsigned int count = 0; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -3524,31 +3578,27 @@ repeat: WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); - if (assign_rescuer_work(pwq, rescuer)) { + while (assign_rescuer_work(pwq, rescuer)) { process_scheduled_works(rescuer); /* - * The above execution of rescued work items could - * have created more to rescue through - * pwq_activate_first_inactive() or chained - * queueing. Let's put @pwq back on mayday list so - * that such back-to-back work items, which may be - * being used to relieve memory pressure, don't - * incur MAYDAY_INTERVAL delay inbetween. + * If the per-turn work item limit is reached and other + * PWQs are in mayday, requeue mayday for this PWQ and + * let the rescuer handle the other PWQs first. */ - if (pwq->nr_active && need_to_create_worker(pool)) { + if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) && + pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); - /* - * Queue iff somebody else hasn't queued it already. - */ - if (list_empty(&pwq->mayday_node)) { - get_pwq(pwq); - list_add_tail(&pwq->mayday_node, &wq->maydays); - } + send_mayday(pwq); raw_spin_unlock(&wq_mayday_lock); + break; } } + /* The cursor must not be left on the worklist once the rescuer stops watching this pwq. */ + if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node)) + list_del_init(&pwq->mayday_cursor.entry); + /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. @@ -5167,6 +5217,19 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); kthread_init_work(&pwq->release_work, pwq_release_workfn); + + /* + * Set up the dummy cursor work with a valid function and a valid get_work_pwq(). + * + * The cursor work should only be in the pwq->pool->worklist, and + * should not be treated as a processable work item. + * + * WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE just make it less + * surprising to kernel debugging tools and reviewers.
+ */ + INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func); + atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq | + WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE); } /* sync @pwq with the current state of its associated wq and link it */ @@ -6959,13 +7022,16 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) } /** - * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask - * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask + * workqueue_unbound_housekeeping_update - Propagate housekeeping cpumask update + * @hk: the new housekeeping cpumask + * + * Update the unbound workqueue cpumask on top of the new housekeeping cpumask such + * that the effective unbound affinity is the intersection of the new housekeeping + * with the requested affinity set via nohz_full=/isolcpus= or sysfs. * - * This function can be called from cpuset code to provide a set of isolated - * CPUs that should be excluded from wq_unbound_cpumask. + * Return: 0 on success and -errno on failure. */ -int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) +int workqueue_unbound_housekeeping_update(const struct cpumask *hk) { cpumask_var_t cpumask; int ret = 0; @@ -6981,14 +7047,14 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten * by any subsequent write to workqueue/cpumask sysfs file. */ - if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask)) + if (!cpumask_and(cpumask, wq_requested_unbound_cpumask, hk)) cpumask_copy(cpumask, wq_requested_unbound_cpumask); if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); /* Save the current isolated cpumask & export it via sysfs */ if (!ret) - cpumask_copy(wq_isolated_cpumask, exclude_cpumask); + cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, hk); mutex_unlock(&wq_pool_mutex); free_cpumask_var(cpumask); @@ -7505,9 +7571,13 @@ static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; -static unsigned int wq_panic_on_stall; +static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC; module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644); +static unsigned int wq_panic_on_stall_time; +module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644); +MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)"); + /* * Show workers that might prevent the processing of pending work items. * The only candidates are CPU-bound workers in the running state. 
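The mayday_cursor added in the workqueue.c hunks above is a dummy work item parked on pool->worklist purely to remember how far the rescuer got, so that the next assign_rescuer_work() call can resume the scan instead of restarting from the head, while assign_work() quietly skips the cursor itself. Below is a self-contained sketch of that cursor-in-a-list technique on a plain circular doubly-linked list; the tiny list helpers, struct node, and process_batch() are made-up names for illustration and are not the kernel's list.h API.

#include <stdio.h>

/* Minimal circular doubly-linked list, in the spirit of list.h but standalone. */
struct node {
    struct node *prev, *next;
    int payload;   /* 0 marks the cursor, which is never "processed" */
};

static void list_init(struct node *head)     { head->prev = head->next = head; }

static void list_del(struct node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
    list_init(n);
}

static void list_add_tail(struct node *n, struct node *head)
{
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

/* Park the cursor right before @pos, remembering where to resume next time. */
static void cursor_place_before(struct node *cursor, struct node *pos)
{
    list_del(cursor);
    list_add_tail(cursor, pos);
}

/*
 * Process up to @batch items, resuming after the cursor when it is on the
 * list, else starting from the head - the same shape as assign_rescuer_work().
 */
static void process_batch(struct node *head, struct node *cursor, int batch)
{
    struct node *pos = (cursor->next != cursor) ? cursor->next : head->next;

    while (batch-- && pos != head) {
        struct node *next = pos->next;

        if (pos != cursor) {             /* never process the cursor itself */
            printf("processing item %d\n", pos->payload);
            list_del(pos);
            cursor_place_before(cursor, next);   /* resume here next time */
        }
        pos = next;
    }
}

int main(void)
{
    struct node head, cursor = { .payload = 0 };
    struct node items[5];
    int i;

    list_init(&head);
    list_init(&cursor);
    for (i = 0; i < 5; i++) {
        items[i].payload = i + 1;
        list_add_tail(&items[i], &head);
    }

    process_batch(&head, &cursor, 2);   /* items 1 and 2 */
    process_batch(&head, &cursor, 2);   /* resumes at item 3 */
    list_del(&cursor);                  /* drop the cursor when done, as the rescuer does */
    return 0;
}

The same invariant the patch enforces applies here: once nobody is going to resume the scan, the cursor has to be unlinked, otherwise a stale placeholder is left sitting in the middle of live entries.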
@@ -7559,14 +7629,25 @@ static void show_cpu_pools_hogs(void) rcu_read_unlock(); } -static void panic_on_wq_watchdog(void) +/* + * Trigger a panic in either of two scenarios: when the total number of + * detected stalls reaches wq_panic_on_stall, or when a single stall lasts + * wq_panic_on_stall_time seconds or longer. + */ +static void panic_on_wq_watchdog(unsigned int stall_time_sec) { static unsigned int wq_stall; if (wq_panic_on_stall) { wq_stall++; - BUG_ON(wq_stall >= wq_panic_on_stall); + if (wq_stall >= wq_panic_on_stall) + panic("workqueue: %u stall(s) exceeded threshold %u\n", + wq_stall, wq_panic_on_stall); } + + if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time) + panic("workqueue: stall lasted %us, exceeding threshold %us\n", + stall_time_sec, wq_panic_on_stall_time); } static void wq_watchdog_reset_touched(void) @@ -7581,10 +7662,12 @@ static void wq_watchdog_reset_touched(void) static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + unsigned int max_stall_time = 0; bool lockup_detected = false; bool cpu_pool_stall = false; unsigned long now = jiffies; struct worker_pool *pool; + unsigned int stall_time; int pi; if (!thresh) @@ -7618,14 +7701,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) /* did we stall? */ if (time_after(now, ts + thresh)) { lockup_detected = true; + stall_time = jiffies_to_msecs(now - pool_ts) / 1000; + max_stall_time = max(max_stall_time, stall_time); if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) { pool->cpu_stall = true; cpu_pool_stall = true; } pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); - pr_cont(" stuck for %us!\n", - jiffies_to_msecs(now - pool_ts) / 1000); + pr_cont(" stuck for %us!\n", stall_time); } @@ -7638,7 +7722,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) show_cpu_pools_hogs(); if (lockup_detected) - panic_on_wq_watchdog(); + panic_on_wq_watchdog(max_stall_time); wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh);
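The reworked panic_on_wq_watchdog() combines two independent thresholds: wq_panic_on_stall (panic once the total number of detected stalls reaches N) and the new wq_panic_on_stall_time (panic as soon as a single stall has lasted T seconds). The short model below, with the invented check_stall() helper and local copies of the two parameters, is only a sketch of how the knobs interact, not kernel code.

#include <stdio.h>
#include <stdbool.h>

/* Illustrative stand-ins for the two module parameters. */
static unsigned int panic_on_stall      = 3;    /* panic after this many stalls in total (0 = off) */
static unsigned int panic_on_stall_time = 120;  /* panic if one stall lasts this many seconds (0 = off) */

/*
 * Model of the reworked policy: returns true when either threshold fires.
 * @stall_time_sec is how long the current stall has lasted.
 */
static bool check_stall(unsigned int stall_time_sec)
{
    static unsigned int stall_count;

    if (panic_on_stall) {
        stall_count++;
        if (stall_count >= panic_on_stall)
            return true;    /* too many stalls overall */
    }

    if (panic_on_stall_time && stall_time_sec >= panic_on_stall_time)
        return true;        /* one stall lasted too long */

    return false;
}

int main(void)
{
    /* Two short stalls trip neither threshold ... */
    printf("%d\n", check_stall(30));   /* 0: count=1, 30s < 120s */
    printf("%d\n", check_stall(45));   /* 0: count=2, 45s < 120s */
    /* ... a third stall hits the count threshold even though it is short. */
    printf("%d\n", check_stall(10));   /* 1: count=3 >= panic_on_stall */
    return 0;
}

Since workqueue.c is built in, the knobs should be reachable as workqueue.panic_on_stall=... and workqueue.panic_on_stall_time=... on the kernel command line, or via /sys/module/workqueue/parameters/ at runtime; treat those exact names as the expected module_param_named() behaviour rather than something this diff spells out.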

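Finally, the crash_save_vmcoreinfo_init() hunk in kernel/vmcore_info.c switches from get_zeroed_page() to an order-based __get_free_pages() allocation so that VMCOREINFO_BYTES may span more than one page, and its error path frees with the matching order. The helper below restates the rounding that get_order() performs (the smallest order such that PAGE_SIZE << order covers the requested size); the 4 KiB page size and the sample sizes are assumptions made for the illustration.

#include <stdio.h>

#define PAGE_SIZE 4096UL   /* assumed page size for the example */

/*
 * Smallest order such that (PAGE_SIZE << order) >= size, i.e. what the
 * kernel's get_order() computes for the buddy allocator.
 */
static unsigned int order_for_size(unsigned long size)
{
    unsigned long span = PAGE_SIZE;
    unsigned int order = 0;

    while (span < size) {
        span <<= 1;
        order++;
    }
    return order;
}

int main(void)
{
    unsigned long sizes[] = { 4096, 4097, 8192, 12288, 16384 };
    unsigned int i;

    /* 4096 -> 0, 4097 -> 1, 8192 -> 1, 12288 -> 2, 16384 -> 2 */
    for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
        printf("size %lu -> order %u (%lu bytes allocated)\n",
               sizes[i], order_for_size(sizes[i]),
               PAGE_SIZE << order_for_size(sizes[i]));
    return 0;
}

The requirement that free_pages(addr, order) receive the same order is exactly why the hunk keeps order in a local variable instead of calling free_page() on the error path.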