From 11941f8a85362f612df61f4aaab0e41b64d2111d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 2 Jul 2021 16:48:23 +0530 Subject: bpf: cpumap: Implement generic cpumap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change implements CPUMAP redirect support for generic XDP programs. The idea is to reuse the cpu map entry's queue that is used to push native xdp frames for redirecting skb to a different CPU. This will match native XDP behavior (in that RPS is invoked again for packet reinjected into networking stack). To be able to determine whether the incoming skb is from the driver or cpumap, we reuse skb->redirected bit that skips generic XDP processing when it is set. To always make use of this, CONFIG_NET_REDIRECT guard on it has been lifted and it is always available. >From the redirect side, we add the skb to ptr_ring with its lowest bit set to 1. This should be safe as skb is not 1-byte aligned. This allows kthread to discern between xdp_frames and sk_buff. On consumption of the ptr_ring item, the lowest bit is unset. In the end, the skb is simply added to the list that kthread is anyway going to maintain for xdp_frames converted to skb, and then received again by using netif_receive_skb_list. Bulking optimization for generic cpumap is left as an exercise for a future patch for now. Since cpumap entry progs are now supported, also remove check in generic_xdp_install for the cpumap. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Reviewed-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20210702111825.491065-4-memxor@gmail.com --- kernel/bpf/cpumap.c | 116 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 98 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 480e936c54d0..585b2b77ccc4 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -16,6 +16,7 @@ * netstack, and assigning dedicated CPUs for this stage. This * basically allows for 10G wirespeed pre-filtering via bpf. */ +#include #include #include #include @@ -168,6 +169,46 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, + struct list_head *listp, + struct xdp_cpumap_stats *stats) +{ + struct sk_buff *skb, *tmp; + struct xdp_buff xdp; + u32 act; + int err; + + list_for_each_entry_safe(skb, tmp, listp, list) { + act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog); + switch (act) { + case XDP_PASS: + break; + case XDP_REDIRECT: + skb_list_del_init(skb); + err = xdp_do_generic_redirect(skb->dev, skb, &xdp, + rcpu->prog); + if (unlikely(err)) { + kfree_skb(skb); + stats->drop++; + } else { + stats->redirect++; + } + return; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(skb->dev, rcpu->prog, act); + fallthrough; + case XDP_DROP: + skb_list_del_init(skb); + kfree_skb(skb); + stats->drop++; + return; + } + } +} + static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, void **frames, int n, struct xdp_cpumap_stats *stats) @@ -176,11 +217,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, struct xdp_buff xdp; int i, nframes = 0; - if (!rcpu->prog) - return n; - - rcu_read_lock_bh(); - xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; @@ -227,17 +263,37 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } + xdp_clear_return_frame_no_direct(); + + return nframes; +} + +#define CPUMAP_BATCH 8 + +static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, + int xdp_n, struct xdp_cpumap_stats *stats, + struct list_head *list) +{ + int nframes; + + if (!rcpu->prog) + return xdp_n; + + rcu_read_lock_bh(); + + nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); + if (stats->redirect) - xdp_do_flush_map(); + xdp_do_flush(); - xdp_clear_return_frame_no_direct(); + if (unlikely(!list_empty(list))) + cpu_map_bpf_prog_run_skb(rcpu, list, stats); rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ return nframes; } -#define CPUMAP_BATCH 8 static int cpu_map_kthread_run(void *data) { @@ -254,9 +310,9 @@ static int cpu_map_kthread_run(void *data) struct xdp_cpumap_stats stats = {}; /* zero stats */ unsigned int kmem_alloc_drops = 0, sched = 0; gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; + int i, n, m, nframes, xdp_n; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - int i, n, m, nframes; LIST_HEAD(list); /* Release CPU reschedule checks */ @@ -280,9 +336,20 @@ static int cpu_map_kthread_run(void *data) */ n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - for (i = 0; i < n; i++) { + for (i = 0, xdp_n = 0; i < n; i++) { void *f = frames[i]; - struct page *page = virt_to_page(f); + struct page *page; + + if (unlikely(__ptr_test_bit(0, &f))) { + struct sk_buff *skb = f; + + __ptr_clear_bit(0, &skb); + list_add_tail(&skb->list, &list); + continue; + } + + frames[xdp_n++] = f; + page = virt_to_page(f); /* Bring struct page memory area to curr CPU. Read by * build_skb_around via page_is_pfmemalloc(), and when @@ -292,7 +359,7 @@ static int cpu_map_kthread_run(void *data) } /* Support running another XDP prog on this CPU */ - nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats); + nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list); if (nframes) { m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); if (unlikely(m == 0)) { @@ -330,12 +397,6 @@ static int cpu_map_kthread_run(void *data) return 0; } -bool cpu_map_prog_allowed(struct bpf_map *map) -{ - return map->map_type == BPF_MAP_TYPE_CPUMAP && - map->value_size != offsetofend(struct bpf_cpumap_val, qsize); -} - static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) { struct bpf_prog *prog; @@ -701,6 +762,25 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } +int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, + struct sk_buff *skb) +{ + int ret; + + __skb_pull(skb, skb->mac_len); + skb_set_redirected(skb, false); + __ptr_set_bit(0, &skb); + + ret = ptr_ring_produce(rcpu->queue, skb); + if (ret < 0) + goto trace; + + wake_up_process(rcpu->kthread); +trace: + trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu); + return ret; +} + void __cpu_map_flush(void) { struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); -- cgit v1.2.3 From 2ea5eabaf04a1829383aefe98ac38a2e5ae2d698 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 2 Jul 2021 16:48:24 +0530 Subject: bpf: devmap: Implement devmap prog execution for generic XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This lifts the restriction on running devmap BPF progs in generic redirect mode. To match native XDP behavior, it is invoked right before generic_xdp_tx is called, and only supports XDP_PASS/XDP_ABORTED/ XDP_DROP actions. We also return 0 even if devmap program drops the packet, as semantically redirect has already succeeded and the devmap prog is the last point before TX of the packet to device where it can deliver a verdict on the packet. This also means it must take care of freeing the skb, as xdp_do_generic_redirect callers only do that in case an error is returned. Since devmap entry prog is supported, remove the check in generic_xdp_install entirely. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210702111825.491065-5-memxor@gmail.com --- kernel/bpf/devmap.c | 49 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2546dafd6672..fa26eac5e4b6 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -322,16 +322,6 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -bool dev_map_can_have_prog(struct bpf_map *map) -{ - if ((map->map_type == BPF_MAP_TYPE_DEVMAP || - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && - map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) - return true; - - return false; -} - static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, struct net_device *dev) @@ -499,6 +489,37 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, return 0; } +static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst) +{ + struct xdp_txq_info txq = { .dev = dst->dev }; + struct xdp_buff xdp; + u32 act; + + if (!dst->xdp_prog) + return XDP_PASS; + + __skb_pull(skb, skb->mac_len); + xdp.txq = &txq; + + act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog); + switch (act) { + case XDP_PASS: + __skb_push(skb, skb->mac_len); + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dst->dev, dst->xdp_prog, act); + fallthrough; + case XDP_DROP: + kfree_skb(skb); + break; + } + + return act; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -614,6 +635,14 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; + + /* Redirect has already succeeded semantically at this point, so we just + * return 0 even if packet is dropped. Helper below takes care of + * freeing skb. + */ + if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS) + return 0; + skb->dev = dst->dev; generic_xdp_tx(skb, xdp_prog); -- cgit v1.2.3 From 11e4b63abbe23872b45f325a7c6c8b7f9ff42cad Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 2 Jul 2021 17:06:57 +0200 Subject: printk/console: Check consistent sequence number when handling race in console_unlock() The standard printk() tries to flush the message to the console immediately. It tries to take the console lock. If the lock is already taken then the current owner is responsible for flushing even the new message. There is a small race window between checking whether a new message is available and releasing the console lock. It is solved by re-checking the state after releasing the console lock. If the check is positive then console_unlock() tries to take the lock again and process the new message as well. The commit 996e966640ddea7b535c ("printk: remove logbuf_lock") causes that console_seq is not longer read atomically. As a result, the re-check might be done with an inconsistent 64-bit index. Solve it by using the last sequence number that has been checked under the console lock. In the worst case, it will take the lock again only to realized that the new message has already been proceed. But it was possible even before. The variable next_seq is marked as __maybe_unused to call down compiler warning when CONFIG_PRINTK is not defined. Fixes: commit 996e966640ddea7b535c ("printk: remove logbuf_lock") Reported-by: kernel test robot # unused next_seq warning Cc: stable@vger.kernel.org # 5.13 Signed-off-by: Petr Mladek Acked-by: Sergey Senozhatsky Reviewed-by: John Ogness Link: https://lore.kernel.org/r/20210702150657.26760-1-pmladek@suse.com --- kernel/printk/printk.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 142a58d124d9..6dad7da8f383 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2545,6 +2545,7 @@ void console_unlock(void) bool do_cond_resched, retry; struct printk_info info; struct printk_record r; + u64 __maybe_unused next_seq; if (console_suspended) { up_console_sem(); @@ -2654,8 +2655,10 @@ skip: cond_resched(); } - console_locked = 0; + /* Get consistent value of the next-to-be-used sequence number. */ + next_seq = console_seq; + console_locked = 0; up_console_sem(); /* @@ -2664,7 +2667,7 @@ skip: * there's a new owner and the console_unlock() from them will do the * flush, no worries. */ - retry = prb_read_valid(prb, console_seq, NULL); + retry = prb_read_valid(prb, next_seq, NULL); printk_safe_exit_irqrestore(flags); if (retry && console_trylock()) -- cgit v1.2.3 From b48c7236b13cb5ef1b5fdf744aa8841df0f7b43a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 29 Jun 2021 15:11:44 -0500 Subject: exit/bdflush: Remove the deprecated bdflush system call The bdflush system call has been deprecated for a very long time. Recently Michael Schmitz tested[1] and found that the last known caller of of the bdflush system call is unaffected by it's removal. Since the code is not needed delete it. [1] https://lkml.kernel.org/r/36123b5d-daa0-6c2b-f2d4-a942f069fd54@gmail.com Link: https://lkml.kernel.org/r/87sg10quue.fsf_-_@disp2133 Tested-by: Michael Schmitz Acked-by: Geert Uytterhoeven Reviewed-by: Arnd Bergmann Acked-by: Cyril Hrubis Signed-off-by: "Eric W. Biederman" --- kernel/sys_ni.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 30971b1dd4a9..cb6f98f5c97a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -416,7 +416,6 @@ COND_SYSCALL(epoll_wait); COND_SYSCALL(recv); COND_SYSCALL_COMPAT(recv); COND_SYSCALL(send); -COND_SYSCALL(bdflush); COND_SYSCALL(uselib); /* optional: time32 */ -- cgit v1.2.3 From 0a65579cdd28bed3e84e1b4929c3080da4f06d79 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:32 +0800 Subject: swiotlb: Refactor swiotlb init functions Add a new function, swiotlb_init_io_tlb_mem, for the io_tlb_mem struct initialization to make the code reusable. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Acked-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index e50df8d8f87e..414db5fc8de9 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -168,9 +168,28 @@ void __init swiotlb_update_mem_attributes(void) memset(vaddr, 0, bytes); } -int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, + unsigned long nslabs, bool late_alloc) { + void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; + + mem->nslabs = nslabs; + mem->start = start; + mem->end = mem->start + bytes; + mem->index = 0; + mem->late_alloc = late_alloc; + spin_lock_init(&mem->lock); + for (i = 0; i < mem->nslabs; i++) { + mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; + } + memset(vaddr, 0, bytes); +} + +int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +{ struct io_tlb_mem *mem; size_t alloc_size; @@ -186,16 +205,8 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) if (!mem) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - mem->nslabs = nslabs; - mem->start = __pa(tlb); - mem->end = mem->start + bytes; - mem->index = 0; - spin_lock_init(&mem->lock); - for (i = 0; i < mem->nslabs; i++) { - mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); - mem->slots[i].orig_addr = INVALID_PHYS_ADDR; - mem->slots[i].alloc_size = 0; - } + + swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); io_tlb_default_mem = mem; if (verbose) @@ -282,8 +293,8 @@ swiotlb_late_init_with_default_size(size_t default_size) int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { - unsigned long bytes = nslabs << IO_TLB_SHIFT, i; struct io_tlb_mem *mem; + unsigned long bytes = nslabs << IO_TLB_SHIFT; if (swiotlb_force == SWIOTLB_NO_FORCE) return 0; @@ -297,20 +308,9 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!mem) return -ENOMEM; - mem->nslabs = nslabs; - mem->start = virt_to_phys(tlb); - mem->end = mem->start + bytes; - mem->index = 0; - mem->late_alloc = 1; - spin_lock_init(&mem->lock); - for (i = 0; i < mem->nslabs; i++) { - mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); - mem->slots[i].orig_addr = INVALID_PHYS_ADDR; - mem->slots[i].alloc_size = 0; - } - + memset(mem, 0, sizeof(*mem)); set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); - memset(tlb, 0, bytes); + swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true); io_tlb_default_mem = mem; swiotlb_print_info(); -- cgit v1.2.3 From 6e675a1c455ea7579c7eaf1a38fe64267039d6fe Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:33 +0800 Subject: swiotlb: Refactor swiotlb_create_debugfs Split the debugfs creation to make the code reusable for supporting different bounce buffer pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 414db5fc8de9..ae6a151d0a41 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -669,19 +669,26 @@ bool is_swiotlb_active(void) EXPORT_SYMBOL_GPL(is_swiotlb_active); #ifdef CONFIG_DEBUG_FS +static struct dentry *debugfs_dir; -static int __init swiotlb_create_debugfs(void) +static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem) { - struct io_tlb_mem *mem = io_tlb_default_mem; - - if (!mem) - return 0; - mem->debugfs = debugfs_create_dir("swiotlb", NULL); debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); +} + +static int __init swiotlb_create_default_debugfs(void) +{ + struct io_tlb_mem *mem = io_tlb_default_mem; + + debugfs_dir = debugfs_create_dir("swiotlb", NULL); + if (mem) { + mem->debugfs = debugfs_dir; + swiotlb_create_debugfs_files(mem); + } return 0; } -late_initcall(swiotlb_create_debugfs); +late_initcall(swiotlb_create_default_debugfs); #endif -- cgit v1.2.3 From 69031f500865ee3eee19566a1b9c40a189817eaa Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:34 +0800 Subject: swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used Always have the pointer to the swiotlb pool used in struct device. This could help simplify the code for other pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index ae6a151d0a41..33d413beddd4 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -348,7 +348,7 @@ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; @@ -429,7 +429,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) static int find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; @@ -506,7 +506,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int i; int index; @@ -557,7 +557,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, size_t mapping_size, enum dma_data_direction dir, unsigned long attrs) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = hwdev->dma_io_tlb_mem; unsigned long flags; unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; -- cgit v1.2.3 From 7fd856aa7f4261ddac62ea59d8383fef22a0690e Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:35 +0800 Subject: swiotlb: Update is_swiotlb_buffer to add a struct device argument Update is_swiotlb_buffer to add a struct device argument. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 6 +++--- kernel/dma/direct.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f737e3347059..84c9feb5474a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -343,7 +343,7 @@ void dma_direct_sync_sg_for_device(struct device *dev, for_each_sg(sgl, sg, nents, i) { phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_device(dev, paddr, sg->length, dir); @@ -369,7 +369,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(paddr, sg->length, dir); - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); @@ -504,7 +504,7 @@ size_t dma_direct_max_mapping_size(struct device *dev) bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) { return !dev_is_dma_coherent(dev) || - is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); + is_swiotlb_buffer(dev, dma_to_phys(dev, dma_addr)); } /** diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 50afc05b6f1d..13e9e7158d94 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -56,7 +56,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev, { phys_addr_t paddr = dma_to_phys(dev, addr); - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_device(dev, paddr, size, dir); if (!dev_is_dma_coherent(dev)) @@ -73,7 +73,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, arch_sync_dma_for_cpu_all(); } - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_cpu(dev, paddr, size, dir); if (dir == DMA_FROM_DEVICE) @@ -113,7 +113,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); - if (unlikely(is_swiotlb_buffer(phys))) + if (unlikely(is_swiotlb_buffer(dev, phys))) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } #endif /* _KERNEL_DMA_DIRECT_H */ -- cgit v1.2.3 From 6f2beb268a5d35504a636c4a3b7aaa76ec32d96c Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:36 +0800 Subject: swiotlb: Update is_swiotlb_active to add a struct device argument Update is_swiotlb_active to add a struct device argument. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 2 +- kernel/dma/swiotlb.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 84c9feb5474a..7a88c34d0867 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -495,7 +495,7 @@ int dma_direct_supported(struct device *dev, u64 mask) size_t dma_direct_max_mapping_size(struct device *dev) { /* If SWIOTLB is active, use its maximum mapping size */ - if (is_swiotlb_active() && + if (is_swiotlb_active(dev) && (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE)) return swiotlb_max_mapping_size(dev); return SIZE_MAX; diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 33d413beddd4..d8677d6637dd 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -662,9 +662,9 @@ size_t swiotlb_max_mapping_size(struct device *dev) return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; } -bool is_swiotlb_active(void) +bool is_swiotlb_active(struct device *dev) { - return io_tlb_default_mem != NULL; + return dev->dma_io_tlb_mem != NULL; } EXPORT_SYMBOL_GPL(is_swiotlb_active); -- cgit v1.2.3 From 903cd0f315fe426c6a64c54ed389de0becb663dc Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 24 Jun 2021 23:55:20 +0800 Subject: swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing Propagate the swiotlb_force into io_tlb_default_mem->force_bounce and use it to determine whether to bounce the data or not. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk [v2: Includes Will's fix] --- kernel/dma/direct.c | 2 +- kernel/dma/direct.h | 2 +- kernel/dma/swiotlb.c | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 7a88c34d0867..a92465b4eb12 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -496,7 +496,7 @@ size_t dma_direct_max_mapping_size(struct device *dev) { /* If SWIOTLB is active, use its maximum mapping size */ if (is_swiotlb_active(dev) && - (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE)) + (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev))) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 13e9e7158d94..4632b0f4f72e 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -87,7 +87,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + if (is_swiotlb_force_bounce(dev)) return swiotlb_map(dev, phys, size, dir, attrs); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d8677d6637dd..04319dd22d28 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -179,6 +179,10 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->end = mem->start + bytes; mem->index = 0; mem->late_alloc = late_alloc; + + if (swiotlb_force == SWIOTLB_FORCE) + mem->force_bounce = true; + spin_lock_init(&mem->lock); for (i = 0; i < mem->nslabs; i++) { mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); -- cgit v1.2.3 From 36f7b2f3ca5f0bee00abf9ea52748ce0644a21c6 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 24 Jun 2021 23:55:21 +0800 Subject: swiotlb: Move alloc_size to swiotlb_find_slots Rename find_slots to swiotlb_find_slots and move the maintenance of alloc_size to it for better code reusability later. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 04319dd22d28..0116a630dc13 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -430,8 +430,8 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. */ -static int find_slots(struct device *dev, phys_addr_t orig_addr, - size_t alloc_size) +static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); @@ -442,6 +442,7 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr, dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); unsigned int nslots = nr_slots(alloc_size), stride; unsigned int index, wrap, count = 0, i; + unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned long flags; BUG_ON(!nslots); @@ -486,8 +487,11 @@ not_found: return -1; found: - for (i = index; i < index + nslots; i++) + for (i = index; i < index + nslots; i++) { mem->slots[i].list = 0; + mem->slots[i].alloc_size = + alloc_size - (offset + ((i - index) << IO_TLB_SHIFT)); + } for (i = index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) @@ -528,7 +532,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } - index = find_slots(dev, orig_addr, alloc_size + offset); + index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, @@ -542,11 +546,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nr_slots(alloc_size + offset); i++) { + for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); - mem->slots[index + i].alloc_size = - alloc_size - (i << IO_TLB_SHIFT); - } tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) -- cgit v1.2.3 From 70347877231eeb505a27abe80af7ae3d3a281519 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:39 +0800 Subject: swiotlb: Refactor swiotlb_tbl_unmap_single Add a new function, swiotlb_release_slots, to make the code reusable for supporting different bounce buffer pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0116a630dc13..23b6df3a6ab7 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -555,27 +555,15 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return tlb_addr; } -/* - * tlb_addr is the physical address of the bounce buffer to unmap. - */ -void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t mapping_size, enum dma_data_direction dir, - unsigned long attrs) +static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) { - struct io_tlb_mem *mem = hwdev->dma_io_tlb_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned long flags; - unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); + unsigned int offset = swiotlb_align_offset(dev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; int nslots = nr_slots(mem->slots[index].alloc_size + offset); int count, i; - /* - * First, sync the memory before unmapping the entry - */ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE); - /* * Return the buffer to the free list by setting the corresponding * entries to indicate the number of contiguous entries available. @@ -610,6 +598,23 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, spin_unlock_irqrestore(&mem->lock, flags); } +/* + * tlb_addr is the physical address of the bounce buffer to unmap. + */ +void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, + size_t mapping_size, enum dma_data_direction dir, + unsigned long attrs) +{ + /* + * First, sync the memory before unmapping the entry + */ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE); + + swiotlb_release_slots(dev, tlb_addr); +} + void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { -- cgit v1.2.3 From f4111e39a52aa5d5136d890bbd1aa87c1c8fe3bc Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:40 +0800 Subject: swiotlb: Add restricted DMA alloc/free support Add the functions, swiotlb_{alloc,free} and is_swiotlb_for_alloc to support the memory allocation from restricted DMA pool. The restricted DMA pool is preferred if available. Note that since coherent allocation needs remapping, one must set up another device coherent pool by shared-dma-pool and use dma_alloc_from_dev_coherent instead for atomic coherent allocation. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 49 +++++++++++++++++++++++++++++++++++++------------ kernel/dma/swiotlb.c | 38 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 73 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a92465b4eb12..2de33e5d302b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -75,6 +75,15 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } +static void __dma_direct_free_pages(struct device *dev, struct page *page, + size_t size) +{ + if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && + swiotlb_free(dev, page, size)) + return; + dma_free_contiguous(dev, page, size); +} + static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp) { @@ -86,6 +95,16 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); + if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && + is_swiotlb_for_alloc(dev)) { + page = swiotlb_alloc(dev, size); + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + __dma_direct_free_pages(dev, page, size); + return NULL; + } + return page; + } + page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); @@ -142,7 +161,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, gfp |= __GFP_NOWARN; if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) { + !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) { page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); if (!page) return NULL; @@ -155,18 +174,23 @@ void *dma_direct_alloc(struct device *dev, size_t size, } if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev)) + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) && + !is_swiotlb_for_alloc(dev)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); /* * Remapping or decrypting memory may block. If either is required and * we can't block, allocate the memory from the atomic pools. + * If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must + * set up another device coherent pool by shared-dma-pool and use + * dma_alloc_from_dev_coherent instead. */ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && !gfpflags_allow_blocking(gfp) && (force_dma_unencrypted(dev) || - (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev)))) + (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !dev_is_dma_coherent(dev))) && + !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ @@ -237,7 +261,7 @@ out_encrypt_pages: return NULL; } out_free_pages: - dma_free_contiguous(dev, page, size); + __dma_direct_free_pages(dev, page, size); return NULL; } @@ -247,15 +271,15 @@ void dma_direct_free(struct device *dev, size_t size, unsigned int page_order = get_order(size); if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) { + !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ dma_free_contiguous(dev, cpu_addr, size); return; } if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev)) { + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) && + !is_swiotlb_for_alloc(dev)) { arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); return; } @@ -273,7 +297,7 @@ void dma_direct_free(struct device *dev, size_t size, else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) arch_dma_clear_uncached(cpu_addr, size); - dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); + __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } struct page *dma_direct_alloc_pages(struct device *dev, size_t size, @@ -283,7 +307,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, void *ret; if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && - force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp)) + force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && + !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp); @@ -310,7 +335,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; out_free_pages: - dma_free_contiguous(dev, page, size); + __dma_direct_free_pages(dev, page, size); return NULL; } @@ -329,7 +354,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)vaddr, 1 << page_order); - dma_free_contiguous(dev, page, size); + __dma_direct_free_pages(dev, page, size); } #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 23b6df3a6ab7..44fc3d10f017 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -462,8 +462,9 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, index = wrap = wrap_index(mem, ALIGN(mem->index, stride)); do { - if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != - (orig_addr & iotlb_align_mask)) { + if (orig_addr && + (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != + (orig_addr & iotlb_align_mask)) { index = wrap_index(mem, index + 1); continue; } @@ -702,3 +703,36 @@ static int __init swiotlb_create_default_debugfs(void) late_initcall(swiotlb_create_default_debugfs); #endif + +#ifdef CONFIG_DMA_RESTRICTED_POOL +struct page *swiotlb_alloc(struct device *dev, size_t size) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + phys_addr_t tlb_addr; + int index; + + if (!mem) + return NULL; + + index = swiotlb_find_slots(dev, 0, size); + if (index == -1) + return NULL; + + tlb_addr = slot_addr(mem->start, index); + + return pfn_to_page(PFN_DOWN(tlb_addr)); +} + +bool swiotlb_free(struct device *dev, struct page *page, size_t size) +{ + phys_addr_t tlb_addr = page_to_phys(page); + + if (!is_swiotlb_buffer(dev, tlb_addr)) + return false; + + swiotlb_release_slots(dev, tlb_addr); + + return true; +} + +#endif /* CONFIG_DMA_RESTRICTED_POOL */ -- cgit v1.2.3 From 0b84e4f8b793eb4045fd64f6f514165a7974cd16 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:41 +0800 Subject: swiotlb: Add restricted DMA pool initialization Add the initialization function to create restricted DMA pools from matching reserved-memory nodes. Regardless of swiotlb setting, the restricted DMA pool is preferred if available. The restricted DMA pools provide a basic level of protection against the DMA overwriting buffer contents at unexpected times. However, to protect against general data leakage and system memory corruption, the system needs to provide a way to lock down the memory access, e.g., MPU. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/Kconfig | 14 ++++++++++ kernel/dma/swiotlb.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 77b405508743..3e961dc39634 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -80,6 +80,20 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +config DMA_RESTRICTED_POOL + bool "DMA Restricted Pool" + depends on OF && OF_RESERVED_MEM + select SWIOTLB + help + This enables support for restricted DMA pools which provide a level of + DMA memory protection on systems with limited hardware protection + capabilities, such as those lacking an IOMMU. + + For more information see + + and . + If unsure, say "n". + # # Should be selected if we can mmap non-coherent mappings to userspace. # The only thing that is really required is a way to set an uncached bit diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 44fc3d10f017..0ffbaae9fba2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -39,6 +39,13 @@ #ifdef CONFIG_DEBUG_FS #include #endif +#ifdef CONFIG_DMA_RESTRICTED_POOL +#include +#include +#include +#include +#include +#endif #include #include @@ -735,4 +742,73 @@ bool swiotlb_free(struct device *dev, struct page *page, size_t size) return true; } +static int rmem_swiotlb_device_init(struct reserved_mem *rmem, + struct device *dev) +{ + struct io_tlb_mem *mem = rmem->priv; + unsigned long nslabs = rmem->size >> IO_TLB_SHIFT; + + /* + * Since multiple devices can share the same pool, the private data, + * io_tlb_mem struct, will be initialized by the first device attached + * to it. + */ + if (!mem) { + mem = kzalloc(struct_size(mem, slots, nslabs), GFP_KERNEL); + if (!mem) + return -ENOMEM; + + set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), + rmem->size >> PAGE_SHIFT); + swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false); + mem->force_bounce = true; + mem->for_alloc = true; + + rmem->priv = mem; + + if (IS_ENABLED(CONFIG_DEBUG_FS)) { + mem->debugfs = + debugfs_create_dir(rmem->name, debugfs_dir); + swiotlb_create_debugfs_files(mem); + } + } + + dev->dma_io_tlb_mem = mem; + + return 0; +} + +static void rmem_swiotlb_device_release(struct reserved_mem *rmem, + struct device *dev) +{ + dev->dma_io_tlb_mem = io_tlb_default_mem; +} + +static const struct reserved_mem_ops rmem_swiotlb_ops = { + .device_init = rmem_swiotlb_device_init, + .device_release = rmem_swiotlb_device_release, +}; + +static int __init rmem_swiotlb_setup(struct reserved_mem *rmem) +{ + unsigned long node = rmem->fdt_node; + + if (of_get_flat_dt_prop(node, "reusable", NULL) || + of_get_flat_dt_prop(node, "linux,cma-default", NULL) || + of_get_flat_dt_prop(node, "linux,dma-default", NULL) || + of_get_flat_dt_prop(node, "no-map", NULL)) + return -EINVAL; + + if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) { + pr_err("Restricted DMA pool must be accessible within the linear mapping."); + return -EINVAL; + } + + rmem->ops = &rmem_swiotlb_ops; + pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + return 0; +} + +RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup); #endif /* CONFIG_DMA_RESTRICTED_POOL */ -- cgit v1.2.3 From 09a4a79d42ced8ca7fc250b05e45ac1cb23a9c52 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 1 Jul 2021 11:31:30 +0800 Subject: swiotlb: fix implicit debugfs declarations Factor out the debugfs bits from rmem_swiotlb_device_init() into a separate rmem_swiotlb_debugfs_init() to fix the implicit debugfs declarations. Fixes: 461021875c50 ("swiotlb: Add restricted DMA pool initialization") Reported-by: kernel test robot Signed-off-by: Claire Chang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0ffbaae9fba2..b7f76bca89bf 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -712,6 +712,21 @@ late_initcall(swiotlb_create_default_debugfs); #endif #ifdef CONFIG_DMA_RESTRICTED_POOL + +#ifdef CONFIG_DEBUG_FS +static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) +{ + struct io_tlb_mem *mem = rmem->priv; + + mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir); + swiotlb_create_debugfs_files(mem); +} +#else +static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) +{ +} +#endif + struct page *swiotlb_alloc(struct device *dev, size_t size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; @@ -766,11 +781,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, rmem->priv = mem; - if (IS_ENABLED(CONFIG_DEBUG_FS)) { - mem->debugfs = - debugfs_create_dir(rmem->name, debugfs_dir); - swiotlb_create_debugfs_files(mem); - } + rmem_swiotlb_debugfs_init(rmem); } dev->dma_io_tlb_mem = mem; -- cgit v1.2.3 From 868c9ddc182bc6728bb380cbfb3170734f72c599 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Wed, 7 Jul 2021 14:12:54 +0900 Subject: swiotlb: add overflow checks to swiotlb_bounce This is a follow-up on 5f89468e2f06 ("swiotlb: manipulate orig_addr when tlb_addr has offset") which fixed unaligned dma mappings, making sure the following overflows are caught: - offset of the start of the slot within the device bigger than requested address' offset, in other words if the base address given in swiotlb_tbl_map_single to create the mapping (orig_addr) was after the requested address for the sync (tlb_offset) in the same block: |------------------------------------------| block <----------------------------> mapped part of the block ^ orig_addr ^ invalid tlb_addr for sync - if the resulting offset was bigger than the allocation size this one could happen if the mapping was not until the end. e.g. |------------------------------------------| block <---------------------> mapped part of the block ^ ^ orig_addr invalid tlb_addr Both should never happen so print a warning and bail out without trying to adjust the sizes/offsets: the first one could try to sync from orig_addr to whatever is left of the requested size, but the later really has nothing to sync there... Signed-off-by: Dominique Martinet Cc: Konrad Rzeszutek Wilk Reviewed-by: Bumyong Lee Cc: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b7f76bca89bf..f1a9ae7fad8f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -365,13 +365,27 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); - unsigned int tlb_offset; + unsigned int tlb_offset, orig_addr_offset; if (orig_addr == INVALID_PHYS_ADDR) return; - tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) - - swiotlb_align_offset(dev, orig_addr); + tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); + orig_addr_offset = swiotlb_align_offset(dev, orig_addr); + if (tlb_offset < orig_addr_offset) { + dev_WARN_ONCE(dev, 1, + "Access before mapping start detected. orig offset %u, requested offset %u.\n", + orig_addr_offset, tlb_offset); + return; + } + + tlb_offset -= orig_addr_offset; + if (tlb_offset > alloc_size) { + dev_WARN_ONCE(dev, 1, + "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n", + alloc_size, size, tlb_offset); + return; + } orig_addr += tlb_offset; alloc_size -= tlb_offset; -- cgit v1.2.3 From 75f0fc7b48ad45a2e5736bcf8de26c8872fe8695 Mon Sep 17 00:00:00 2001 From: He Fengqing Date: Wed, 14 Jul 2021 10:18:15 +0000 Subject: bpf: Fix potential memleak and UAF in the verifier. In bpf_patch_insn_data(), we first use the bpf_patch_insn_single() to insert new instructions, then use adjust_insn_aux_data() to adjust insn_aux_data. If the old env->prog have no enough room for new inserted instructions, we use bpf_prog_realloc to construct new_prog and free the old env->prog. There have two errors here. First, if adjust_insn_aux_data() return ENOMEM, we should free the new_prog. Second, if adjust_insn_aux_data() return ENOMEM, bpf_patch_insn_data() will return NULL, and env->prog has been freed in bpf_prog_realloc, but we will use it in bpf_check(). So in this patch, we make the adjust_insn_aux_data() never fails. In bpf_patch_insn_data(), we first pre-malloc memory for the new insn_aux_data, then call bpf_patch_insn_single() to insert new instructions, at last call adjust_insn_aux_data() to adjust insn_aux_data. Fixes: 8041902dae52 ("bpf: adjust insn_aux_data when patching insns") Signed-off-by: He Fengqing Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210714101815.164322-1-hefengqing@huawei.com --- kernel/bpf/verifier.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index be38bb930bf1..3dbb3b40b754 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11425,10 +11425,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying * [0, off) and [off, end) to new locations, so the patched range stays zero */ -static int adjust_insn_aux_data(struct bpf_verifier_env *env, - struct bpf_prog *new_prog, u32 off, u32 cnt) +static void adjust_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_insn_aux_data *new_data, + struct bpf_prog *new_prog, u32 off, u32 cnt) { - struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + struct bpf_insn_aux_data *old_data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; u32 old_seen = old_data[off].seen; u32 prog_len; @@ -11441,12 +11442,9 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); if (cnt == 1) - return 0; + return; prog_len = new_prog->len; - new_data = vzalloc(array_size(prog_len, - sizeof(struct bpf_insn_aux_data))); - if (!new_data) - return -ENOMEM; + memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); @@ -11457,7 +11455,6 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, } env->insn_aux_data = new_data; vfree(old_data); - return 0; } static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) @@ -11492,6 +11489,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of const struct bpf_insn *patch, u32 len) { struct bpf_prog *new_prog; + struct bpf_insn_aux_data *new_data = NULL; + + if (len > 1) { + new_data = vzalloc(array_size(env->prog->len + len - 1, + sizeof(struct bpf_insn_aux_data))); + if (!new_data) + return NULL; + } new_prog = bpf_patch_insn_single(env->prog, off, patch, len); if (IS_ERR(new_prog)) { @@ -11499,10 +11504,10 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of verbose(env, "insn %d cannot be patched due to 16-bit range\n", env->insn_aux_data[off].orig_idx); + vfree(new_data); return NULL; } - if (adjust_insn_aux_data(env, new_prog, off, len)) - return NULL; + adjust_insn_aux_data(env, new_data, new_prog, off, len); adjust_subprog_starts(env, off, len); adjust_poke_descs(new_prog, off, len); return new_prog; -- cgit v1.2.3 From d809e134be7a1fdd9f5b99ab3291c6da5c0b8240 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:07 -0700 Subject: bpf: Prepare bpf_prog_put() to be called from irq context. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently bpf_prog_put() is called from the task context only. With addition of bpf timers the timer related helpers will start calling bpf_prog_put() from irq-saved region and in rare cases might drop the refcnt to zero. To address this case, first, convert bpf_prog_free_id() to be irq-save (this is similar to bpf_map_free_id), and, second, defer non irq appropriate calls into work queue. For example: bpf_audit_prog() is calling kmalloc and wake_up_interruptible, bpf_prog_kallsyms_del_all()->bpf_ksym_del()->spin_unlock_bh(). They are not safe with irqs disabled. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-2-alexei.starovoitov@gmail.com --- kernel/bpf/syscall.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e343f158e556..5d1fee634be8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1699,6 +1699,8 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { + unsigned long flags; + /* cBPF to eBPF migrations are currently not in the idr store. * Offloaded programs are removed from the store when their device * disappears - even if someone grabs an fd to them they are unusable, @@ -1708,7 +1710,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) return; if (do_idr_lock) - spin_lock_bh(&prog_idr_lock); + spin_lock_irqsave(&prog_idr_lock, flags); else __acquire(&prog_idr_lock); @@ -1716,7 +1718,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) prog->aux->id = 0; if (do_idr_lock) - spin_unlock_bh(&prog_idr_lock); + spin_unlock_irqrestore(&prog_idr_lock, flags); else __release(&prog_idr_lock); } @@ -1752,14 +1754,32 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) } } +static void bpf_prog_put_deferred(struct work_struct *work) +{ + struct bpf_prog_aux *aux; + struct bpf_prog *prog; + + aux = container_of(work, struct bpf_prog_aux, work); + prog = aux->prog; + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); + __bpf_prog_put_noref(prog, true); +} + static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { - if (atomic64_dec_and_test(&prog->aux->refcnt)) { - perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); - bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); + struct bpf_prog_aux *aux = prog->aux; + + if (atomic64_dec_and_test(&aux->refcnt)) { /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); - __bpf_prog_put_noref(prog, true); + + if (in_irq() || irqs_disabled()) { + INIT_WORK(&aux->work, bpf_prog_put_deferred); + schedule_work(&aux->work); + } else { + bpf_prog_put_deferred(&aux->work); + } } } -- cgit v1.2.3 From c1b3fed319d32a721d4b9c17afaeb430444ff773 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:08 -0700 Subject: bpf: Factor out bpf_spin_lock into helpers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move ____bpf_spin_lock/unlock into helpers to make it more clear that quadruple underscore bpf_spin_lock/unlock are irqsave/restore variants. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-3-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 62cf00383910..38be3cfc2f58 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -289,13 +289,18 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) static DEFINE_PER_CPU(unsigned long, irqsave_flags); -notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock) { unsigned long flags; local_irq_save(flags); __bpf_spin_lock(lock); __this_cpu_write(irqsave_flags, flags); +} + +notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +{ + __bpf_spin_lock_irqsave(lock); return 0; } @@ -306,13 +311,18 @@ const struct bpf_func_proto bpf_spin_lock_proto = { .arg1_type = ARG_PTR_TO_SPIN_LOCK, }; -notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) { unsigned long flags; flags = __this_cpu_read(irqsave_flags); __bpf_spin_unlock(lock); local_irq_restore(flags); +} + +notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +{ + __bpf_spin_unlock_irqrestore(lock); return 0; } @@ -333,9 +343,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, else lock = dst + map->spin_lock_off; preempt_disable(); - ____bpf_spin_lock(lock); + __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); - ____bpf_spin_unlock(lock); + __bpf_spin_unlock_irqrestore(lock); preempt_enable(); } -- cgit v1.2.3 From b00628b1c7d595ae5b544e059c27b1f5828314b4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:09 -0700 Subject: bpf: Introduce bpf timers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded in hash/array/lru maps as a regular field and helpers to operate on it: // Initialize the timer. // First 4 bits of 'flags' specify clockid. // Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, int flags); // Configure the timer to call 'callback_fn' static function. long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn); // Arm the timer to expire 'nsec' nanoseconds from the current time. long bpf_timer_start(struct bpf_timer *timer, u64 nsec, u64 flags); // Cancel the timer and wait for callback_fn to finish if it was running. long bpf_timer_cancel(struct bpf_timer *timer); Here is how BPF program might look like: struct map_elem { int counter; struct bpf_timer timer; }; struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 1000); __type(key, int); __type(value, struct map_elem); } hmap SEC(".maps"); static int timer_cb(void *map, int *key, struct map_elem *val); /* val points to particular map element that contains bpf_timer. */ SEC("fentry/bpf_fentry_test1") int BPF_PROG(test1, int a) { struct map_elem *val; int key = 0; val = bpf_map_lookup_elem(&hmap, &key); if (val) { bpf_timer_init(&val->timer, &hmap, CLOCK_REALTIME); bpf_timer_set_callback(&val->timer, timer_cb); bpf_timer_start(&val->timer, 1000 /* call timer_cb2 in 1 usec */, 0); } } This patch adds helper implementations that rely on hrtimers to call bpf functions as timers expire. The following patches add necessary safety checks. Only programs with CAP_BPF are allowed to use bpf_timer. The amount of timers used by the program is constrained by the memcg recorded at map creation time. The bpf_timer_init() helper needs explicit 'map' argument because inner maps are dynamic and not known at load time. While the bpf_timer_set_callback() is receiving hidden 'aux->prog' argument supplied by the verifier. The prog pointer is needed to do refcnting of bpf program to make sure that program doesn't get freed while the timer is armed. This approach relies on "user refcnt" scheme used in prog_array that stores bpf programs for bpf_tail_call. The bpf_timer_set_callback() will increment the prog refcnt which is paired with bpf_timer_cancel() that will drop the prog refcnt. The ops->map_release_uref is responsible for cancelling the timers and dropping prog refcnt when user space reference to a map reaches zero. This uref approach is done to make sure that Ctrl-C of user space process will not leave timers running forever unless the user space explicitly pinned a map that contained timers in bpffs. bpf_timer_init() and bpf_timer_set_callback() will return -EPERM if map doesn't have user references (is not held by open file descriptor from user space and not pinned in bpffs). The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel and free the timer if given map element had it allocated. "bpftool map update" command can be used to cancel timers. The 'struct bpf_timer' is explicitly __attribute__((aligned(8))) because '__u64 :64' has 1 byte alignment of 8 byte padding. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-4-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 324 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 109 ++++++++++++++++ kernel/trace/bpf_trace.c | 2 +- 3 files changed, 434 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 38be3cfc2f58..74b16593983d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -999,6 +999,322 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +/* BPF map elements can contain 'struct bpf_timer'. + * Such map owns all of its BPF timers. + * 'struct bpf_timer' is allocated as part of map element allocation + * and it's zero initialized. + * That space is used to keep 'struct bpf_timer_kern'. + * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and + * remembers 'struct bpf_map *' pointer it's part of. + * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. + * bpf_timer_start() arms the timer. + * If user space reference to a map goes to zero at this point + * ops->map_release_uref callback is responsible for cancelling the timers, + * freeing their memory, and decrementing prog's refcnts. + * bpf_timer_cancel() cancels the timer and decrements prog's refcnt. + * Inner maps can contain bpf timers as well. ops->map_release_uref is + * freeing the timers when inner map is replaced or deleted by user space. + */ +struct bpf_hrtimer { + struct hrtimer timer; + struct bpf_map *map; + struct bpf_prog *prog; + void __rcu *callback_fn; + void *value; +}; + +/* the actual struct hidden inside uapi struct bpf_timer */ +struct bpf_timer_kern { + struct bpf_hrtimer *timer; + /* bpf_spin_lock is used here instead of spinlock_t to make + * sure that it always fits into space resereved by struct bpf_timer + * regardless of LOCKDEP and spinlock debug flags. + */ + struct bpf_spin_lock lock; +} __attribute__((aligned(8))); + +static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); + +static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) +{ + struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); + struct bpf_map *map = t->map; + void *value = t->value; + void *callback_fn; + void *key; + u32 idx; + int ret; + + callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); + if (!callback_fn) + goto out; + + /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and + * cannot be preempted by another bpf_timer_cb() on the same cpu. + * Remember the timer this callback is servicing to prevent + * deadlock if callback_fn() calls bpf_timer_cancel() or + * bpf_map_delete_elem() on the same timer. + */ + this_cpu_write(hrtimer_running, t); + if (map->map_type == BPF_MAP_TYPE_ARRAY) { + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* compute the key */ + idx = ((char *)value - array->value) / array->elem_size; + key = &idx; + } else { /* hash or lru */ + key = value - round_up(map->key_size, 8); + } + + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)key, + (u64)(long)value, 0, 0); + WARN_ON(ret != 0); /* Next patch moves this check into the verifier */ + + this_cpu_write(hrtimer_running, NULL); +out: + return HRTIMER_NORESTART; +} + +BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map, + u64, flags) +{ + clockid_t clockid = flags & (MAX_CLOCKS - 1); + struct bpf_hrtimer *t; + int ret = 0; + + BUILD_BUG_ON(MAX_CLOCKS != 16); + BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer)); + BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer)); + + if (in_nmi()) + return -EOPNOTSUPP; + + if (flags >= MAX_CLOCKS || + /* similar to timerfd except _ALARM variants are not supported */ + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME && + clockid != CLOCK_BOOTTIME)) + return -EINVAL; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (t) { + ret = -EBUSY; + goto out; + } + if (!atomic64_read(&map->usercnt)) { + /* maps with timers must be either held by user space + * or pinned in bpffs. + */ + ret = -EPERM; + goto out; + } + /* allocate hrtimer via map_kmalloc to use memcg accounting */ + t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); + if (!t) { + ret = -ENOMEM; + goto out; + } + t->value = (void *)timer - map->timer_off; + t->map = map; + t->prog = NULL; + rcu_assign_pointer(t->callback_fn, NULL); + hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); + t->timer.function = bpf_timer_cb; + timer->timer = t; +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_init_proto = { + .func = bpf_timer_init, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn, + struct bpf_prog_aux *, aux) +{ + struct bpf_prog *prev, *prog = aux->prog; + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t) { + ret = -EINVAL; + goto out; + } + if (!atomic64_read(&t->map->usercnt)) { + /* maps with timers must be either held by user space + * or pinned in bpffs. Otherwise timer might still be + * running even when bpf prog is detached and user space + * is gone, since map_release_uref won't ever be called. + */ + ret = -EPERM; + goto out; + } + prev = t->prog; + if (prev != prog) { + /* Bump prog refcnt once. Every bpf_timer_set_callback() + * can pick different callback_fn-s within the same prog. + */ + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + goto out; + } + if (prev) + /* Drop prev prog refcnt when swapping with new prog */ + bpf_prog_put(prev); + t->prog = prog; + } + rcu_assign_pointer(t->callback_fn, callback_fn); +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_set_callback_proto = { + .func = bpf_timer_set_callback, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_PTR_TO_FUNC, +}; + +BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags) +{ + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + if (flags) + return -EINVAL; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t || !t->prog) { + ret = -EINVAL; + goto out; + } + hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT); +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +} + +static const struct bpf_func_proto bpf_timer_start_proto = { + .func = bpf_timer_start, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static void drop_prog_refcnt(struct bpf_hrtimer *t) +{ + struct bpf_prog *prog = t->prog; + + if (prog) { + bpf_prog_put(prog); + t->prog = NULL; + rcu_assign_pointer(t->callback_fn, NULL); + } +} + +BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) +{ + struct bpf_hrtimer *t; + int ret = 0; + + if (in_nmi()) + return -EOPNOTSUPP; + __bpf_spin_lock_irqsave(&timer->lock); + t = timer->timer; + if (!t) { + ret = -EINVAL; + goto out; + } + if (this_cpu_read(hrtimer_running) == t) { + /* If bpf callback_fn is trying to bpf_timer_cancel() + * its own timer the hrtimer_cancel() will deadlock + * since it waits for callback_fn to finish + */ + ret = -EDEADLK; + goto out; + } + drop_prog_refcnt(t); +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + /* Cancel the timer and wait for associated callback to finish + * if it was running. + */ + ret = ret ?: hrtimer_cancel(&t->timer); + return ret; +} + +static const struct bpf_func_proto bpf_timer_cancel_proto = { + .func = bpf_timer_cancel, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_TIMER, +}; + +/* This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_timer_cancel_and_free(void *val) +{ + struct bpf_timer_kern *timer = val; + struct bpf_hrtimer *t; + + /* Performance optimization: read timer->timer without lock first. */ + if (!READ_ONCE(timer->timer)) + return; + + __bpf_spin_lock_irqsave(&timer->lock); + /* re-read it under lock */ + t = timer->timer; + if (!t) + goto out; + drop_prog_refcnt(t); + /* The subsequent bpf_timer_start/cancel() helpers won't be able to use + * this timer, since it won't be initialized. + */ + timer->timer = NULL; +out: + __bpf_spin_unlock_irqrestore(&timer->lock); + if (!t) + return; + /* Cancel the timer and wait for callback to complete if it was running. + * If hrtimer_cancel() can be safely called it's safe to call kfree(t) + * right after for both preallocated and non-preallocated maps. + * The timer->timer = NULL was already done and no code path can + * see address 't' anymore. + * + * Check that bpf_map_delete/update_elem() wasn't called from timer + * callback_fn. In such case don't call hrtimer_cancel() (since it will + * deadlock) and don't call hrtimer_try_to_cancel() (since it will just + * return -1). Though callback_fn is still running on this cpu it's + * safe to do kfree(t) because bpf_timer_cb() read everything it needed + * from 't'. The bpf subprog callback_fn won't be able to access 't', + * since timer->timer = NULL was already done. The timer will be + * effectively cancelled because bpf_timer_cb() will return + * HRTIMER_NORESTART. + */ + if (this_cpu_read(hrtimer_running) != t) + hrtimer_cancel(&t->timer); + kfree(t); +} + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -1065,6 +1381,14 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_timer_init: + return &bpf_timer_init_proto; + case BPF_FUNC_timer_set_callback: + return &bpf_timer_set_callback_proto; + case BPF_FUNC_timer_start: + return &bpf_timer_start_proto; + case BPF_FUNC_timer_cancel: + return &bpf_timer_cancel_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3dbb3b40b754..e8645c819803 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4656,6 +4656,38 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, return 0; } +static int process_timer_func(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + bool is_const = tnum_is_const(reg->var_off); + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (!is_const) { + verbose(env, + "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map->btf) { + verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n", + map->name); + return -EINVAL; + } + if (val) { + /* This restriction will be removed in the next patch */ + verbose(env, "bpf_timer field can only be first in the map value element\n"); + return -EINVAL; + } + if (meta->map_ptr) { + verbose(env, "verifier bug. Two map pointers in a timer helper\n"); + return -EFAULT; + } + meta->map_ptr = map; + return 0; +} + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_MEM || @@ -4788,6 +4820,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4819,6 +4852,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_FUNC] = &func_ptr_types, [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, + [ARG_PTR_TO_TIMER] = &timer_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4948,6 +4982,10 @@ skip_type_check: if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ + if (meta->map_ptr && meta->map_ptr != reg->map_ptr) { + verbose(env, "Map pointer doesn't match bpf_timer.\n"); + return -EINVAL; + } meta->map_ptr = reg->map_ptr; } else if (arg_type == ARG_PTR_TO_MAP_KEY) { /* bpf_map_xxx(..., map_ptr, ..., key) call: @@ -5000,6 +5038,9 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_TIMER) { + if (process_timer_func(env, regno, meta)) + return -EACCES; } else if (arg_type == ARG_PTR_TO_FUNC) { meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { @@ -5742,6 +5783,34 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_timer_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr; + + /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn); + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -6069,6 +6138,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (func_id == BPF_FUNC_timer_set_callback) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_timer_callback_state); + if (err < 0) + return -EINVAL; + } + if (func_id == BPF_FUNC_snprintf) { err = check_bpf_snprintf_call(env, regs); if (err < 0) @@ -12591,6 +12667,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env) continue; } + if (insn->imm == BPF_FUNC_timer_set_callback) { + /* The verifier will process callback_fn as many times as necessary + * with different maps and the register states prepared by + * set_timer_callback_state will be accurate. + * + * The following use case is valid: + * map1 is shared by prog1, prog2, prog3. + * prog1 calls bpf_timer_init for some map1 elements + * prog2 calls bpf_timer_set_callback for some map1 elements. + * Those that were not bpf_timer_init-ed will return -EINVAL. + * prog3 calls bpf_timer_start for some map1 elements. + * Those that were not both bpf_timer_init-ed and + * bpf_timer_set_callback-ed will return -EINVAL. + */ + struct bpf_insn ld_addrs[2] = { + BPF_LD_IMM64(BPF_REG_3, (long)prog->aux), + }; + + insn_buf[0] = ld_addrs[0]; + insn_buf[1] = ld_addrs[1]; + insn_buf[2] = *insn; + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 64bd2d84367f..6c77d25137e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1059,7 +1059,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_snprintf: return &bpf_snprintf_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } -- cgit v1.2.3 From 68134668c17f31f51930478f75495b552a411550 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:10 -0700 Subject: bpf: Add map side support for bpf timers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restrict bpf timers to array, hash (both preallocated and kmalloced), and lru map types. The per-cpu maps with timers don't make sense, since 'struct bpf_timer' is a part of map value. bpf timers in per-cpu maps would mean that the number of timers depends on number of possible cpus and timers would not be accessible from all cpus. lpm map support can be added in the future. The timers in inner maps are supported. The bpf_map_update/delete_elem() helpers and sys_bpf commands cancel and free bpf_timer in a given map element. Similar to 'struct bpf_spin_lock' BTF is required and it is used to validate that map element indeed contains 'struct bpf_timer'. Make check_and_init_map_value() init both bpf_spin_lock and bpf_timer when map element data is reused in preallocated htab and lru maps. Teach copy_map_value() to support both bpf_spin_lock and bpf_timer in a single map element. There could be one of each, but not more than one. Due to 'one bpf_timer in one element' restriction do not support timers in global data, since global data is a map of single element, but from bpf program side it's seen as many global variables and restriction of single global timer would be odd. The sys_bpf map_freeze and sys_mmap syscalls are not allowed on maps with timers, since user space could have corrupted mmap element and crashed the kernel. The maps with timers cannot be readonly. Due to these restrictions search for bpf_timer in datasec BTF in case it was placed in the global data to report clear error. The previous patch allowed 'struct bpf_timer' as a first field in a map element only. Relax this restriction. Refactor lru map to s/bpf_lru_push_free/htab_lru_push_free/ to cancel and free the timer when lru map deletes an element as a part of it eviction algorithm. Make sure that bpf program cannot access 'struct bpf_timer' via direct load/store. The timer operation are done through helpers only. This is similar to 'struct bpf_spin_lock'. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-5-alexei.starovoitov@gmail.com --- kernel/bpf/arraymap.c | 21 +++++++++ kernel/bpf/btf.c | 77 +++++++++++++++++++++++++++------ kernel/bpf/hashtab.c | 105 +++++++++++++++++++++++++++++++++++++++------ kernel/bpf/local_storage.c | 4 +- kernel/bpf/map_in_map.c | 2 + kernel/bpf/syscall.c | 21 +++++++-- kernel/bpf/verifier.c | 30 +++++++++++-- 7 files changed, 225 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3c4105603f9d..cebd4fb06d19 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -287,6 +287,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } +static void check_and_free_timer_in_array(struct bpf_array *arr, void *val) +{ + if (unlikely(map_value_has_timer(&arr->map))) + bpf_timer_cancel_and_free(val + arr->map.timer_off); +} + /* Called from syscall or from eBPF program */ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) @@ -321,6 +327,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); + check_and_free_timer_in_array(array, val); } return 0; } @@ -374,6 +381,19 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } +static void array_map_free_timers(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + if (likely(!map_value_has_timer(map))) + return; + + for (i = 0; i < array->map.max_entries; i++) + bpf_timer_cancel_and_free(array->value + array->elem_size * i + + map->timer_off); +} + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { @@ -668,6 +688,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, + .map_release_uref = array_map_free_timers, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cb4b72997d9b..7780131f710e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3046,43 +3046,92 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -/* find 'struct bpf_spin_lock' in map value. - * return >= 0 offset if found - * and < 0 in case of error - */ -int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, + const char *name, int sz, int align) { const struct btf_member *member; u32 i, off = -ENOENT; - if (!__btf_type_is_struct(t)) - return -EINVAL; - for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); if (!__btf_type_is_struct(member_type)) continue; - if (member_type->size != sizeof(struct bpf_spin_lock)) + if (member_type->size != sz) continue; - if (strcmp(__btf_name_by_offset(btf, member_type->name_off), - "bpf_spin_lock")) + if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) continue; if (off != -ENOENT) - /* only one 'struct bpf_spin_lock' is allowed */ + /* only one such field is allowed */ return -E2BIG; off = btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ return -EINVAL; off /= 8; - if (off % __alignof__(struct bpf_spin_lock)) - /* valid struct bpf_spin_lock will be 4 byte aligned */ + if (off % align) + return -EINVAL; + } + return off; +} + +static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, + const char *name, int sz, int align) +{ + const struct btf_var_secinfo *vsi; + u32 i, off = -ENOENT; + + for_each_vsi(i, t, vsi) { + const struct btf_type *var = btf_type_by_id(btf, vsi->type); + const struct btf_type *var_type = btf_type_by_id(btf, var->type); + + if (!__btf_type_is_struct(var_type)) + continue; + if (var_type->size != sz) + continue; + if (vsi->size != sz) + continue; + if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) + continue; + if (off != -ENOENT) + /* only one such field is allowed */ + return -E2BIG; + off = vsi->offset; + if (off % align) return -EINVAL; } return off; } +static int btf_find_field(const struct btf *btf, const struct btf_type *t, + const char *name, int sz, int align) +{ + + if (__btf_type_is_struct(t)) + return btf_find_struct_field(btf, t, name, sz, align); + else if (btf_type_is_datasec(t)) + return btf_find_datasec_var(btf, t, name, sz, align); + return -EINVAL; +} + +/* find 'struct bpf_spin_lock' in map value. + * return >= 0 offset if found + * and < 0 in case of error + */ +int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +{ + return btf_find_field(btf, t, "bpf_spin_lock", + sizeof(struct bpf_spin_lock), + __alignof__(struct bpf_spin_lock)); +} + +int btf_find_timer(const struct btf *btf, const struct btf_type *t) +{ + return btf_find_field(btf, t, "bpf_timer", + sizeof(struct bpf_timer), + __alignof__(struct bpf_timer)); +} + static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct btf_show *show) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 72c58cc516a3..6dc3fae46a56 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -228,6 +228,32 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } +static bool htab_has_extra_elems(struct bpf_htab *htab) +{ + return !htab_is_percpu(htab) && !htab_is_lru(htab); +} + +static void htab_free_prealloced_timers(struct bpf_htab *htab) +{ + u32 num_entries = htab->map.max_entries; + int i; + + if (likely(!map_value_has_timer(&htab->map))) + return; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + for (i = 0; i < num_entries; i++) { + struct htab_elem *elem; + + elem = get_htab_elem(htab, i); + bpf_timer_cancel_and_free(elem->key + + round_up(htab->map.key_size, 8) + + htab->map.timer_off); + cond_resched(); + } +} + static void htab_free_elems(struct bpf_htab *htab) { int i; @@ -265,8 +291,12 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, struct htab_elem *l; if (node) { + u32 key_size = htab->map.key_size; + l = container_of(node, struct htab_elem, lru_node); - memcpy(l->key, key, htab->map.key_size); + memcpy(l->key, key, key_size); + check_and_init_map_value(&htab->map, + l->key + round_up(key_size, 8)); return l; } @@ -278,7 +308,7 @@ static int prealloc_init(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int err = -ENOMEM, i; - if (!htab_is_percpu(htab) && !htab_is_lru(htab)) + if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries, @@ -695,6 +725,14 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } +static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem) +{ + if (unlikely(map_value_has_timer(&htab->map))) + bpf_timer_cancel_and_free(elem->key + + round_up(htab->map.key_size, 8) + + htab->map.timer_off); +} + /* It is called from the bpf_lru_list when the LRU needs to delete * older elements from the htab. */ @@ -719,6 +757,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { hlist_nulls_del_rcu(&l->hash_node); + check_and_free_timer(htab, l); break; } @@ -790,6 +829,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); + check_and_free_timer(htab, l); kfree(l); } @@ -817,6 +857,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { + check_and_free_timer(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); @@ -920,8 +961,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-ENOMEM); goto dec_count; } - check_and_init_map_lock(&htab->map, - l_new->key + round_up(key_size, 8)); + check_and_init_map_value(&htab->map, + l_new->key + round_up(key_size, 8)); } memcpy(l_new->key, key, key_size); @@ -1062,6 +1103,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_del_rcu(&l_old->hash_node); if (!htab_is_prealloc(htab)) free_htab_elem(htab, l_old); + else + check_and_free_timer(htab, l_old); } ret = 0; err: @@ -1069,6 +1112,12 @@ err: return ret; } +static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) +{ + check_and_free_timer(htab, elem); + bpf_lru_push_free(&htab->lru, &elem->lru_node); +} + static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { @@ -1102,7 +1151,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; - memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); + copy_map_value(&htab->map, + l_new->key + round_up(map->key_size, 8), value); ret = htab_lock_bucket(htab, b, hash, &flags); if (ret) @@ -1128,9 +1178,9 @@ err: htab_unlock_bucket(htab, b, hash, flags); if (ret) - bpf_lru_push_free(&htab->lru, &l_new->lru_node); + htab_lru_push_free(htab, l_new); else if (l_old) - bpf_lru_push_free(&htab->lru, &l_old->lru_node); + htab_lru_push_free(htab, l_old); return ret; } @@ -1339,7 +1389,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) htab_unlock_bucket(htab, b, hash, flags); if (l) - bpf_lru_push_free(&htab->lru, &l->lru_node); + htab_lru_push_free(htab, l); return ret; } @@ -1359,6 +1409,35 @@ static void delete_all_elements(struct bpf_htab *htab) } } +static void htab_free_malloced_timers(struct bpf_htab *htab) +{ + int i; + + rcu_read_lock(); + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_nulls_head *head = select_bucket(htab, i); + struct hlist_nulls_node *n; + struct htab_elem *l; + + hlist_nulls_for_each_entry(l, n, head, hash_node) + check_and_free_timer(htab, l); + cond_resched_rcu(); + } + rcu_read_unlock(); +} + +static void htab_map_free_timers(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + if (likely(!map_value_has_timer(&htab->map))) + return; + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers(htab); + else + htab_free_prealloced_timers(htab); +} + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -1456,7 +1535,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, else copy_map_value(map, value, l->key + roundup_key_size); - check_and_init_map_lock(map, value); + check_and_init_map_value(map, value); } hlist_nulls_del_rcu(&l->hash_node); @@ -1467,7 +1546,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, htab_unlock_bucket(htab, b, hash, bflags); if (is_lru_map && l) - bpf_lru_push_free(&htab->lru, &l->lru_node); + htab_lru_push_free(htab, l); return ret; } @@ -1645,7 +1724,7 @@ again_nocopy: true); else copy_map_value(map, dst_val, value); - check_and_init_map_lock(map, dst_val); + check_and_init_map_value(map, dst_val); } if (do_delete) { hlist_nulls_del_rcu(&l->hash_node); @@ -1672,7 +1751,7 @@ again_nocopy: while (node_to_free) { l = node_to_free; node_to_free = node_to_free->batch_flink; - bpf_lru_push_free(&htab->lru, &l->lru_node); + htab_lru_push_free(htab, l); } next_batch: @@ -2034,6 +2113,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, + .map_release_uref = htab_map_free_timers, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2055,6 +2135,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, + .map_release_uref = htab_map_free_timers, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index bd11db9774c3..95d70a08325d 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -173,7 +173,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return -ENOMEM; memcpy(&new->data[0], value, map->value_size); - check_and_init_map_lock(map, new->data); + check_and_init_map_value(map, new->data); new = xchg(&storage->buf, new); kfree_rcu(new, rcu); @@ -509,7 +509,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, map->numa_node); if (!storage->buf) goto enomem; - check_and_init_map_lock(map, storage->buf->data); + check_and_init_map_value(map, storage->buf->data); } else { storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); if (!storage->percpu_buf) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 39ab0b68cade..890dfe14e731 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -50,6 +50,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->spin_lock_off = inner_map->spin_lock_off; + inner_map_meta->timer_off = inner_map->timer_off; /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; @@ -75,6 +76,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && + meta0->timer_off == meta1->timer_off && meta0->map_flags == meta1->map_flags; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5d1fee634be8..9a2068e39d23 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -260,8 +260,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, value, ptr, true); else copy_map_value(map, value, ptr); - /* mask lock, since value wasn't zero inited */ - check_and_init_map_lock(map, value); + /* mask lock and timer, since value wasn't zero inited */ + check_and_init_map_value(map, value); } rcu_read_unlock(); } @@ -623,7 +623,8 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) struct bpf_map *map = filp->private_data; int err; - if (!map->ops->map_mmap || map_value_has_spin_lock(map)) + if (!map->ops->map_mmap || map_value_has_spin_lock(map) || + map_value_has_timer(map)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -793,6 +794,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, } } + map->timer_off = btf_find_timer(btf, value_type); + if (map_value_has_timer(map)) { + if (map->map_flags & BPF_F_RDONLY_PROG) + return -EACCES; + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) + return -EOPNOTSUPP; + } + if (map->ops->map_check_btf) ret = map->ops->map_check_btf(map, btf, key_type, value_type); @@ -844,6 +855,7 @@ static int map_create(union bpf_attr *attr) mutex_init(&map->freeze_mutex); map->spin_lock_off = -EINVAL; + map->timer_off = -EINVAL; if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with @@ -1591,7 +1603,8 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || + map_value_has_timer(map)) { fdput(f); return -ENOTSUPP; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e8645c819803..12b50f46a7c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3241,6 +3241,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } } + if (map_value_has_timer(map)) { + u32 t = map->timer_off; + + if (reg->smin_value + off < t + sizeof(struct bpf_timer) && + t < reg->umax_value + off + size) { + verbose(env, "bpf_timer cannot be accessed directly by load/store\n"); + return -EACCES; + } + } return err; } @@ -4675,9 +4684,24 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (val) { - /* This restriction will be removed in the next patch */ - verbose(env, "bpf_timer field can only be first in the map value element\n"); + if (!map_value_has_timer(map)) { + if (map->timer_off == -E2BIG) + verbose(env, + "map '%s' has more than one 'struct bpf_timer'\n", + map->name); + else if (map->timer_off == -ENOENT) + verbose(env, + "map '%s' doesn't have 'struct bpf_timer'\n", + map->name); + else + verbose(env, + "map '%s' is not a struct type or bpf_timer is mangled\n", + map->name); + return -EINVAL; + } + if (map->timer_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", + val + reg->off, map->timer_off); return -EINVAL; } if (meta->map_ptr) { -- cgit v1.2.3 From 3e8ce29850f1839d0603f925b30be9d8a4329917 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:11 -0700 Subject: bpf: Prevent pointer mismatch in bpf_timer_init. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpf_timer_init() arguments are: 1. pointer to a timer (which is embedded in map element). 2. pointer to a map. Make sure that pointer to a timer actually belongs to that map. Use map_uid (which is unique id of inner map) to reject: inner_map1 = bpf_map_lookup_elem(outer_map, key1) inner_map2 = bpf_map_lookup_elem(outer_map, key2) if (inner_map1 && inner_map2) { timer = bpf_map_lookup_elem(inner_map1); if (timer) // mismatch would have been allowed bpf_timer_init(timer, inner_map2); } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-6-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 12b50f46a7c1..8df2671c3d33 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -255,6 +255,7 @@ struct bpf_call_arg_meta { int mem_size; u64 msize_max_value; int ref_obj_id; + int map_uid; int func_id; struct btf *btf; u32 btf_id; @@ -1135,6 +1136,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) if (map->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; reg->map_ptr = map->inner_map_meta; + /* transfer reg's id which is unique for every map_lookup_elem + * as UID of the inner map. + */ + reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -4708,6 +4713,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, verbose(env, "verifier bug. Two map pointers in a timer helper\n"); return -EFAULT; } + meta->map_uid = reg->map_uid; meta->map_ptr = map; return 0; } @@ -5006,11 +5012,29 @@ skip_type_check: if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ - if (meta->map_ptr && meta->map_ptr != reg->map_ptr) { - verbose(env, "Map pointer doesn't match bpf_timer.\n"); - return -EINVAL; + if (meta->map_ptr) { + /* Use map_uid (which is unique id of inner map) to reject: + * inner_map1 = bpf_map_lookup_elem(outer_map, key1) + * inner_map2 = bpf_map_lookup_elem(outer_map, key2) + * if (inner_map1 && inner_map2) { + * timer = bpf_map_lookup_elem(inner_map1); + * if (timer) + * // mismatch would have been allowed + * bpf_timer_init(timer, inner_map2); + * } + * + * Comparing map_ptr is enough to distinguish normal and outer maps. + */ + if (meta->map_ptr != reg->map_ptr || + meta->map_uid != reg->map_uid) { + verbose(env, + "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", + meta->map_uid, reg->map_uid); + return -EINVAL; + } } meta->map_ptr = reg->map_ptr; + meta->map_uid = reg->map_uid; } else if (arg_type == ARG_PTR_TO_MAP_KEY) { /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within @@ -6204,6 +6228,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; + regs[BPF_REG_0].map_uid = meta.map_uid; if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; if (map_value_has_spin_lock(meta.map_ptr)) -- cgit v1.2.3 From 40ec00abf1cc92268e3e3320b36bbb33b2224808 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:12 -0700 Subject: bpf: Remember BTF of inner maps. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BTF is required for 'struct bpf_timer' to be recognized inside map value. The bpf timers are supported inside inner maps. Remember 'struct btf *' in inner_map_meta to make it available to the verifier in the sequence: struct bpf_map *inner_map = bpf_map_lookup_elem(&outer_map, ...); if (inner_map) timer = bpf_map_lookup_elem(&inner_map, ...); Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-7-alexei.starovoitov@gmail.com --- kernel/bpf/map_in_map.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 890dfe14e731..5cd8f5277279 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -3,6 +3,7 @@ */ #include #include +#include #include "map_in_map.h" @@ -51,6 +52,10 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->spin_lock_off = inner_map->spin_lock_off; inner_map_meta->timer_off = inner_map->timer_off; + if (inner_map->btf) { + btf_get(inner_map->btf); + inner_map_meta->btf = inner_map->btf; + } /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; @@ -66,6 +71,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) void bpf_map_meta_free(struct bpf_map *map_meta) { + btf_put(map_meta->btf); kfree(map_meta); } -- cgit v1.2.3 From 86fc6ee6e246438d394e41bb7cc210b0fe724872 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:13 -0700 Subject: bpf: Relax verifier recursion check. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the following bpf subprogram: static int timer_cb(void *map, void *key, void *value) { bpf_timer_set_callback(.., timer_cb); } the 'timer_cb' is a pointer to a function. ld_imm64 insn is used to carry this pointer. bpf_pseudo_func() returns true for such ld_imm64 insn. Unlike bpf_for_each_map_elem() the bpf_timer_set_callback() is asynchronous. Relax control flow check to allow such "recursion" that is seen as an infinite loop by check_cfg(). The distinction between bpf_for_each_map_elem() the bpf_timer_set_callback() is done in the follow up patch. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-8-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8df2671c3d33..1cb1b35e69b7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9463,8 +9463,12 @@ static int visit_func_call_insn(int t, int insn_cnt, init_explored_state(env, t + 1); if (visit_callee) { init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, - env, false); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, + /* It's ok to allow recursion from CFG point of + * view. __check_func_call() will do the actual + * check. + */ + bpf_pseudo_func(insns + t)); } return ret; } -- cgit v1.2.3 From bfc6bb74e4f16ab264fa73398a7a79d7d2afac2e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:14 -0700 Subject: bpf: Implement verifier support for validation of async callbacks. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpf_for_each_map_elem() and bpf_timer_set_callback() helpers are relying on PTR_TO_FUNC infra in the verifier to validate addresses to subprograms and pass them into the helpers as function callbacks. In case of bpf_for_each_map_elem() the callback is invoked synchronously and the verifier treats it as a normal subprogram call by adding another bpf_func_state and new frame in __check_func_call(). bpf_timer_set_callback() doesn't invoke the callback directly. The subprogram will be called asynchronously from bpf_timer_cb(). Teach the verifier to validate such async callbacks as special kind of jump by pushing verifier state into stack and let pop_stack() process it. Special care needs to be taken during state pruning. The call insn doing bpf_timer_set_callback has to be a prune_point. Otherwise short timer callbacks might not have prune points in front of bpf_timer_set_callback() which means is_state_visited() will be called after this call insn is processed in __check_func_call(). Which means that another async_cb state will be pushed to be walked later and the verifier will eventually hit BPF_COMPLEXITY_LIMIT_JMP_SEQ limit. Since push_async_cb() looks like another push_stack() branch the infinite loop detection will trigger false positive. To recognize this case mark such states as in_async_callback_fn. To distinguish infinite loop in async callback vs the same callback called with different arguments for different map and timer add async_entry_cnt to bpf_func_state. Enforce return zero from async callbacks. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-9-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 8 ++-- kernel/bpf/verifier.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 123 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 74b16593983d..9fe846ec6bd1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1043,7 +1043,6 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) void *callback_fn; void *key; u32 idx; - int ret; callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) @@ -1066,10 +1065,9 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) key = value - round_up(map->key_size, 8); } - ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, - (u64)(long)key, - (u64)(long)value, 0, 0); - WARN_ON(ret != 0); /* Next patch moves this check into the verifier */ + BPF_CAST_CALL(callback_fn)((u64)(long)map, (u64)(long)key, + (u64)(long)value, 0, 0); + /* The verifier checked that return value is zero. */ this_cpu_write(hrtimer_running, NULL); out: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1cb1b35e69b7..ab06256bf6c8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -735,6 +735,10 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (state->refs[i].id) verbose(env, ",%d", state->refs[i].id); } + if (state->in_callback_fn) + verbose(env, " cb"); + if (state->in_async_callback_fn) + verbose(env, " async_cb"); verbose(env, "\n"); } @@ -1527,6 +1531,54 @@ static void init_func_state(struct bpf_verifier_env *env, init_reg_state(env, state); } +/* Similar to push_stack(), but for async callbacks */ +static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx, + int subprog) +{ + struct bpf_verifier_stack_elem *elem; + struct bpf_func_state *frame; + + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + if (!elem) + goto err; + + elem->insn_idx = insn_idx; + elem->prev_insn_idx = prev_insn_idx; + elem->next = env->head; + elem->log_pos = env->log.len_used; + env->head = elem; + env->stack_size++; + if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { + verbose(env, + "The sequence of %d jumps is too complex for async cb.\n", + env->stack_size); + goto err; + } + /* Unlike push_stack() do not copy_verifier_state(). + * The caller state doesn't matter. + * This is async callback. It starts in a fresh stack. + * Initialize it similar to do_check_common(). + */ + elem->st.branches = 1; + frame = kzalloc(sizeof(*frame), GFP_KERNEL); + if (!frame) + goto err; + init_func_state(env, frame, + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno within this callchain */, + subprog /* subprog number within this prog */); + elem->st.frame[0] = frame; + return &elem->st; +err: + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + /* pop all elements and return */ + while (!pop_stack(env, NULL, NULL, false)); + return NULL; +} + + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -5704,6 +5756,30 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } + if (insn->code == (BPF_JMP | BPF_CALL) && + insn->imm == BPF_FUNC_timer_set_callback) { + struct bpf_verifier_state *async_cb; + + /* there is no real recursion here. timer callbacks are async */ + async_cb = push_async_cb(env, env->subprog_info[subprog].start, + *insn_idx, subprog); + if (!async_cb) + return -EFAULT; + callee = async_cb->frame[0]; + callee->async_entry_cnt = caller->async_entry_cnt + 1; + + /* Convert bpf_timer_set_callback() args into timer callback args */ + err = set_callee_state_cb(env, caller, callee, *insn_idx); + if (err) + return err; + + clear_caller_saved_regs(env, caller->regs); + mark_reg_unknown(env, caller->regs, BPF_REG_0); + caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* continue with next insn after call */ + return 0; + } + callee = kzalloc(sizeof(*callee), GFP_KERNEL); if (!callee) return -ENOMEM; @@ -5856,6 +5932,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, /* unused */ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_async_callback_fn = true; return 0; } @@ -9224,7 +9301,8 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; - const bool is_subprog = env->cur_state->frame[0]->subprogno; + struct bpf_func_state *frame = env->cur_state->frame[0]; + const bool is_subprog = frame->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ if (!is_subprog && @@ -9249,6 +9327,22 @@ static int check_return_code(struct bpf_verifier_env *env) } reg = cur_regs(env) + BPF_REG_0; + + if (frame->in_async_callback_fn) { + /* enforce return zero from async callbacks like timer */ + if (reg->type != SCALAR_VALUE) { + verbose(env, "In async callback the register R0 is not a known value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + + if (!tnum_in(tnum_const(0), reg->var_off)) { + verbose_invalid_scalar(env, reg, &range, "async callback", "R0"); + return -EINVAL; + } + return 0; + } + if (is_subprog) { if (reg->type != SCALAR_VALUE) { verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", @@ -9496,6 +9590,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: + if (insns[t].imm == BPF_FUNC_timer_set_callback) + /* Mark this call insn to trigger is_state_visited() check + * before call itself is processed by __check_func_call(). + * Otherwise new async state will be pushed for further + * exploration. + */ + init_explored_state(env, t); return visit_func_call_insn(t, insn_cnt, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); @@ -10503,9 +10604,25 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) states_cnt++; if (sl->state.insn_idx != insn_idx) goto next; + if (sl->state.branches) { - if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur)) { + struct bpf_func_state *frame = sl->state.frame[sl->state.curframe]; + + if (frame->in_async_callback_fn && + frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) { + /* Different async_entry_cnt means that the verifier is + * processing another entry into async callback. + * Seeing the same state is not an indication of infinite + * loop or infinite recursion. + * But finding the same state doesn't mean that it's safe + * to stop processing the current state. The previous state + * hasn't yet reached bpf_exit, since state.branches > 0. + * Checking in_async_callback_fn alone is not enough either. + * Since the verifier still needs to catch infinite loops + * inside async callbacks. + */ + } else if (states_maybe_looping(&sl->state, cur) && + states_equal(env, &sl->state, cur)) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); return -EINVAL; -- cgit v1.2.3 From 7ddc80a476c2d599246028af5808d15f9e24c109 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:15 -0700 Subject: bpf: Teach stack depth check about async callbacks. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Teach max stack depth checking algorithm about async callbacks that don't increase bpf program stack size. Also add sanity check that bpf_tail_call didn't sneak into async cb. It's impossible, since PTR_TO_CTX is not available in async cb, hence the program cannot contain bpf_tail_call(ctx,...); Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-10-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ab06256bf6c8..344ee67265cc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3709,6 +3709,8 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { + int next_insn; + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ @@ -3716,13 +3718,22 @@ continue_func: ret_prog[frame] = idx; /* find the callee */ - i = i + insn[i].imm + 1; - idx = find_subprog(env, i); + next_insn = i + insn[i].imm + 1; + idx = find_subprog(env, next_insn); if (idx < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - i); + next_insn); return -EFAULT; } + if (subprog[idx].is_async_cb) { + if (subprog[idx].has_tail_call) { + verbose(env, "verifier bug. subprog has tail_call and async cb\n"); + return -EFAULT; + } + /* async callbacks don't increase bpf prog stack size */ + continue; + } + i = next_insn; if (subprog[idx].has_tail_call) tail_call_reachable = true; @@ -5761,6 +5772,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn struct bpf_verifier_state *async_cb; /* there is no real recursion here. timer callbacks are async */ + env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, *insn_idx, subprog); if (!async_cb) -- cgit v1.2.3 From 1e37392cccdea94da635e3c6d16b21865806f619 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:54 +0200 Subject: bpf: Enable BPF_TRAMP_F_IP_ARG for trampolines with call_get_func_ip Enabling BPF_TRAMP_F_IP_ARG for trampolines that actually need it. The BPF_TRAMP_F_IP_ARG adds extra 3 instructions to trampoline code and is used only by programs with bpf_get_func_ip helper, which is added in following patch and sets call_get_func_ip bit. This patch ensures that BPF_TRAMP_F_IP_ARG flag is used only for trampolines that have programs with call_get_func_ip set. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210714094400.396467-3-jolsa@kernel.org --- kernel/bpf/trampoline.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 28a3630c48ee..b2535acfe9db 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -172,7 +172,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } static struct bpf_tramp_progs * -bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) +bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { const struct bpf_prog_aux *aux; struct bpf_tramp_progs *tprogs; @@ -189,8 +189,10 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) *total += tr->progs_cnt[kind]; progs = tprogs[kind].progs; - hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) + hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) { + *ip_arg |= aux->prog->call_get_func_ip; *progs++ = aux->prog; + } } return tprogs; } @@ -333,9 +335,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) struct bpf_tramp_image *im; struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; + bool ip_arg = false; int err, total; - tprogs = bpf_trampoline_get_progs(tr, &total); + tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg); if (IS_ERR(tprogs)) return PTR_ERR(tprogs); @@ -357,6 +360,9 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; + if (ip_arg) + flags |= BPF_TRAMP_F_IP_ARG; + err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, &tr->func.model, flags, tprogs, tr->func.addr); -- cgit v1.2.3 From 9b99edcae5c80c8fb9f8e7149bae528c9e610a72 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:55 +0200 Subject: bpf: Add bpf_get_func_ip helper for tracing programs Adding bpf_get_func_ip helper for BPF_PROG_TYPE_TRACING programs, specifically for all trampoline attach types. The trampoline's caller IP address is stored in (ctx - 8) address. so there's no reason to actually call the helper, but rather fixup the call instruction and return [ctx - 8] value directly. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210714094400.396467-4-jolsa@kernel.org --- kernel/bpf/verifier.c | 43 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 15 +++++++++++++++ 2 files changed, 58 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 344ee67265cc..ceef190514e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6161,6 +6161,27 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, return err; } +static int check_get_func_ip(struct bpf_verifier_env *env) +{ + enum bpf_attach_type eatype = env->prog->expected_attach_type; + enum bpf_prog_type type = resolve_prog_type(env->prog); + int func_id = BPF_FUNC_get_func_ip; + + if (type == BPF_PROG_TYPE_TRACING) { + if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT && + eatype != BPF_MODIFY_RETURN) { + verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n", + func_id_name(func_id), func_id); + return -ENOTSUPP; + } + return 0; + } + + verbose(env, "func %s#%d not supported for program type %d\n", + func_id_name(func_id), func_id, type); + return -ENOTSUPP; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -6439,6 +6460,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack) env->prog->call_get_stack = true; + if (func_id == BPF_FUNC_get_func_ip) { + if (check_get_func_ip(env)) + return -ENOTSUPP; + env->prog->call_get_func_ip = true; + } + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -12632,6 +12659,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; bool expect_blinding = bpf_jit_blinding_enabled(prog); + enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; @@ -12998,6 +13026,21 @@ patch_map_ops_generic: continue; } + /* Implement bpf_get_func_ip inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_ip) { + /* Load IP address from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); + if (!new_prog) + return -ENOMEM; + + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6c77d25137e0..3e71503eeb23 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -948,6 +948,19 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx) +{ + /* This helper call is inlined by verifier. */ + return ((u64 *)ctx)[-1]; +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { + .func = bpf_get_func_ip_tracing, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1058,6 +1071,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_for_each_map_elem_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; + case BPF_FUNC_get_func_ip: + return &bpf_get_func_ip_proto_tracing; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 9ffd9f3ff7193933dae171740ab70a103d460065 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:56 +0200 Subject: bpf: Add bpf_get_func_ip helper for kprobe programs Adding bpf_get_func_ip helper for BPF_PROG_TYPE_KPROBE programs, so it's now possible to call bpf_get_func_ip from both kprobe and kretprobe programs. Taking the caller's address from 'struct kprobe::addr', which is defined for both kprobe and kretprobe. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20210714094400.396467-5-jolsa@kernel.org --- kernel/bpf/verifier.c | 2 ++ kernel/trace/bpf_trace.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ceef190514e4..97216f799ba8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6175,6 +6175,8 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } return 0; + } else if (type == BPF_PROG_TYPE_KPROBE) { + return 0; } verbose(env, "func %s#%d not supported for program type %d\n", diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3e71503eeb23..0b113716bc7a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -961,6 +961,20 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) +{ + struct kprobe *kp = kprobe_running(); + + return kp ? (u64) kp->addr : 0; +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { + .func = bpf_get_func_ip_kprobe, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1092,6 +1106,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_override_return: return &bpf_override_return_proto; #endif + case BPF_FUNC_get_func_ip: + return &bpf_get_func_ip_proto_kprobe; default: return bpf_tracing_func_proto(func_id, prog); } -- cgit v1.2.3 From 17edea21b38d047a10c189296c58aea9875d0d0a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 4 Jul 2021 12:02:42 -0700 Subject: sock_map: Relax config dependency to CONFIG_NET Currently sock_map still has Kconfig dependency on CONFIG_INET, but there is no actual functional dependency on it after we introduce ->psock_update_sk_prot(). We have to extend it to CONFIG_NET now as we are going to support AF_UNIX. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210704190252.11866-2-xiyou.wangcong@gmail.com --- kernel/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index bd04f4a44c01..a82d6de86522 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -29,7 +29,7 @@ config BPF_SYSCALL select IRQ_WORK select TASKS_TRACE_RCU select BINARY_PRINTF - select NET_SOCK_MSG if INET + select NET_SOCK_MSG if NET default n help Enable the bpf() system call that allows to manipulate BPF programs -- cgit v1.2.3 From 1f8c543f142942a2325e729fc4c64b6898749c3a Mon Sep 17 00:00:00 2001 From: zhaoxiaoqiang11 Date: Fri, 16 Jul 2021 17:18:53 +0800 Subject: cgroup: remove cgroup_mount from comments Git rid of an outdated comment. Since cgroup was fully switched to fs_context, cgroup_mount() is gone and it's confusing to mention in comments of cgroup_kill_sb(). Delete it. Signed-off-by: zhaoxiaoqiang11 Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3a0161c21b6b..d0725c1a8db5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2169,7 +2169,6 @@ static void cgroup_kill_sb(struct super_block *sb) /* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). - * cgroup_mount() may wait for @root's release. * * And don't kill the default root. */ -- cgit v1.2.3 From c7603cfa04e7c3a435b31d065f7cbdc829428f6e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Jul 2021 16:06:15 -0700 Subject: bpf: Add ambient BPF runtime context stored in current b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed the problem with cgroup-local storage use in BPF by pre-allocating per-CPU array of 8 cgroup storage pointers to accommodate possible BPF program preemptions and nested executions. While this seems to work good in practice, it introduces new and unnecessary failure mode in which not all BPF programs might be executed if we fail to find an unused slot for cgroup storage, however unlikely it is. It might also not be so unlikely when/if we allow sleepable cgroup BPF programs in the future. Further, the way that cgroup storage is implemented as ambiently-available property during entire BPF program execution is a convenient way to pass extra information to BPF program and helpers without requiring user code to pass around extra arguments explicitly. So it would be good to have a generic solution that can allow implementing this without arbitrary restrictions. Ideally, such solution would work for both preemptable and sleepable BPF programs in exactly the same way. This patch introduces such solution, bpf_run_ctx. It adds one pointer field (bpf_ctx) to task_struct. This field is maintained by BPF_PROG_RUN family of macros in such a way that it always stays valid throughout BPF program execution. BPF program preemption is handled by remembering previous current->bpf_ctx value locally while executing nested BPF program and restoring old value after nested BPF program finishes. This is handled by two helper functions, bpf_set_run_ctx() and bpf_reset_run_ctx(), which are supposed to be used before and after BPF program runs, respectively. Restoring old value of the pointer handles preemption, while bpf_run_ctx pointer being a property of current task_struct naturally solves this problem for sleepable BPF programs by "following" BPF program execution as it is scheduled in and out of CPU. It would even allow CPU migration of BPF programs, even though it's not currently allowed by BPF infra. This patch cleans up cgroup local storage handling as a first application. The design itself is generic, though, with bpf_run_ctx being an empty struct that is supposed to be embedded into a specific struct for a given BPF program type (bpf_cg_run_ctx in this case). Follow up patches are planned that will expand this mechanism for other uses within tracing BPF programs. To verify that this change doesn't revert the fix to the original cgroup storage issue, I ran the same repro as in the original report ([0]) and didn't get any problems. Replacing bpf_reset_run_ctx(old_run_ctx) with bpf_reset_run_ctx(NULL) triggers the issue pretty quickly (so repro does work). [0] https://lore.kernel.org/bpf/YEEvBUiJl2pJkxTd@krava/ Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210712230615.3525979-1-andrii@kernel.org --- kernel/bpf/helpers.c | 16 +++++----------- kernel/bpf/local_storage.c | 3 --- kernel/fork.c | 1 + 3 files changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9fe846ec6bd1..15746f779fe1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -393,8 +393,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { }; #ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { @@ -403,17 +401,13 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) * verifier checks that its value is correct. */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage = NULL; + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; void *ptr; - int i; - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); - break; - } + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype]; if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 95d70a08325d..362e81481594 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -11,9 +11,6 @@ #ifdef CONFIG_CGROUP_BPF -DEFINE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #include "../cgroup/cgroup-internal.h" #define LOCAL_STORAGE_CREATE_FLAG_MASK \ diff --git a/kernel/fork.c b/kernel/fork.c index bc94b2cc5995..e8b41e212110 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2083,6 +2083,7 @@ static __latent_entropy struct task_struct *copy_process( #endif #ifdef CONFIG_BPF_SYSCALL RCU_INIT_POINTER(p->bpf_storage, NULL); + p->bpf_ctx = NULL; #endif /* Perform scheduler related setup. Assign this task to a CPU. */ -- cgit v1.2.3 From a1ad4b8a19566b11e0306f8b197f2fd4567340e5 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Tue, 15 Jun 2021 17:52:48 +0100 Subject: printk: Straighten out log_flags into printk_info_flags In the past, `enum log_flags` was part of `struct log`, hence the name. `struct log` has since been reworked and now this struct is stored inside `struct printk_info`. However, the name was never updated, which is somewhat confusing -- especially since these flags operate at the record level rather than at the level of an abstract log. printk_info_flags also joins its other metadata struct friends in printk_ringbuffer.h. Signed-off-by: Chris Down Reviewed-by: Petr Mladek Acked-by: Andy Shevchenko Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/3dd801982f02603e6e3aa4f8bc4f5ebb830a4949.1623775748.git.chris@chrisdown.name --- kernel/printk/internal.h | 6 ++++++ kernel/printk/printk.c | 43 ++++++++++++++++++++----------------------- 2 files changed, 26 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 51615c909b2f..1075e60fcd98 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -12,6 +12,12 @@ #define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 +/* Flags for a single printk record. */ +enum printk_info_flags { + LOG_NEWLINE = 2, /* text ended with a newline */ + LOG_CONT = 8, /* text is a fragment of a continuation line */ +}; + __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 142a58d124d9..39f1ec22a6a6 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -350,11 +350,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * non-prinatable characters are escaped in the "\xff" notation. */ -enum log_flags { - LOG_NEWLINE = 2, /* text ended with a newline */ - LOG_CONT = 8, /* text is a fragment of a continuation line */ -}; - /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_RAW_SPINLOCK(syslog_lock); @@ -1965,19 +1960,20 @@ static inline u32 printk_caller_id(void) * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. - * @lflags: A pointer to the current log flags, will be updated. + * @flags: A pointer to the current printk_info flags, will be updated. * * @level may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @level must be set to * LOGLEVEL_DEFAULT in order to be updated with the parsed value. * - * @lflags may be NULL if the caller is not interested in the parsed value. - * Otherwise the variable pointed to by @lflags will be OR'd with the parsed + * @flags may be NULL if the caller is not interested in the parsed value. + * Otherwise the variable pointed to by @flags will be OR'd with the parsed * value. * * Return: The length of the parsed level and control flags. */ -static u16 parse_prefix(char *text, int *level, enum log_flags *lflags) +static u16 parse_prefix(char *text, int *level, + enum printk_info_flags *flags) { u16 prefix_len = 0; int kern_level; @@ -1993,8 +1989,8 @@ static u16 parse_prefix(char *text, int *level, enum log_flags *lflags) *level = kern_level - '0'; break; case 'c': /* KERN_CONT */ - if (lflags) - *lflags |= LOG_CONT; + if (flags) + *flags |= LOG_CONT; } prefix_len += 2; @@ -2004,8 +2000,9 @@ static u16 parse_prefix(char *text, int *level, enum log_flags *lflags) return prefix_len; } -static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags, - const char *fmt, va_list args) +static u16 printk_sprint(char *text, u16 size, int facility, + enum printk_info_flags *flags, const char *fmt, + va_list args) { u16 text_len; @@ -2014,7 +2011,7 @@ static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lfl /* Mark and strip a trailing newline. */ if (text_len && text[text_len - 1] == '\n') { text_len--; - *lflags |= LOG_NEWLINE; + *flags |= LOG_NEWLINE; } /* Strip log level and control flags. */ @@ -2038,7 +2035,7 @@ int vprintk_store(int facility, int level, { const u32 caller_id = printk_caller_id(); struct prb_reserved_entry e; - enum log_flags lflags = 0; + enum printk_info_flags flags = 0; struct printk_record r; u16 trunc_msg_len = 0; char prefix_buf[8]; @@ -2070,22 +2067,22 @@ int vprintk_store(int facility, int level, /* Extract log level or control flags. */ if (facility == 0) - parse_prefix(&prefix_buf[0], &level, &lflags); + parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dev_info) - lflags |= LOG_NEWLINE; + flags |= LOG_NEWLINE; - if (lflags & LOG_CONT) { + if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, - facility, &lflags, fmt, args); + facility, &flags, fmt, args); r.info->text_len += text_len; - if (lflags & LOG_NEWLINE) { + if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); } else { @@ -2112,20 +2109,20 @@ int vprintk_store(int facility, int level, } /* fill message */ - text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args); + text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; r.info->facility = facility; r.info->level = level & 7; - r.info->flags = lflags & 0x1f; + r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* A message without a trailing newline can be continued. */ - if (!(lflags & LOG_NEWLINE)) + if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); -- cgit v1.2.3 From f3d75cf537db57f7918a17a75527951de850e5ec Mon Sep 17 00:00:00 2001 From: Chris Down Date: Tue, 15 Jun 2021 17:52:51 +0100 Subject: printk: Rework parse_prefix into printk_parse_prefix parse_prefix is needed externally by later patches, so move it into a context where it can be used as such. Also give it the printk_ prefix to reduce the chance of collisions. Signed-off-by: Chris Down Cc: Petr Mladek Reviewed-by: Petr Mladek Acked-by: Andy Shevchenko Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/b22ba314a860e5c7f887958f1eab2649f9bd1d06.1623775748.git.chris@chrisdown.name --- kernel/printk/internal.h | 2 ++ kernel/printk/printk.c | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 1075e60fcd98..1596e2837318 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -57,6 +57,8 @@ bool printk_percpu_data_ready(void); void defer_console_output(void); +u16 printk_parse_prefix(const char *text, int *level, + enum printk_info_flags *flags); #else /* diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 39f1ec22a6a6..03956c3eb745 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1956,7 +1956,7 @@ static inline u32 printk_caller_id(void) } /** - * parse_prefix - Parse level and control flags. + * printk_parse_prefix - Parse level and control flags. * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. @@ -1972,7 +1972,7 @@ static inline u32 printk_caller_id(void) * * Return: The length of the parsed level and control flags. */ -static u16 parse_prefix(char *text, int *level, +u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags) { u16 prefix_len = 0; @@ -2018,7 +2018,7 @@ static u16 printk_sprint(char *text, u16 size, int facility, if (facility == 0) { u16 prefix_len; - prefix_len = parse_prefix(text, NULL, NULL); + prefix_len = printk_parse_prefix(text, NULL, NULL); if (prefix_len) { text_len -= prefix_len; memmove(text, text + prefix_len, text_len); @@ -2067,7 +2067,7 @@ int vprintk_store(int facility, int level, /* Extract log level or control flags. */ if (facility == 0) - parse_prefix(&prefix_buf[0], &level, &flags); + printk_parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; -- cgit v1.2.3 From 337015573718b161891a3473d25f59273f2e626b Mon Sep 17 00:00:00 2001 From: Chris Down Date: Tue, 15 Jun 2021 17:52:53 +0100 Subject: printk: Userspace format indexing support We have a number of systems industry-wide that have a subset of their functionality that works as follows: 1. Receive a message from local kmsg, serial console, or netconsole; 2. Apply a set of rules to classify the message; 3. Do something based on this classification (like scheduling a remediation for the machine), rinse, and repeat. As a couple of examples of places we have this implemented just inside Facebook, although this isn't a Facebook-specific problem, we have this inside our netconsole processing (for alarm classification), and as part of our machine health checking. We use these messages to determine fairly important metrics around production health, and it's important that we get them right. While for some kinds of issues we have counters, tracepoints, or metrics with a stable interface which can reliably indicate the issue, in order to react to production issues quickly we need to work with the interface which most kernel developers naturally use when developing: printk. Most production issues come from unexpected phenomena, and as such usually the code in question doesn't have easily usable tracepoints or other counters available for the specific problem being mitigated. We have a number of lines of monitoring defence against problems in production (host metrics, process metrics, service metrics, etc), and where it's not feasible to reliably monitor at another level, this kind of pragmatic netconsole monitoring is essential. As one would expect, monitoring using printk is rather brittle for a number of reasons -- most notably that the message might disappear entirely in a new version of the kernel, or that the message may change in some way that the regex or other classification methods start to silently fail. One factor that makes this even harder is that, under normal operation, many of these messages are never expected to be hit. For example, there may be a rare hardware bug which one wants to detect if it was to ever happen again, but its recurrence is not likely or anticipated. This precludes using something like checking whether the printk in question was printed somewhere fleetwide recently to determine whether the message in question is still present or not, since we don't anticipate that it should be printed anywhere, but still need to monitor for its future presence in the long-term. This class of issue has happened on a number of occasions, causing unhealthy machines with hardware issues to remain in production for longer than ideal. As a recent example, some monitoring around blk_update_request fell out of date and caused semi-broken machines to remain in production for longer than would be desirable. Searching through the codebase to find the message is also extremely fragile, because many of the messages are further constructed beyond their callsite (eg. btrfs_printk and other module-specific wrappers, each with their own functionality). Even if they aren't, guessing the format and formulation of the underlying message based on the aesthetics of the message emitted is not a recipe for success at scale, and our previous issues with fleetwide machine health checking demonstrate as much. This provides a solution to the issue of silently changed or deleted printks: we record pointers to all printk format strings known at compile time into a new .printk_index section, both in vmlinux and modules. At runtime, this can then be iterated by looking at /printk/index/, which emits the following format, both readable by humans and able to be parsed by machines: $ head -1 vmlinux; shuf -n 5 vmlinux # filename:line function "format" <5> block/blk-settings.c:661 disk_stack_limits "%s: Warning: Device %s is misaligned\n" <4> kernel/trace/trace.c:8296 trace_create_file "Could not create tracefs '%s' entry\n" <6> arch/x86/kernel/hpet.c:144 _hpet_print_config "hpet: %s(%d):\n" <6> init/do_mounts.c:605 prepare_namespace "Waiting for root device %s...\n" <6> drivers/acpi/osl.c:1410 acpi_no_auto_serialize_setup "ACPI: auto-serialization disabled\n" This mitigates the majority of cases where we have a highly-specific printk which we want to match on, as we can now enumerate and check whether the format changed or the printk callsite disappeared entirely in userspace. This allows us to catch changes to printks we monitor earlier and decide what to do about it before it becomes problematic. There is no additional runtime cost for printk callers or printk itself, and the assembly generated is exactly the same. Signed-off-by: Chris Down Cc: Petr Mladek Cc: Jessica Yu Cc: Sergey Senozhatsky Cc: John Ogness Cc: Steven Rostedt Cc: Greg Kroah-Hartman Cc: Johannes Weiner Cc: Kees Cook Reviewed-by: Petr Mladek Tested-by: Petr Mladek Reported-by: kernel test robot Acked-by: Andy Shevchenko Acked-by: Jessica Yu # for module.{c,h} Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/e42070983637ac5e384f17fbdbe86d19c7b212a5.1623775748.git.chris@chrisdown.name --- kernel/module.c | 5 ++ kernel/printk/Makefile | 1 + kernel/printk/index.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 13 ++-- 4 files changed, 209 insertions(+), 5 deletions(-) create mode 100644 kernel/printk/index.c (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ed13917ea5f3..40ec9a030eec 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3355,6 +3355,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(unsigned long), &mod->num_kprobe_blacklist); #endif +#ifdef CONFIG_PRINTK_INDEX + mod->printk_index_start = section_objs(info, ".printk_index", + sizeof(*mod->printk_index_start), + &mod->printk_index_size); +#endif #ifdef CONFIG_HAVE_STATIC_CALL_INLINE mod->static_call_sites = section_objs(info, ".static_call_sites", sizeof(*mod->static_call_sites), diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index eee3dc9b60a9..d118739874c0 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -3,3 +3,4 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o obj-$(CONFIG_PRINTK) += printk_ringbuffer.o +obj-$(CONFIG_PRINTK_INDEX) += index.o diff --git a/kernel/printk/index.c b/kernel/printk/index.c new file mode 100644 index 000000000000..ca062f5e1779 --- /dev/null +++ b/kernel/printk/index.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Userspace indexing of printk formats + */ + +#include +#include +#include +#include +#include + +#include "internal.h" + +extern struct pi_entry *__start_printk_index[]; +extern struct pi_entry *__stop_printk_index[]; + +/* The base dir for module formats, typically debugfs/printk/index/ */ +static struct dentry *dfs_index; + +static struct pi_entry *pi_get_entry(const struct module *mod, loff_t pos) +{ + struct pi_entry **entries; + unsigned int nr_entries; + +#ifdef CONFIG_MODULES + if (mod) { + entries = mod->printk_index_start; + nr_entries = mod->printk_index_size; + } +#endif + + if (!mod) { + /* vmlinux, comes from linker symbols */ + entries = __start_printk_index; + nr_entries = __stop_printk_index - __start_printk_index; + } + + if (pos >= nr_entries) + return NULL; + + return entries[pos]; +} + +static void *pi_next(struct seq_file *s, void *v, loff_t *pos) +{ + const struct module *mod = s->file->f_inode->i_private; + struct pi_entry *entry = pi_get_entry(mod, *pos); + + (*pos)++; + + return entry; +} + +static void *pi_start(struct seq_file *s, loff_t *pos) +{ + /* + * Make show() print the header line. Do not update *pos because + * pi_next() still has to return the entry at index 0 later. + */ + if (*pos == 0) + return SEQ_START_TOKEN; + + return pi_next(s, NULL, pos); +} + +/* + * We need both ESCAPE_ANY and explicit characters from ESCAPE_SPECIAL in @only + * because otherwise ESCAPE_NAP will cause double quotes and backslashes to be + * ignored for quoting. + */ +#define seq_escape_printf_format(s, src) \ + seq_escape_str(s, src, ESCAPE_ANY | ESCAPE_NAP | ESCAPE_APPEND, "\"\\") + +static int pi_show(struct seq_file *s, void *v) +{ + const struct pi_entry *entry = v; + int level = LOGLEVEL_DEFAULT; + enum printk_info_flags flags = 0; + u16 prefix_len = 0; + + if (v == SEQ_START_TOKEN) { + seq_puts(s, "# filename:line function \"format\"\n"); + return 0; + } + + if (!entry->fmt) + return 0; + + if (entry->level) + printk_parse_prefix(entry->level, &level, &flags); + else + prefix_len = printk_parse_prefix(entry->fmt, &level, &flags); + + + if (flags & LOG_CONT) { + /* + * LOGLEVEL_DEFAULT here means "use the same level as the + * message we're continuing from", not the default message + * loglevel, so don't display it as such. + */ + if (level == LOGLEVEL_DEFAULT) + seq_puts(s, ""); + else + seq_printf(s, "<%d,c>", level); + } else + seq_printf(s, "<%d>", level); + + seq_printf(s, " %s:%d %s \"", entry->file, entry->line, entry->func); + if (entry->subsys_fmt_prefix) + seq_escape_printf_format(s, entry->subsys_fmt_prefix); + seq_escape_printf_format(s, entry->fmt + prefix_len); + seq_puts(s, "\"\n"); + + return 0; +} + +static void pi_stop(struct seq_file *p, void *v) { } + +static const struct seq_operations dfs_index_sops = { + .start = pi_start, + .next = pi_next, + .show = pi_show, + .stop = pi_stop, +}; + +DEFINE_SEQ_ATTRIBUTE(dfs_index); + +#ifdef CONFIG_MODULES +static const char *pi_get_module_name(struct module *mod) +{ + return mod ? mod->name : "vmlinux"; +} +#else +static const char *pi_get_module_name(struct module *mod) +{ + return "vmlinux"; +} +#endif + +void pi_create_file(struct module *mod) +{ + debugfs_create_file(pi_get_module_name(mod), 0444, dfs_index, + mod, &dfs_index_fops); +} + +void pi_remove_file(struct module *mod) +{ + debugfs_remove(debugfs_lookup(pi_get_module_name(mod), dfs_index)); +} + +#ifdef CONFIG_MODULES +static int pi_module_notify(struct notifier_block *nb, unsigned long op, + void *data) +{ + struct module *mod = data; + + switch (op) { + case MODULE_STATE_COMING: + pi_create_file(mod); + break; + case MODULE_STATE_GOING: + pi_remove_file(mod); + break; + default: /* we don't care about other module states */ + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block module_printk_fmts_nb = { + .notifier_call = pi_module_notify, +}; + +static void __init pi_setup_module_notifier(void) +{ + register_module_notifier(&module_printk_fmts_nb); +} +#else +static inline void __init pi_setup_module_notifier(void) { } +#endif + +static int __init pi_init(void) +{ + struct dentry *dfs_root = debugfs_create_dir("printk", NULL); + + dfs_index = debugfs_create_dir("index", dfs_root); + pi_setup_module_notifier(); + pi_create_file(NULL); + + return 0; +} + +/* debugfs comes up on core and must be initialised first */ +postcore_initcall(pi_init); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 03956c3eb745..765f7af6ce56 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2184,10 +2184,13 @@ int vprintk_default(const char *fmt, va_list args) EXPORT_SYMBOL_GPL(vprintk_default); /** - * printk - print a kernel message + * _printk - print a kernel message * @fmt: format string * - * This is printk(). It can be called from any context. We want it to work. + * This is _printk(). It can be called from any context. We want it to work. + * + * If printk indexing is enabled, _printk() is called from printk_index_wrap. + * Otherwise, printk is simply #defined to _printk. * * We try to grab the console_lock. If we succeed, it's easy - we log the * output and call the console drivers. If we fail to get the semaphore, we @@ -2204,7 +2207,7 @@ EXPORT_SYMBOL_GPL(vprintk_default); * * See the vsnprintf() documentation for format string extensions over C99. */ -asmlinkage __visible int printk(const char *fmt, ...) +asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; int r; @@ -2215,7 +2218,7 @@ asmlinkage __visible int printk(const char *fmt, ...) return r; } -EXPORT_SYMBOL(printk); +EXPORT_SYMBOL(_printk); #else /* CONFIG_PRINTK */ @@ -3200,7 +3203,7 @@ int vprintk_deferred(const char *fmt, va_list args) return r; } -int printk_deferred(const char *fmt, ...) +int _printk_deferred(const char *fmt, ...) { va_list args; int r; -- cgit v1.2.3 From d97e99386ad0dcae08cb0f0c70efa806a2d4811c Mon Sep 17 00:00:00 2001 From: MaYuming Date: Sun, 11 Jul 2021 10:57:47 +0800 Subject: audit: add header protection to kernel/audit.h Protect kernel/audit.h against multiple #include's. Signed-off-by: MaYuming [PM: rewrite subj/description] Signed-off-by: Paul Moore --- kernel/audit.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index b565ea16c0a5..d6a2c899a8db 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -6,6 +6,9 @@ * Copyright 2005 IBM Corporation */ +#ifndef _KERNEL_AUDIT_H_ +#define _KERNEL_AUDIT_H_ + #include #include #include @@ -331,3 +334,5 @@ extern int audit_filter(int msgtype, unsigned int listtype); extern void audit_ctl_lock(void); extern void audit_ctl_unlock(void); + +#endif -- cgit v1.2.3 From a7a73697360ea81244eea550138b8f614348860c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 7 Jun 2021 14:56:48 +0200 Subject: kcsan: Remove CONFIG_KCSAN_DEBUG By this point CONFIG_KCSAN_DEBUG is pretty useless, as the system just isn't usable with it due to spamming console (I imagine a randconfig test robot will run into this sooner or later). Remove it. Back in 2019 I used it occasionally to record traces of watchpoints and verify the encoding is correct, but these days we have proper tests. If something similar is needed in future, just add it back ad-hoc. Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 26709ea65c71..d92977ede7e1 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -479,15 +479,6 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) break; /* ignore; we do not diff the values */ } - if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) { - kcsan_disable_current(); - pr_err("watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n", - is_write ? "write" : "read", size, ptr, - watchpoint_slot((unsigned long)ptr), - encode_watchpoint((unsigned long)ptr, size, is_write)); - kcsan_enable_current(); - } - /* * Delay this thread, to increase probability of observing a racy * conflicting access. -- cgit v1.2.3 From 08cac6049412061e571fadadc3e23464dc46d0f2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 7 Jun 2021 14:56:50 +0200 Subject: kcsan: Reduce get_ctx() uses in kcsan_found_watchpoint() There are a number get_ctx() calls that are close to each other, which results in poor codegen (repeated preempt_count loads). Specifically in kcsan_found_watchpoint() (even though it's a slow-path) it is beneficial to keep the race-window small until the watchpoint has actually been consumed to avoid missed opportunities to report a race. Let's clean it up a bit before we add more code in kcsan_found_watchpoint(). Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index d92977ede7e1..906100923b88 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -301,9 +301,9 @@ static inline void reset_kcsan_skip(void) this_cpu_write(kcsan_skip, skip_count); } -static __always_inline bool kcsan_is_enabled(void) +static __always_inline bool kcsan_is_enabled(struct kcsan_ctx *ctx) { - return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0; + return READ_ONCE(kcsan_enabled) && !ctx->disable_count; } /* Introduce delay depending on context and configuration. */ @@ -353,10 +353,17 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, atomic_long_t *watchpoint, long encoded_watchpoint) { + struct kcsan_ctx *ctx = get_ctx(); unsigned long flags; bool consumed; - if (!kcsan_is_enabled()) + /* + * We know a watchpoint exists. Let's try to keep the race-window + * between here and finally consuming the watchpoint below as small as + * possible -- avoid unneccessarily complex code until consumed. + */ + + if (!kcsan_is_enabled(ctx)) return; /* @@ -364,14 +371,12 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, * reporting a race where e.g. the writer set up the watchpoint, but the * reader has access_mask!=0, we have to ignore the found watchpoint. */ - if (get_ctx()->access_mask != 0) + if (ctx->access_mask) return; /* - * Consume the watchpoint as soon as possible, to minimize the chances - * of !consumed. Consuming the watchpoint must always be guarded by - * kcsan_is_enabled() check, as otherwise we might erroneously - * triggering reports when disabled. + * Consuming the watchpoint must be guarded by kcsan_is_enabled() to + * avoid erroneously triggering reports if the context is disabled. */ consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint); @@ -409,6 +414,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) unsigned long access_mask; enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; unsigned long ua_flags = user_access_save(); + struct kcsan_ctx *ctx = get_ctx(); unsigned long irq_flags = 0; /* @@ -417,7 +423,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) */ reset_kcsan_skip(); - if (!kcsan_is_enabled()) + if (!kcsan_is_enabled(ctx)) goto out; /* @@ -489,7 +495,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) * Re-read value, and check if it is as expected; if not, we infer a * racy access. */ - access_mask = get_ctx()->access_mask; + access_mask = ctx->access_mask; new = 0; switch (size) { case 1: -- cgit v1.2.3 From 49f72d5358dd3c0d28bcd2232c513000b15480f0 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 7 Jun 2021 14:56:51 +0200 Subject: kcsan: Rework atomic.h into permissive.h Rework atomic.h into permissive.h to better reflect its purpose, and introduce kcsan_ignore_address() and kcsan_ignore_data_race(). Introduce CONFIG_KCSAN_PERMISSIVE and update the stub functions in preparation for subsequent changes. As before, developers who choose to use KCSAN in "strict" mode will see all data races and are not affected. Furthermore, by relying on the value-change filter logic for kcsan_ignore_data_race(), even if the permissive rules are enabled, the opt-outs in report.c:skip_report() override them (such as for RCU-related functions by default). The option CONFIG_KCSAN_PERMISSIVE is disabled by default, so that the documented default behaviour of KCSAN does not change. Instead, like CONFIG_KCSAN_IGNORE_ATOMICS, the option needs to be explicitly opted in. Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- kernel/kcsan/atomic.h | 23 ----------------------- kernel/kcsan/core.c | 33 ++++++++++++++++++++++++--------- kernel/kcsan/permissive.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 32 deletions(-) delete mode 100644 kernel/kcsan/atomic.h create mode 100644 kernel/kcsan/permissive.h (limited to 'kernel') diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h deleted file mode 100644 index 530ae1bda8e7..000000000000 --- a/kernel/kcsan/atomic.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Rules for implicitly atomic memory accesses. - * - * Copyright (C) 2019, Google LLC. - */ - -#ifndef _KERNEL_KCSAN_ATOMIC_H -#define _KERNEL_KCSAN_ATOMIC_H - -#include - -/* - * Special rules for certain memory where concurrent conflicting accesses are - * common, however, the current convention is to not mark them; returns true if - * access to @ptr should be considered atomic. Called from slow-path. - */ -static bool kcsan_is_atomic_special(const volatile void *ptr) -{ - return false; -} - -#endif /* _KERNEL_KCSAN_ATOMIC_H */ diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 906100923b88..439edb9dcbb1 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -20,9 +20,9 @@ #include #include -#include "atomic.h" #include "encoding.h" #include "kcsan.h" +#include "permissive.h" static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE); unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK; @@ -353,6 +353,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, atomic_long_t *watchpoint, long encoded_watchpoint) { + const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0; struct kcsan_ctx *ctx = get_ctx(); unsigned long flags; bool consumed; @@ -374,6 +375,16 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, if (ctx->access_mask) return; + /* + * If the other thread does not want to ignore the access, and there was + * a value change as a result of this thread's operation, we will still + * generate a report of unknown origin. + * + * Use CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN=n to filter. + */ + if (!is_assert && kcsan_ignore_address(ptr)) + return; + /* * Consuming the watchpoint must be guarded by kcsan_is_enabled() to * avoid erroneously triggering reports if the context is disabled. @@ -396,7 +407,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]); } - if ((type & KCSAN_ACCESS_ASSERT) != 0) + if (is_assert) atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]); else atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]); @@ -427,12 +438,10 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) goto out; /* - * Special atomic rules: unlikely to be true, so we check them here in - * the slow-path, and not in the fast-path in is_atomic(). Call after - * kcsan_is_enabled(), as we may access memory that is not yet - * initialized during early boot. + * Check to-ignore addresses after kcsan_is_enabled(), as we may access + * memory that is not yet initialized during early boot. */ - if (!is_assert && kcsan_is_atomic_special(ptr)) + if (!is_assert && kcsan_ignore_address(ptr)) goto out; if (!check_encodable((unsigned long)ptr, size)) { @@ -518,8 +527,14 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type) if (access_mask) diff &= access_mask; - /* Were we able to observe a value-change? */ - if (diff != 0) + /* + * Check if we observed a value change. + * + * Also check if the data race should be ignored (the rules depend on + * non-zero diff); if it is to be ignored, the below rules for + * KCSAN_VALUE_CHANGE_MAYBE apply. + */ + if (diff && !kcsan_ignore_data_race(size, type, old, new, diff)) value_change = KCSAN_VALUE_CHANGE_TRUE; /* Check if this access raced with another. */ diff --git a/kernel/kcsan/permissive.h b/kernel/kcsan/permissive.h new file mode 100644 index 000000000000..f90e30800c11 --- /dev/null +++ b/kernel/kcsan/permissive.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Special rules for ignoring entire classes of data-racy memory accesses. None + * of the rules here imply that such data races are generally safe! + * + * All rules in this file can be configured via CONFIG_KCSAN_PERMISSIVE. Keep + * them separate from core code to make it easier to audit. + * + * Copyright (C) 2019, Google LLC. + */ + +#ifndef _KERNEL_KCSAN_PERMISSIVE_H +#define _KERNEL_KCSAN_PERMISSIVE_H + +#include + +/* + * Access ignore rules based on address. + */ +static __always_inline bool kcsan_ignore_address(const volatile void *ptr) +{ + if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)) + return false; + + return false; +} + +/* + * Data race ignore rules based on access type and value change patterns. + */ +static bool +kcsan_ignore_data_race(size_t size, int type, u64 old, u64 new, u64 diff) +{ + if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)) + return false; + + /* + * Rules here are only for plain read accesses, so that we still report + * data races between plain read-write accesses. + */ + if (type || size > sizeof(long)) + return false; + + return false; +} + +#endif /* _KERNEL_KCSAN_PERMISSIVE_H */ -- cgit v1.2.3 From 9c827cd1fcdf54bb50f874f91af0d5de2aceb035 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 7 Jun 2021 14:56:52 +0200 Subject: kcsan: Print if strict or non-strict during init Show a brief message if KCSAN is strict or non-strict, and if non-strict also say that CONFIG_KCSAN_STRICT=y can be used to see all data races. This is to hint to users of KCSAN who blindly use the default config that their configuration might miss data races of interest. Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 439edb9dcbb1..76e67d1e02d4 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -656,6 +656,15 @@ void __init kcsan_init(void) pr_info("enabled early\n"); WRITE_ONCE(kcsan_enabled, true); } + + if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) || + IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) || + IS_ENABLED(CONFIG_KCSAN_PERMISSIVE) || + IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { + pr_warn("non-strict mode configured - use CONFIG_KCSAN_STRICT=y to see all data races\n"); + } else { + pr_info("strict mode configured\n"); + } } /* === Exported interface =================================================== */ -- cgit v1.2.3 From d8fd74d35a8d3c602232e3238e916bda9d03d520 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 7 Jun 2021 14:56:53 +0200 Subject: kcsan: permissive: Ignore data-racy 1-bit value changes Add rules to ignore data-racy reads with only 1-bit value changes. Details about the rules are captured in comments in kernel/kcsan/permissive.h. More background follows. While investigating a number of data races, we've encountered data-racy accesses on flags variables to be very common. The typical pattern is a reader masking all but one bit, and/or the writer setting/clearing only 1 bit (current->flags being a frequently encountered case; more examples in mm/sl[au]b.c, which disable KCSAN for this reason). Since these types of data-racy accesses are common (with the assumption they are intentional and hard to miscompile) having the option (with CONFIG_KCSAN_PERMISSIVE=y) to filter them will avoid forcing everyone to mark them, and deliberately left to preference at this time. One important motivation for having this option built-in is to move closer to being able to enable KCSAN on CI systems or for testers wishing to test the whole kernel, while more easily filtering less interesting data races with higher probability. For the implementation, we considered several alternatives, but had one major requirement: that the rules be kept together with the Linux-kernel tree. Adding them to the compiler would preclude us from making changes quickly; if the rules require tweaks, having them part of the compiler requires waiting another ~1 year for the next release -- that's not realistic. We are left with the following options: 1. Maintain compiler plugins as part of the kernel-tree that removes instrumentation for some accesses (e.g. plain-& with 1-bit mask). The analysis would be reader-side focused, as no assumption can be made about racing writers. Because it seems unrealistic to maintain 2 plugins, one for LLVM and GCC, we would likely pick LLVM. Furthermore, no kernel infrastructure exists to maintain LLVM plugins, and the build-system implications and maintenance overheads do not look great (historically, plugins written against old LLVM APIs are not guaranteed to work with newer LLVM APIs). 2. Find a set of rules that can be expressed in terms of observed value changes, and make it part of the KCSAN runtime. The analysis is writer-side focused, given we rely on observed value changes. The approach taken here is (2). While a complete approach requires both (1) and (2), experiments show that the majority of data races involving trivial bit operations on flags variables can be removed with (2) alone. It goes without saying that the filtering of data races using (1) or (2) does _not_ guarantee they are safe! Therefore, limiting ourselves to (2) for now is the conservative choice for setups that wish to enable CONFIG_KCSAN_PERMISSIVE=y. Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan_test.c | 32 +++++++++++++++++++++++++++++++ kernel/kcsan/permissive.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 8bcffbdef3d3..dc55fd5a36fc 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -414,6 +414,14 @@ static noinline void test_kernel_atomic_builtins(void) __atomic_load_n(&test_var, __ATOMIC_RELAXED); } +static noinline void test_kernel_xor_1bit(void) +{ + /* Do not report data races between the read-writes. */ + kcsan_nestable_atomic_begin(); + test_var ^= 0x10000; + kcsan_nestable_atomic_end(); +} + /* ===== Test cases ===== */ /* Simple test with normal data race. */ @@ -952,6 +960,29 @@ static void test_atomic_builtins(struct kunit *test) KUNIT_EXPECT_FALSE(test, match_never); } +__no_kcsan +static void test_1bit_value_change(struct kunit *test) +{ + const struct expect_report expect = { + .access = { + { test_kernel_read, &test_var, sizeof(test_var), 0 }, + { test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + }, + }; + bool match = false; + + begin_test_checks(test_kernel_read, test_kernel_xor_1bit); + do { + match = IS_ENABLED(CONFIG_KCSAN_PERMISSIVE) + ? report_available() + : report_matches(&expect); + } while (!end_test_checks(match)); + if (IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)) + KUNIT_EXPECT_FALSE(test, match); + else + KUNIT_EXPECT_TRUE(test, match); +} + /* * Generate thread counts for all test cases. Values generated are in interval * [2, 5] followed by exponentially increasing thread counts from 8 to 32. @@ -1024,6 +1055,7 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_jiffies_noreport), KCSAN_KUNIT_CASE(test_seqlock_noreport), KCSAN_KUNIT_CASE(test_atomic_builtins), + KCSAN_KUNIT_CASE(test_1bit_value_change), {}, }; diff --git a/kernel/kcsan/permissive.h b/kernel/kcsan/permissive.h index f90e30800c11..2c01fe4a59ee 100644 --- a/kernel/kcsan/permissive.h +++ b/kernel/kcsan/permissive.h @@ -12,6 +12,8 @@ #ifndef _KERNEL_KCSAN_PERMISSIVE_H #define _KERNEL_KCSAN_PERMISSIVE_H +#include +#include #include /* @@ -22,7 +24,11 @@ static __always_inline bool kcsan_ignore_address(const volatile void *ptr) if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)) return false; - return false; + /* + * Data-racy bitops on current->flags are too common, ignore completely + * for now. + */ + return ptr == ¤t->flags; } /* @@ -41,6 +47,47 @@ kcsan_ignore_data_race(size_t size, int type, u64 old, u64 new, u64 diff) if (type || size > sizeof(long)) return false; + /* + * A common pattern is checking/setting just 1 bit in a variable; for + * example: + * + * if (flags & SOME_FLAG) { ... } + * + * and elsewhere flags is updated concurrently: + * + * flags |= SOME_OTHER_FLAG; // just 1 bit + * + * While it is still recommended that such accesses be marked + * appropriately, in many cases these types of data races are so common + * that marking them all is often unrealistic and left to maintainer + * preference. + * + * The assumption in all cases is that with all known compiler + * optimizations (including those that tear accesses), because no more + * than 1 bit changed, the plain accesses are safe despite the presence + * of data races. + * + * The rules here will ignore the data races if we observe no more than + * 1 bit changed. + * + * Of course many operations can effecively change just 1 bit, but the + * general assuption that data races involving 1-bit changes can be + * tolerated still applies. + * + * And in case a true bug is missed, the bug likely manifests as a + * reportable data race elsewhere. + */ + if (hweight64(diff) == 1) { + /* + * Exception: Report data races where the values look like + * ordinary booleans (one of them was 0 and the 0th bit was + * changed) More often than not, they come with interesting + * memory ordering requirements, so let's report them. + */ + if (!((!old || !new) && diff == 1)) + return true; + } + return false; } -- cgit v1.2.3 From 16c5900ba776c5acd6568abd60c40f948a96e496 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 21 Jul 2021 23:19:45 +0200 Subject: bpf: Fix pointer cast warning kp->addr is a pointer, so it cannot be cast directly to a 'u64' when it gets interpreted as an integer value: kernel/trace/bpf_trace.c: In function '____bpf_get_func_ip_kprobe': kernel/trace/bpf_trace.c:968:21: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 968 | return kp ? (u64) kp->addr : 0; Use the uintptr_t type instead. Fixes: 9ffd9f3ff719 ("bpf: Add bpf_get_func_ip helper for kprobe programs") Signed-off-by: Arnd Bergmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721212007.3876595-1-arnd@kernel.org --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 08906007306d..1f22ce1fa971 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -965,7 +965,7 @@ BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct kprobe *kp = kprobe_running(); - return kp ? (u64) kp->addr : 0; + return kp ? (uintptr_t)kp->addr : 0; } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { -- cgit v1.2.3 From 724f17b7d45d62c71e92471666647a823cb9baa9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Jul 2021 12:56:30 +0100 Subject: bpf: Remove redundant intiialization of variable stype The variable stype is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721115630.109279-1-colin.king@canonical.com --- kernel/bpf/local_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 362e81481594..7ed2a14dc0de 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -406,7 +406,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { - enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + enum bpf_cgroup_storage_type stype; struct bpf_cgroup_storage *storage; int cpu; -- cgit v1.2.3 From 2c9f7eaf08659fa23d25b93a446f74306b3abea8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 May 2021 13:38:19 -0500 Subject: signal/sparc: si_trapno is only used with SIGILL ILL_ILLTRP While reviewing the signal handlers on sparc it became clear that si_trapno is only set to a non-zero value when sending SIGILL with si_code ILL_ILLTRP. Add force_sig_fault_trapno and send SIGILL ILL_ILLTRP with it. Remove the define of __ARCH_SI_TRAPNO and remove the always zero si_trapno parameter from send_sig_fault and force_sig_fault. v1: https://lkml.kernel.org/r/m1eeers7q7.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-7-ebiederm@xmission.com Link: https://lkml.kernel.org/r/87mtqnxx89.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a3229add4455..87a374225277 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1808,6 +1808,22 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr) return force_sig_info(&info); } +/* For the rare architectures that include trap information using + * si_trapno. + */ +int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; + info.si_trapno = trapno; + return force_sig_info(&info); +} + int kill_pgrp(struct pid *pid, int sig, int priv) { int ret; @@ -3243,6 +3259,9 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code) #endif else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) layout = SIL_PERF_EVENT; + else if (IS_ENABLED(CONFIG_SPARC) && + (sig == SIGILL) && (si_code == ILL_ILLTRP)) + layout = SIL_FAULT_TRAPNO; #ifdef __ARCH_SI_TRAPNO else if (layout == SIL_FAULT) layout = SIL_FAULT_TRAPNO; -- cgit v1.2.3 From 7de5f68d497cbc700c4a28cc037dd61f00e452e8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 May 2021 14:15:51 -0500 Subject: signal/alpha: si_trapno is only used with SIGFPE and SIGTRAP TRAP_UNK While reviewing the signal handlers on alpha it became clear that si_trapno is only set to a non-zero value when sending SIGFPE and when sending SITGRAP with si_code TRAP_UNK. Add send_sig_fault_trapno and send SIGTRAP TRAP_UNK, and SIGFPE with it. Remove the define of __ARCH_SI_TRAPNO and remove the always zero si_trapno parameter from send_sig_fault and force_sig_fault. v1: https://lkml.kernel.org/r/m1eeers7q7.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-7-ebiederm@xmission.com Link: https://lkml.kernel.org/r/87h7gvxx7l.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 87a374225277..ae06a424aa72 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1824,6 +1824,23 @@ int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno) return force_sig_info(&info); } +/* For the rare architectures that include trap information using + * si_trapno. + */ +int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, + struct task_struct *t) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; + info.si_trapno = trapno; + return send_sig_info(info.si_signo, &info, t); +} + int kill_pgrp(struct pid *pid, int sig, int priv) { int ret; @@ -3262,6 +3279,10 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code) else if (IS_ENABLED(CONFIG_SPARC) && (sig == SIGILL) && (si_code == ILL_ILLTRP)) layout = SIL_FAULT_TRAPNO; + else if (IS_ENABLED(CONFIG_ALPHA) && + ((sig == SIGFPE) || + ((sig == SIGTRAP) && (si_code == TRAP_UNK)))) + layout = SIL_FAULT_TRAPNO; #ifdef __ARCH_SI_TRAPNO else if (layout == SIL_FAULT) layout = SIL_FAULT_TRAPNO; -- cgit v1.2.3 From c7fff9288dce1ee5fa9de8d656e09cc8e0e3281b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Apr 2021 17:53:38 -0500 Subject: signal: Remove the generic __ARCH_SI_TRAPNO support Now that __ARCH_SI_TRAPNO is no longer set by any architecture remove all of the code it enabled from the kernel. On alpha and sparc a more explict approach of using send_sig_fault_trapno or force_sig_fault_trapno in the very limited circumstances where si_trapno was set to a non-zero value. The generic support that is being removed always set si_trapno on all fault signals. With only SIGILL ILL_ILLTRAP on sparc and SIGFPE and SIGTRAP TRAP_UNK on alpla providing si_trapno values asking all senders of fault signals to provide an si_trapno value does not make sense. Making si_trapno an ordinary extension of the fault siginfo layout has enabled the architecture generic implementation of SIGTRAP TRAP_PERF, and enables other faulting signals to grow architecture generic senders as well. v1: https://lkml.kernel.org/r/m18s4zs7nu.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-8-ebiederm@xmission.com Link: https://lkml.kernel.org/r/87bl73xx6x.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ae06a424aa72..2181423e562a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1666,7 +1666,6 @@ void force_sigsegv(int sig) } int force_sig_fault_to_task(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t) { @@ -1677,9 +1676,6 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr info.si_errno = 0; info.si_code = code; info.si_addr = addr; -#ifdef __ARCH_SI_TRAPNO - info.si_trapno = trapno; -#endif #ifdef __ia64__ info.si_imm = imm; info.si_flags = flags; @@ -1689,16 +1685,13 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr } int force_sig_fault(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)) { return force_sig_fault_to_task(sig, code, addr - ___ARCH_SI_TRAPNO(trapno) ___ARCH_SI_IA64(imm, flags, isr), current); } int send_sig_fault(int sig, int code, void __user *addr - ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t) { @@ -1709,9 +1702,6 @@ int send_sig_fault(int sig, int code, void __user *addr info.si_errno = 0; info.si_code = code; info.si_addr = addr; -#ifdef __ARCH_SI_TRAPNO - info.si_trapno = trapno; -#endif #ifdef __ia64__ info.si_imm = imm; info.si_flags = flags; @@ -3283,10 +3273,6 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code) ((sig == SIGFPE) || ((sig == SIGTRAP) && (si_code == TRAP_UNK)))) layout = SIL_FAULT_TRAPNO; -#ifdef __ARCH_SI_TRAPNO - else if (layout == SIL_FAULT) - layout = SIL_FAULT_TRAPNO; -#endif } else if (si_code <= NSIGPOLL) layout = SIL_POLL; -- cgit v1.2.3 From f4ac73023449e6f2f74f69e38f4840c83edfa840 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Apr 2021 17:58:56 -0500 Subject: signal: Rename SIL_PERF_EVENT SIL_FAULT_PERF_EVENT for consistency It helps to know which part of the siginfo structure the siginfo_layout value is talking about. v1: https://lkml.kernel.org/r/m18s4zs7nu.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-9-ebiederm@xmission.com Link: https://lkml.kernel.org/r/87zgumw8cc.fsf_-_@disp2133 Acked-by: Marco Elver Signed-off-by: Eric W. Biederman --- kernel/signal.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2181423e562a..332b21f2fe72 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1213,7 +1213,7 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: - case SIL_PERF_EVENT: + case SIL_FAULT_PERF_EVENT: case SIL_SYS: ret = false; break; @@ -2580,7 +2580,7 @@ static void hide_si_addr_tag_bits(struct ksignal *ksig) case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: - case SIL_PERF_EVENT: + case SIL_FAULT_PERF_EVENT: ksig->info.si_addr = arch_untagged_si_addr( ksig->info.si_addr, ksig->sig, ksig->info.si_code); break; @@ -3265,7 +3265,7 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code) layout = SIL_FAULT_PKUERR; #endif else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) - layout = SIL_PERF_EVENT; + layout = SIL_FAULT_PERF_EVENT; else if (IS_ENABLED(CONFIG_SPARC) && (sig == SIGILL) && (si_code == ILL_ILLTRP)) layout = SIL_FAULT_TRAPNO; @@ -3394,7 +3394,7 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, to->si_addr = ptr_to_compat(from->si_addr); to->si_pkey = from->si_pkey; break; - case SIL_PERF_EVENT: + case SIL_FAULT_PERF_EVENT: to->si_addr = ptr_to_compat(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; @@ -3471,7 +3471,7 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, to->si_addr = compat_ptr(from->si_addr); to->si_pkey = from->si_pkey; break; - case SIL_PERF_EVENT: + case SIL_FAULT_PERF_EVENT: to->si_addr = compat_ptr(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; -- cgit v1.2.3 From 3cee6fb8e69ecd79be891c89a94974c48a25a437 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:19 -0700 Subject: bpf: tcp: Support bpf_(get|set)sockopt in bpf tcp iter This patch allows bpf tcp iter to call bpf_(get|set)sockopt. To allow a specific bpf iter (tcp here) to call a set of helpers, get_func_proto function pointer is added to bpf_iter_reg. The bpf iter is a tracing prog which currently requires CAP_PERFMON or CAP_SYS_ADMIN, so this patch does not impose other capability checks for bpf_(get|set)sockopt. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200619.1036715-1-kafai@fb.com --- kernel/bpf/bpf_iter.c | 22 ++++++++++++++++++++++ kernel/trace/bpf_trace.c | 7 ++++++- 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 2d4fbdbb194e..2e9d47bb40ff 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -360,6 +360,28 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) return supported; } +const struct bpf_func_proto * +bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_iter_target_info *tinfo; + const struct bpf_func_proto *fn = NULL; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog->aux->attach_btf_id) { + const struct bpf_iter_reg *reg_info; + + reg_info = tinfo->reg_info; + if (reg_info->get_func_proto) + fn = reg_info->get_func_proto(func_id, prog); + break; + } + } + mutex_unlock(&targets_mutex); + + return fn; +} + static void bpf_iter_link_release(struct bpf_link *link) { struct bpf_iter_link *iter_link = diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1f22ce1fa971..c5e0b6a64091 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1461,6 +1461,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *fn; + switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_skb_output: @@ -1501,7 +1503,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_d_path: return &bpf_d_path_proto; default: - return raw_tp_prog_func_proto(func_id, prog); + fn = raw_tp_prog_func_proto(func_id, prog); + if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) + fn = bpf_iter_get_func_proto(func_id, prog); + return fn; } } -- cgit v1.2.3 From 463e862ac63ef27fca423782536f6465abc3f180 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:24 +0100 Subject: swiotlb: Convert io_default_tlb_mem to static allocation Since commit 69031f500865 ("swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used"), 'struct device' may hold a copy of the global 'io_default_tlb_mem' pointer if the device is using swiotlb for DMA. A subsequent call to swiotlb_exit() will therefore leave dangling pointers behind in these device structures, resulting in KASAN splats such as: | BUG: KASAN: use-after-free in __iommu_dma_unmap_swiotlb+0x64/0xb0 | Read of size 8 at addr ffff8881d7830000 by task swapper/0/0 | | CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.12.0-rc3-debug #1 | Hardware name: HP HP Desktop M01-F1xxx/87D6, BIOS F.12 12/17/2020 | Call Trace: | | dump_stack+0x9c/0xcf | print_address_description.constprop.0+0x18/0x130 | kasan_report.cold+0x7f/0x111 | __iommu_dma_unmap_swiotlb+0x64/0xb0 | nvme_pci_complete_rq+0x73/0x130 | blk_complete_reqs+0x6f/0x80 | __do_softirq+0xfc/0x3be Convert 'io_default_tlb_mem' to a static structure, so that the per-device pointers remain valid after swiotlb_exit() has been invoked. All users are updated to reference the static structure directly, using the 'nslabs' field to determine whether swiotlb has been initialised. The 'slots' array is still allocated dynamically and referenced via a pointer rather than a flexible array member. Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Cc: Konrad Rzeszutek Wilk Fixes: 69031f500865 ("swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used") Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 66 ++++++++++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f1a9ae7fad8f..7948f274f9bb 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -70,7 +70,7 @@ enum swiotlb_force swiotlb_force; -struct io_tlb_mem *io_tlb_default_mem; +struct io_tlb_mem io_tlb_default_mem; /* * Max segment that we can provide which (if pages are contingous) will @@ -101,7 +101,7 @@ early_param("swiotlb", setup_io_tlb_npages); unsigned int swiotlb_max_segment(void) { - return io_tlb_default_mem ? max_segment : 0; + return io_tlb_default_mem.nslabs ? max_segment : 0; } EXPORT_SYMBOL_GPL(swiotlb_max_segment); @@ -134,9 +134,9 @@ void __init swiotlb_adjust_size(unsigned long size) void swiotlb_print_info(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; - if (!mem) { + if (!mem->nslabs) { pr_warn("No low mem\n"); return; } @@ -163,11 +163,11 @@ static inline unsigned long nr_slots(u64 val) */ void __init swiotlb_update_mem_attributes(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; void *vaddr; unsigned long bytes; - if (!mem || mem->late_alloc) + if (!mem->nslabs || mem->late_alloc) return; vaddr = phys_to_virt(mem->start); bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); @@ -201,25 +201,24 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - struct io_tlb_mem *mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; size_t alloc_size; if (swiotlb_force == SWIOTLB_NO_FORCE) return 0; /* protect against double initialization */ - if (WARN_ON_ONCE(io_tlb_default_mem)) + if (WARN_ON_ONCE(mem->nslabs)) return -ENOMEM; - alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs)); - mem = memblock_alloc(alloc_size, PAGE_SIZE); - if (!mem) + alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); + mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); + if (!mem->slots) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); - io_tlb_default_mem = mem; if (verbose) swiotlb_print_info(); swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); @@ -304,26 +303,24 @@ swiotlb_late_init_with_default_size(size_t default_size) int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { - struct io_tlb_mem *mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long bytes = nslabs << IO_TLB_SHIFT; if (swiotlb_force == SWIOTLB_NO_FORCE) return 0; /* protect against double initialization */ - if (WARN_ON_ONCE(io_tlb_default_mem)) + if (WARN_ON_ONCE(mem->nslabs)) return -ENOMEM; - mem = (void *)__get_free_pages(GFP_KERNEL, - get_order(struct_size(mem, slots, nslabs))); - if (!mem) + mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(array_size(sizeof(*mem->slots), nslabs))); + if (!mem->slots) return -ENOMEM; - memset(mem, 0, sizeof(*mem)); set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true); - io_tlb_default_mem = mem; swiotlb_print_info(); swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; @@ -331,18 +328,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) void __init swiotlb_exit(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; size_t size; + struct io_tlb_mem *mem = &io_tlb_default_mem; - if (!mem) + if (!mem->nslabs) return; - size = struct_size(mem, slots, mem->nslabs); + size = array_size(sizeof(*mem->slots), mem->nslabs); if (mem->late_alloc) - free_pages((unsigned long)mem, get_order(size)); + free_pages((unsigned long)mem->slots, get_order(size)); else - memblock_free_late(__pa(mem), PAGE_ALIGN(size)); - io_tlb_default_mem = NULL; + memblock_free_late(__pa(mem->slots), PAGE_ALIGN(size)); + memset(mem, 0, sizeof(*mem)); } /* @@ -696,7 +693,9 @@ size_t swiotlb_max_mapping_size(struct device *dev) bool is_swiotlb_active(struct device *dev) { - return dev->dma_io_tlb_mem != NULL; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + + return mem && mem->nslabs; } EXPORT_SYMBOL_GPL(is_swiotlb_active); @@ -711,10 +710,10 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem) static int __init swiotlb_create_default_debugfs(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; debugfs_dir = debugfs_create_dir("swiotlb", NULL); - if (mem) { + if (mem->nslabs) { mem->debugfs = debugfs_dir; swiotlb_create_debugfs_files(mem); } @@ -783,10 +782,17 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, * to it. */ if (!mem) { - mem = kzalloc(struct_size(mem, slots, nslabs), GFP_KERNEL); + mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) return -ENOMEM; + mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs), + GFP_KERNEL); + if (!mem->slots) { + kfree(mem); + return -ENOMEM; + } + set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false); @@ -806,7 +812,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, static void rmem_swiotlb_device_release(struct reserved_mem *rmem, struct device *dev) { - dev->dma_io_tlb_mem = io_tlb_default_mem; + dev->dma_io_tlb_mem = &io_tlb_default_mem; } static const struct reserved_mem_ops rmem_swiotlb_ops = { -- cgit v1.2.3 From 1efd3fc0ccf52e1aa5f0bf5b0d82847180d20951 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:25 +0100 Subject: swiotlb: Emit diagnostic in swiotlb_exit() A recent debugging session would have been made a little bit easier if we had noticed sooner that swiotlb_exit() was being called during boot. Add a simple diagnostic message to swiotlb_exit() to complement the one from swiotlb_print_info() during initialisation. Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Link: https://lore.kernel.org/r/20210705190352.GA19461@willie-the-truck Suggested-by: Konrad Rzeszutek Wilk Tested-by: Nathan Chancellor Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 7948f274f9bb..b3c793ed9e64 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -334,6 +334,7 @@ void __init swiotlb_exit(void) if (!mem->nslabs) return; + pr_info("tearing down default memory pool\n"); size = array_size(sizeof(*mem->slots), mem->nslabs); if (mem->late_alloc) free_pages((unsigned long)mem->slots, get_order(size)); -- cgit v1.2.3 From ad6c00283163cb7ad52cdf97d2850547446f7d98 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:26 +0100 Subject: swiotlb: Free tbl memory in swiotlb_exit() Although swiotlb_exit() frees the 'slots' metadata array referenced by 'io_tlb_default_mem', it leaves the underlying buffer pages allocated despite no longer being usable. Extend swiotlb_exit() to free the buffer pages as well as the slots array. Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Cc: Konrad Rzeszutek Wilk Tested-by: Nathan Chancellor Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b3c793ed9e64..87c40517e822 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -328,18 +328,27 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) void __init swiotlb_exit(void) { - size_t size; struct io_tlb_mem *mem = &io_tlb_default_mem; + unsigned long tbl_vaddr; + size_t tbl_size, slots_size; if (!mem->nslabs) return; pr_info("tearing down default memory pool\n"); - size = array_size(sizeof(*mem->slots), mem->nslabs); - if (mem->late_alloc) - free_pages((unsigned long)mem->slots, get_order(size)); - else - memblock_free_late(__pa(mem->slots), PAGE_ALIGN(size)); + tbl_vaddr = (unsigned long)phys_to_virt(mem->start); + tbl_size = PAGE_ALIGN(mem->end - mem->start); + slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs)); + + set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT); + if (mem->late_alloc) { + free_pages(tbl_vaddr, get_order(tbl_size)); + free_pages((unsigned long)mem->slots, get_order(slots_size)); + } else { + memblock_free_late(mem->start, tbl_size); + memblock_free_late(__pa(mem->slots), slots_size); + } + memset(mem, 0, sizeof(*mem)); } -- cgit v1.2.3 From 0f0aa84850a4105401723c6c0eeb61c2e67c869a Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 23 Jul 2021 09:47:07 +0200 Subject: printk/index: Fix warning about missing prototypes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 337015573718b161 ("printk: Userspace format indexing support") triggered the following build failure: kernel/printk/index.c:140:6: warning: no previous prototype for ‘pi_create_file’ [-Wmissing-prototypes] void pi_create_file(struct module *mod) ^~~~~~~~~~~~~~ kernel/printk/index.c:146:6: warning: no previous prototype for ‘pi_remove_file’ [-Wmissing-prototypes] void pi_remove_file(struct module *mod) ^~~~~~~~~~~~~~ Fixes: 337015573718b161 ("printk: Userspace format indexing support") Reported-by: kernel test robot Suggested-by: Chris Down [pmladek@suse.com: Let the compiler decide about inlining.] Signed-off-by: Petr Mladek Link: https://lore.kernel.org/lkml/YPql089IwSpudw%2F1@alley/ --- kernel/printk/index.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/index.c b/kernel/printk/index.c index ca062f5e1779..58d27272f992 100644 --- a/kernel/printk/index.c +++ b/kernel/printk/index.c @@ -137,13 +137,13 @@ static const char *pi_get_module_name(struct module *mod) } #endif -void pi_create_file(struct module *mod) +static void pi_create_file(struct module *mod) { debugfs_create_file(pi_get_module_name(mod), 0444, dfs_index, mod, &dfs_index_fops); } -void pi_remove_file(struct module *mod) +static void pi_remove_file(struct module *mod) { debugfs_remove(debugfs_lookup(pi_get_module_name(mod), dfs_index)); } -- cgit v1.2.3 From 7d9e2661f268585ca24ab4edbc1e2925b08374b2 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Sun, 25 Jul 2021 15:16:00 -0600 Subject: printk: Move the printk() kerneldoc comment to its new home Commit 337015573718 ("printk: Userspace format indexing support") turned printk() into a macro, but left the kerneldoc comment for it with the (now) _printk() function, resulting in this docs-build warning: kernel/printk/printk.c:1: warning: 'printk' not found Move the kerneldoc comment back next to the (now) macro it's meant to describe and have the docs build find it there. Fixes: 337015573718b161 ("printk: Userspace format indexing support") Signed-off-by: Jonathan Corbet Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/87o8aqt7qn.fsf@meer.lwn.net --- kernel/printk/printk.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 765f7af6ce56..8030c670f0bc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2183,30 +2183,6 @@ int vprintk_default(const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(vprintk_default); -/** - * _printk - print a kernel message - * @fmt: format string - * - * This is _printk(). It can be called from any context. We want it to work. - * - * If printk indexing is enabled, _printk() is called from printk_index_wrap. - * Otherwise, printk is simply #defined to _printk. - * - * We try to grab the console_lock. If we succeed, it's easy - we log the - * output and call the console drivers. If we fail to get the semaphore, we - * place the output into the log buffer and return. The current holder of - * the console_sem will notice the new output in console_unlock(); and will - * send it to the consoles before releasing the lock. - * - * One effect of this deferred printing is that code which calls printk() and - * then changes console_loglevel may break. This is because console_loglevel - * is inspected when the actual printing occurs. - * - * See also: - * printf(3) - * - * See the vsnprintf() documentation for format string extensions over C99. - */ asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; -- cgit v1.2.3 From 002eb6ad075142e5940122c7fcee71cf1e906e29 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 15 Jul 2021 21:39:55 +0206 Subject: printk: track/limit recursion Currently the printk safe buffers provide a form of recursion protection by redirecting to the safe buffers whenever printk() is recursively called. In preparation for removal of the safe buffers, provide an alternate explicit recursion protection. Recursion is limited to 3 levels per-CPU and per-context. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210715193359.25946-3-john.ogness@linutronix.de --- kernel/printk/printk.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 83 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 142a58d124d9..7fa0b4d91975 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1940,6 +1940,76 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, } } +/* + * Recursion is tracked separately on each CPU. If NMIs are supported, an + * additional NMI context per CPU is also separately tracked. Until per-CPU + * is available, a separate "early tracking" is performed. + */ +static DEFINE_PER_CPU(u8, printk_count); +static u8 printk_count_early; +#ifdef CONFIG_HAVE_NMI +static DEFINE_PER_CPU(u8, printk_count_nmi); +static u8 printk_count_nmi_early; +#endif + +/* + * Recursion is limited to keep the output sane. printk() should not require + * more than 1 level of recursion (allowing, for example, printk() to trigger + * a WARN), but a higher value is used in case some printk-internal errors + * exist, such as the ringbuffer validation checks failing. + */ +#define PRINTK_MAX_RECURSION 3 + +/* + * Return a pointer to the dedicated counter for the CPU+context of the + * caller. + */ +static u8 *__printk_recursion_counter(void) +{ +#ifdef CONFIG_HAVE_NMI + if (in_nmi()) { + if (printk_percpu_data_ready()) + return this_cpu_ptr(&printk_count_nmi); + return &printk_count_nmi_early; + } +#endif + if (printk_percpu_data_ready()) + return this_cpu_ptr(&printk_count); + return &printk_count_early; +} + +/* + * Enter recursion tracking. Interrupts are disabled to simplify tracking. + * The caller must check the boolean return value to see if the recursion is + * allowed. On failure, interrupts are not disabled. + * + * @recursion_ptr must be a variable of type (u8 *) and is the same variable + * that is passed to printk_exit_irqrestore(). + */ +#define printk_enter_irqsave(recursion_ptr, flags) \ +({ \ + bool success = true; \ + \ + typecheck(u8 *, recursion_ptr); \ + local_irq_save(flags); \ + (recursion_ptr) = __printk_recursion_counter(); \ + if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \ + local_irq_restore(flags); \ + success = false; \ + } else { \ + (*(recursion_ptr))++; \ + } \ + success; \ +}) + +/* Exit recursion tracking, restoring interrupts. */ +#define printk_exit_irqrestore(recursion_ptr, flags) \ + do { \ + typecheck(u8 *, recursion_ptr); \ + (*(recursion_ptr))--; \ + local_irq_restore(flags); \ + } while (0) + int printk_delay_msec __read_mostly; static inline void printk_delay(void) @@ -2040,11 +2110,14 @@ int vprintk_store(int facility, int level, struct prb_reserved_entry e; enum log_flags lflags = 0; struct printk_record r; + unsigned long irqflags; u16 trunc_msg_len = 0; char prefix_buf[8]; + u8 *recursion_ptr; u16 reserve_size; va_list args2; u16 text_len; + int ret = 0; u64 ts_nsec; /* @@ -2055,6 +2128,9 @@ int vprintk_store(int facility, int level, */ ts_nsec = local_clock(); + if (!printk_enter_irqsave(recursion_ptr, irqflags)) + return 0; + /* * The sprintf needs to come first since the syslog prefix might be * passed in as a parameter. An extra byte must be reserved so that @@ -2092,7 +2168,8 @@ int vprintk_store(int facility, int level, prb_commit(&e); } - return text_len; + ret = text_len; + goto out; } } @@ -2108,7 +2185,7 @@ int vprintk_store(int facility, int level, prb_rec_init_wr(&r, reserve_size + trunc_msg_len); if (!prb_reserve(&e, prb, &r)) - return 0; + goto out; } /* fill message */ @@ -2130,7 +2207,10 @@ int vprintk_store(int facility, int level, else prb_final_commit(&e); - return (text_len + trunc_msg_len); + ret = text_len + trunc_msg_len; +out: + printk_exit_irqrestore(recursion_ptr, irqflags); + return ret; } asmlinkage int vprintk_emit(int facility, int level, -- cgit v1.2.3 From 93d102f094be9beab28e5afb656c188b16a3793b Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 15 Jul 2021 21:39:56 +0206 Subject: printk: remove safe buffers With @logbuf_lock removed, the high level printk functions for storing messages are lockless. Messages can be stored from any context, so there is no need for the NMI and safe buffers anymore. Remove the NMI and safe buffers. Although the safe buffers are removed, the NMI and safe context tracking is still in place. In these contexts, store the message immediately but still use irq_work to defer the console printing. Since printk recursion tracking is in place, safe context tracking for most of printk is not needed. Remove it. Only safe context tracking relating to the console and console_owner locks is left in place. This is because the console and console_owner locks are needed for the actual printing. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210715193359.25946-4-john.ogness@linutronix.de --- kernel/kexec_core.c | 1 - kernel/panic.c | 3 - kernel/printk/internal.h | 17 --- kernel/printk/printk.c | 120 ++++++---------- kernel/printk/printk_safe.c | 335 +------------------------------------------- 5 files changed, 48 insertions(+), 428 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index f099baee3578..69c6e9b7761c 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -978,7 +978,6 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ - printk_safe_flush_on_panic(); __crash_kexec(regs); /* diff --git a/kernel/panic.c b/kernel/panic.c index 332736a72a58..1f0df42f8d0c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -247,7 +247,6 @@ void panic(const char *fmt, ...) * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!_crash_kexec_post_notifiers) { - printk_safe_flush_on_panic(); __crash_kexec(NULL); /* @@ -271,8 +270,6 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - /* Call flush even twice. It tries harder with a single online CPU */ - printk_safe_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 51615c909b2f..6cc35c5de890 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -22,7 +22,6 @@ __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); -void printk_safe_init(void); bool printk_percpu_data_ready(void); #define printk_safe_enter_irqsave(flags) \ @@ -37,18 +36,6 @@ bool printk_percpu_data_ready(void); local_irq_restore(flags); \ } while (0) -#define printk_safe_enter_irq() \ - do { \ - local_irq_disable(); \ - __printk_safe_enter(); \ - } while (0) - -#define printk_safe_exit_irq() \ - do { \ - __printk_safe_exit(); \ - local_irq_enable(); \ - } while (0) - void defer_console_output(void); #else @@ -61,9 +48,5 @@ void defer_console_output(void); #define printk_safe_enter_irqsave(flags) local_irq_save(flags) #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) -#define printk_safe_enter_irq() local_irq_disable() -#define printk_safe_exit_irq() local_irq_enable() - -static inline void printk_safe_init(void) { } static inline bool printk_percpu_data_ready(void) { return false; } #endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7fa0b4d91975..219ad710a9e8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -732,27 +732,22 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; - printk_safe_enter_irq(); if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - printk_safe_exit_irq(); goto out; } - printk_safe_exit_irq(); ret = wait_event_interruptible(log_wait, prb_read_valid(prb, atomic64_read(&user->seq), r)); if (ret) goto out; - printk_safe_enter_irq(); } if (r->info->seq != atomic64_read(&user->seq)) { /* our last seen message is gone, return error and reset */ atomic64_set(&user->seq, r->info->seq); ret = -EPIPE; - printk_safe_exit_irq(); goto out; } @@ -762,7 +757,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, &r->info->dev_info); atomic64_set(&user->seq, r->info->seq + 1); - printk_safe_exit_irq(); if (len > count) { ret = -EINVAL; @@ -797,7 +791,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - printk_safe_enter_irq(); switch (whence) { case SEEK_SET: /* the first record */ @@ -818,7 +811,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) default: ret = -EINVAL; } - printk_safe_exit_irq(); return ret; } @@ -833,7 +825,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - printk_safe_enter_irq(); if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ if (info.seq != atomic64_read(&user->seq)) @@ -841,7 +832,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) else ret = EPOLLIN|EPOLLRDNORM; } - printk_safe_exit_irq(); return ret; } @@ -874,9 +864,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) prb_rec_init_rd(&user->record, &user->info, &user->text_buf[0], sizeof(user->text_buf)); - printk_safe_enter_irq(); atomic64_set(&user->seq, prb_first_valid_seq(prb)); - printk_safe_exit_irq(); file->private_data = user; return 0; @@ -1042,9 +1030,6 @@ static inline void log_buf_add_cpu(void) {} static void __init set_percpu_data_ready(void) { - printk_safe_init(); - /* Make sure we set this flag only after printk_safe() init is done */ - barrier(); __printk_percpu_data_ready = true; } @@ -1082,6 +1067,7 @@ void __init setup_log_buf(int early) struct prb_desc *new_descs; struct printk_info info; struct printk_record r; + unsigned int text_size; size_t new_descs_size; size_t new_infos_size; unsigned long flags; @@ -1142,24 +1128,37 @@ void __init setup_log_buf(int early) new_descs, ilog2(new_descs_count), new_infos); - printk_safe_enter_irqsave(flags); + local_irq_save(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN; - prb_for_each_record(0, &printk_rb_static, seq, &r) - free -= add_to_rb(&printk_rb_dynamic, &r); + prb_for_each_record(0, &printk_rb_static, seq, &r) { + text_size = add_to_rb(&printk_rb_dynamic, &r); + if (text_size > free) + free = 0; + else + free -= text_size; + } - /* - * This is early enough that everything is still running on the - * boot CPU and interrupts are disabled. So no new messages will - * appear during the transition to the dynamic buffer. - */ prb = &printk_rb_dynamic; - printk_safe_exit_irqrestore(flags); + local_irq_restore(flags); + + /* + * Copy any remaining messages that might have appeared from + * NMI context after copying but before switching to the + * dynamic buffer. + */ + prb_for_each_record(seq, &printk_rb_static, seq, &r) { + text_size = add_to_rb(&printk_rb_dynamic, &r); + if (text_size > free) + free = 0; + else + free -= text_size; + } if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", @@ -1498,11 +1497,9 @@ static int syslog_print(char __user *buf, int size) size_t n; size_t skip; - printk_safe_enter_irq(); - raw_spin_lock(&syslog_lock); + raw_spin_lock_irq(&syslog_lock); if (!prb_read_valid(prb, syslog_seq, &r)) { - raw_spin_unlock(&syslog_lock); - printk_safe_exit_irq(); + raw_spin_unlock_irq(&syslog_lock); break; } if (r.info->seq != syslog_seq) { @@ -1531,8 +1528,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - raw_spin_unlock(&syslog_lock); - printk_safe_exit_irq(); + raw_spin_unlock_irq(&syslog_lock); if (!n) break; @@ -1566,7 +1562,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return -ENOMEM; time = printk_time; - printk_safe_enter_irq(); /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. @@ -1587,23 +1582,20 @@ static int syslog_print_all(char __user *buf, int size, bool clear) break; } - printk_safe_exit_irq(); if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; - printk_safe_enter_irq(); if (len < 0) break; } if (clear) { - raw_spin_lock(&syslog_lock); + raw_spin_lock_irq(&syslog_lock); latched_seq_write(&clear_seq, seq); - raw_spin_unlock(&syslog_lock); + raw_spin_unlock_irq(&syslog_lock); } - printk_safe_exit_irq(); kfree(text); return len; @@ -1611,11 +1603,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { - printk_safe_enter_irq(); - raw_spin_lock(&syslog_lock); + raw_spin_lock_irq(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); - raw_spin_unlock(&syslog_lock); - printk_safe_exit_irq(); + raw_spin_unlock_irq(&syslog_lock); } /* Return a consistent copy of @syslog_seq. */ @@ -1703,12 +1693,10 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - printk_safe_enter_irq(); - raw_spin_lock(&syslog_lock); + raw_spin_lock_irq(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ - raw_spin_unlock(&syslog_lock); - printk_safe_exit_irq(); + raw_spin_unlock_irq(&syslog_lock); return 0; } if (info.seq != syslog_seq) { @@ -1736,8 +1724,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } error -= syslog_partial; } - raw_spin_unlock(&syslog_lock); - printk_safe_exit_irq(); + raw_spin_unlock_irq(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -2219,7 +2206,6 @@ asmlinkage int vprintk_emit(int facility, int level, { int printed_len; bool in_sched = false; - unsigned long flags; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) @@ -2233,9 +2219,7 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - printk_safe_enter_irqsave(flags); printed_len = vprintk_store(facility, level, dev_info, fmt, args); - printk_safe_exit_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { @@ -2664,9 +2648,9 @@ again: for (;;) { size_t ext_len = 0; + int handover; size_t len; - printk_safe_enter_irqsave(flags); skip: if (!prb_read_valid(prb, console_seq, &r)) break; @@ -2716,19 +2700,22 @@ skip: * were to occur on another CPU, it may wait for this one to * finish. This task can not be preempted if there is a * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). */ + printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); - if (console_lock_spinning_disable_and_check()) { - printk_safe_exit_irqrestore(flags); - return; - } - + handover = console_lock_spinning_disable_and_check(); printk_safe_exit_irqrestore(flags); + if (handover) + return; if (do_cond_resched) cond_resched(); @@ -2745,8 +2732,6 @@ skip: * flush, no worries. */ retry = prb_read_valid(prb, console_seq, NULL); - printk_safe_exit_irqrestore(flags); - if (retry && console_trylock()) goto again; } @@ -2808,13 +2793,8 @@ void console_flush_on_panic(enum con_flush_mode mode) console_trylock(); console_may_schedule = 0; - if (mode == CONSOLE_REPLAY_ALL) { - unsigned long flags; - - printk_safe_enter_irqsave(flags); + if (mode == CONSOLE_REPLAY_ALL) console_seq = prb_first_valid_seq(prb); - printk_safe_exit_irqrestore(flags); - } console_unlock(); } @@ -3466,14 +3446,12 @@ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, struct printk_info info; unsigned int line_count; struct printk_record r; - unsigned long flags; size_t l = 0; bool ret = false; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; - printk_safe_enter_irqsave(flags); prb_rec_init_rd(&r, &info, line, size); /* Read text or count text lines? */ @@ -3494,7 +3472,6 @@ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, iter->cur_seq = r.info->seq + 1; ret = true; out: - printk_safe_exit_irqrestore(flags); if (len) *len = l; return ret; @@ -3526,7 +3503,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; struct printk_record r; - unsigned long flags; u64 seq; u64 next_seq; size_t len = 0; @@ -3539,7 +3515,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; - printk_safe_enter_irqsave(flags); if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ @@ -3548,10 +3523,8 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, } /* last entry */ - if (iter->cur_seq >= iter->next_seq) { - printk_safe_exit_irqrestore(flags); + if (iter->cur_seq >= iter->next_seq) goto out; - } /* * Find first record that fits, including all following records, @@ -3583,7 +3556,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, iter->next_seq = next_seq; ret = true; - printk_safe_exit_irqrestore(flags); out: if (len_out) *len_out = len; @@ -3601,12 +3573,8 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); */ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { - unsigned long flags; - - printk_safe_enter_irqsave(flags); iter->cur_seq = latched_seq_read_nolock(&clear_seq); iter->next_seq = prb_next_seq(prb); - printk_safe_exit_irqrestore(flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 94232186fccb..29c580dac93d 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -15,286 +15,9 @@ #include "internal.h" -/* - * In NMI and safe mode, printk() avoids taking locks. Instead, - * it uses an alternative implementation that temporary stores - * the strings into a per-CPU buffer. The content of the buffer - * is later flushed into the main ring buffer via IRQ work. - * - * The alternative implementation is chosen transparently - * by examining current printk() context mask stored in @printk_context - * per-CPU variable. - * - * The implementation allows to flush the strings also from another CPU. - * There are situations when we want to make sure that all buffers - * were handled or when IRQs are blocked. - */ - -#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ - sizeof(atomic_t) - \ - sizeof(atomic_t) - \ - sizeof(struct irq_work)) - -struct printk_safe_seq_buf { - atomic_t len; /* length of written data */ - atomic_t message_lost; - struct irq_work work; /* IRQ work that flushes the buffer */ - unsigned char buffer[SAFE_LOG_BUF_LEN]; -}; - -static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq); static DEFINE_PER_CPU(int, printk_context); -static DEFINE_RAW_SPINLOCK(safe_read_lock); - -#ifdef CONFIG_PRINTK_NMI -static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); -#endif - -/* Get flushed in a more safe context. */ -static void queue_flush_work(struct printk_safe_seq_buf *s) -{ - if (printk_percpu_data_ready()) - irq_work_queue(&s->work); -} - -/* - * Add a message to per-CPU context-dependent buffer. NMI and printk-safe - * have dedicated buffers, because otherwise printk-safe preempted by - * NMI-printk would have overwritten the NMI messages. - * - * The messages are flushed from irq work (or from panic()), possibly, - * from other CPU, concurrently with printk_safe_log_store(). Should this - * happen, printk_safe_log_store() will notice the buffer->len mismatch - * and repeat the write. - */ -static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, - const char *fmt, va_list args) -{ - int add; - size_t len; - va_list ap; - -again: - len = atomic_read(&s->len); - - /* The trailing '\0' is not counted into len. */ - if (len >= sizeof(s->buffer) - 1) { - atomic_inc(&s->message_lost); - queue_flush_work(s); - return 0; - } - - /* - * Make sure that all old data have been read before the buffer - * was reset. This is not needed when we just append data. - */ - if (!len) - smp_rmb(); - - va_copy(ap, args); - add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap); - va_end(ap); - if (!add) - return 0; - - /* - * Do it once again if the buffer has been flushed in the meantime. - * Note that atomic_cmpxchg() is an implicit memory barrier that - * makes sure that the data were written before updating s->len. - */ - if (atomic_cmpxchg(&s->len, len, len + add) != len) - goto again; - - queue_flush_work(s); - return add; -} - -static inline void printk_safe_flush_line(const char *text, int len) -{ - /* - * Avoid any console drivers calls from here, because we may be - * in NMI or printk_safe context (when in panic). The messages - * must go only into the ring buffer at this stage. Consoles will - * get explicitly called later when a crashdump is not generated. - */ - printk_deferred("%.*s", len, text); -} - -/* printk part of the temporary buffer line by line */ -static int printk_safe_flush_buffer(const char *start, size_t len) -{ - const char *c, *end; - bool header; - - c = start; - end = start + len; - header = true; - - /* Print line by line. */ - while (c < end) { - if (*c == '\n') { - printk_safe_flush_line(start, c - start + 1); - start = ++c; - header = true; - continue; - } - - /* Handle continuous lines or missing new line. */ - if ((c + 1 < end) && printk_get_level(c)) { - if (header) { - c = printk_skip_level(c); - continue; - } - - printk_safe_flush_line(start, c - start); - start = c++; - header = true; - continue; - } - - header = false; - c++; - } - - /* Check if there was a partial line. Ignore pure header. */ - if (start < end && !header) { - static const char newline[] = KERN_CONT "\n"; - - printk_safe_flush_line(start, end - start); - printk_safe_flush_line(newline, strlen(newline)); - } - - return len; -} - -static void report_message_lost(struct printk_safe_seq_buf *s) -{ - int lost = atomic_xchg(&s->message_lost, 0); - - if (lost) - printk_deferred("Lost %d message(s)!\n", lost); -} - -/* - * Flush data from the associated per-CPU buffer. The function - * can be called either via IRQ work or independently. - */ -static void __printk_safe_flush(struct irq_work *work) -{ - struct printk_safe_seq_buf *s = - container_of(work, struct printk_safe_seq_buf, work); - unsigned long flags; - size_t len; - int i; - - /* - * The lock has two functions. First, one reader has to flush all - * available message to make the lockless synchronization with - * writers easier. Second, we do not want to mix messages from - * different CPUs. This is especially important when printing - * a backtrace. - */ - raw_spin_lock_irqsave(&safe_read_lock, flags); - - i = 0; -more: - len = atomic_read(&s->len); - - /* - * This is just a paranoid check that nobody has manipulated - * the buffer an unexpected way. If we printed something then - * @len must only increase. Also it should never overflow the - * buffer size. - */ - if ((i && i >= len) || len > sizeof(s->buffer)) { - const char *msg = "printk_safe_flush: internal error\n"; - - printk_safe_flush_line(msg, strlen(msg)); - len = 0; - } - - if (!len) - goto out; /* Someone else has already flushed the buffer. */ - - /* Make sure that data has been written up to the @len */ - smp_rmb(); - i += printk_safe_flush_buffer(s->buffer + i, len - i); - - /* - * Check that nothing has got added in the meantime and truncate - * the buffer. Note that atomic_cmpxchg() is an implicit memory - * barrier that makes sure that the data were copied before - * updating s->len. - */ - if (atomic_cmpxchg(&s->len, len, 0) != len) - goto more; - -out: - report_message_lost(s); - raw_spin_unlock_irqrestore(&safe_read_lock, flags); -} - -/** - * printk_safe_flush - flush all per-cpu nmi buffers. - * - * The buffers are flushed automatically via IRQ work. This function - * is useful only when someone wants to be sure that all buffers have - * been flushed at some point. - */ -void printk_safe_flush(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { -#ifdef CONFIG_PRINTK_NMI - __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work); -#endif - __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work); - } -} - -/** - * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system - * goes down. - * - * Similar to printk_safe_flush() but it can be called even in NMI context when - * the system goes down. It does the best effort to get NMI messages into - * the main ring buffer. - * - * Note that it could try harder when there is only one CPU online. - */ -void printk_safe_flush_on_panic(void) -{ - /* - * Make sure that we could access the safe buffers. - * Do not risk a double release when more CPUs are up. - */ - if (raw_spin_is_locked(&safe_read_lock)) { - if (num_online_cpus() > 1) - return; - - debug_locks_off(); - raw_spin_lock_init(&safe_read_lock); - } - - printk_safe_flush(); -} - #ifdef CONFIG_PRINTK_NMI -/* - * Safe printk() for NMI context. It uses a per-CPU buffer to - * store the message. NMIs are not nested, so there is always only - * one writer running. But the buffer might get flushed from another - * CPU, so we need to be careful. - */ -static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) -{ - struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - - return printk_safe_log_store(s, fmt, args); -} - void noinstr printk_nmi_enter(void) { this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); @@ -309,9 +32,6 @@ void noinstr printk_nmi_exit(void) * Marks a code that might produce many messages in NMI context * and the risk of losing them is more critical than eventual * reordering. - * - * It has effect only when called in NMI context. Then printk() - * will store the messages into the main logbuf directly. */ void printk_nmi_direct_enter(void) { @@ -324,27 +44,8 @@ void printk_nmi_direct_exit(void) this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); } -#else - -static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) -{ - return 0; -} - #endif /* CONFIG_PRINTK_NMI */ -/* - * Lock-less printk(), to avoid deadlocks should the printk() recurse - * into itself. It uses a per-CPU buffer to store the message, just like - * NMI. - */ -static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args) -{ - struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); - - return printk_safe_log_store(s, fmt, args); -} - /* Can be preempted by NMI. */ void __printk_safe_enter(void) { @@ -369,46 +70,18 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. */ - if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) { - unsigned long flags; + if (this_cpu_read(printk_context) & + (PRINTK_NMI_DIRECT_CONTEXT_MASK | + PRINTK_NMI_CONTEXT_MASK | + PRINTK_SAFE_CONTEXT_MASK)) { int len; - printk_safe_enter_irqsave(flags); len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); - printk_safe_exit_irqrestore(flags); defer_console_output(); return len; } - /* Use extra buffer in NMI. */ - if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) - return vprintk_nmi(fmt, args); - - /* Use extra buffer to prevent a recursion deadlock in safe mode. */ - if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) - return vprintk_safe(fmt, args); - /* No obstacles. */ return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); - -void __init printk_safe_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct printk_safe_seq_buf *s; - - s = &per_cpu(safe_print_seq, cpu); - init_irq_work(&s->work, __printk_safe_flush); - -#ifdef CONFIG_PRINTK_NMI - s = &per_cpu(nmi_print_seq, cpu); - init_irq_work(&s->work, __printk_safe_flush); -#endif - } - - /* Flush pending messages that did not have scheduled IRQ works. */ - printk_safe_flush(); -} -- cgit v1.2.3 From 85e3e7fbbb720b9897fba9a99659e31cbd1c082e Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 15 Jul 2021 21:39:57 +0206 Subject: printk: remove NMI tracking All NMI contexts are handled the same as the safe context: store the message and defer printing. There is no need to have special NMI context tracking for this. Using in_nmi() is enough. There are several parts of the kernel that are manually calling into the printk NMI context tracking in order to cause general printk deferred printing: arch/arm/kernel/smp.c arch/powerpc/kexec/crash.c kernel/trace/trace.c For arm/kernel/smp.c and powerpc/kexec/crash.c, provide a new function pair printk_deferred_enter/exit that explicitly achieves the same objective. For ftrace, remove the printk context manipulation completely. It was added in commit 03fc7f9c99c1 ("printk/nmi: Prevent deadlock when accessing the main log buffer in NMI"). The purpose was to enforce storing messages directly into the ring buffer even in NMI context. It really should have only modified the behavior in NMI context. There is no need for a special behavior any longer. All messages are always stored directly now. The console deferring is handled transparently in vprintk(). Signed-off-by: John Ogness [pmladek@suse.com: Remove special handling in ftrace.c completely. Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210715193359.25946-5-john.ogness@linutronix.de --- kernel/printk/internal.h | 8 -------- kernel/printk/printk_safe.c | 37 +------------------------------------ kernel/trace/trace.c | 2 -- 3 files changed, 1 insertion(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 6cc35c5de890..b6d310c72fc9 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,12 +6,6 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff -#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 -#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 - -#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 - __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, @@ -19,8 +13,6 @@ int vprintk_store(int facility, int level, __printf(1, 0) int vprintk_default(const char *fmt, va_list args); __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); -void __printk_safe_enter(void); -void __printk_safe_exit(void); bool printk_percpu_data_ready(void); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 29c580dac93d..ef0f9a2044da 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -4,12 +4,9 @@ */ #include -#include -#include #include #include #include -#include #include #include @@ -17,35 +14,6 @@ static DEFINE_PER_CPU(int, printk_context); -#ifdef CONFIG_PRINTK_NMI -void noinstr printk_nmi_enter(void) -{ - this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); -} - -void noinstr printk_nmi_exit(void) -{ - this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); -} - -/* - * Marks a code that might produce many messages in NMI context - * and the risk of losing them is more critical than eventual - * reordering. - */ -void printk_nmi_direct_enter(void) -{ - if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) - this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK); -} - -void printk_nmi_direct_exit(void) -{ - this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); -} - -#endif /* CONFIG_PRINTK_NMI */ - /* Can be preempted by NMI. */ void __printk_safe_enter(void) { @@ -70,10 +38,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. */ - if (this_cpu_read(printk_context) & - (PRINTK_NMI_DIRECT_CONTEXT_MASK | - PRINTK_NMI_CONTEXT_MASK | - PRINTK_SAFE_CONTEXT_MASK)) { + if (this_cpu_read(printk_context) || in_nmi()) { int len; len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d23a09d3eb37..2f41311c61d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9647,7 +9647,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) tracing_off(); local_irq_save(flags); - printk_nmi_direct_enter(); /* Simulate the iterator */ trace_init_global_iter(&iter); @@ -9729,7 +9728,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } atomic_dec(&dump_running); - printk_nmi_direct_exit(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ftrace_dump); -- cgit v1.2.3 From b371cbb584d843bc4194d0cd4ce5ecd19b0cf55f Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 15 Jul 2021 21:39:58 +0206 Subject: printk: convert @syslog_lock to mutex @syslog_lock was a raw_spin_lock to simplify the transition of removing @logbuf_lock and the safe buffers. With that transition complete, and since all uses of @syslog_lock are within sleepable contexts, @syslog_lock can become a mutex. Note that until now register_console() would disable interrupts using irqsave, which implies that it may be called with interrupts disabled. And indeed, there is one possible call chain on parisc where this happens: handle_interruption(code=1) /* High-priority machine check (HPMC) */ pdc_console_restart() pdc_console_init_force() register_console() However, register_console() calls console_lock(), which might sleep. So it has never been allowed to call register_console() from an atomic context and the above call chain is a bug. Note that the removal of read_syslog_seq_irq() is slightly changing the behavior of SYSLOG_ACTION_READ by testing against a possibly outdated @seq value. However, the value of @seq could have changed after the test, so it is not a new window. A follow-up commit closes this window. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210715193359.25946-6-john.ogness@linutronix.de --- kernel/printk/printk.c | 49 ++++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 219ad710a9e8..86f1258d08b0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -356,7 +356,7 @@ enum log_flags { }; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ -static DEFINE_RAW_SPINLOCK(syslog_lock); +static DEFINE_MUTEX(syslog_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); @@ -1497,9 +1497,9 @@ static int syslog_print(char __user *buf, int size) size_t n; size_t skip; - raw_spin_lock_irq(&syslog_lock); + mutex_lock(&syslog_lock); if (!prb_read_valid(prb, syslog_seq, &r)) { - raw_spin_unlock_irq(&syslog_lock); + mutex_unlock(&syslog_lock); break; } if (r.info->seq != syslog_seq) { @@ -1528,7 +1528,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - raw_spin_unlock_irq(&syslog_lock); + mutex_unlock(&syslog_lock); if (!n) break; @@ -1592,9 +1592,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear) } if (clear) { - raw_spin_lock_irq(&syslog_lock); + mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, seq); - raw_spin_unlock_irq(&syslog_lock); + mutex_unlock(&syslog_lock); } kfree(text); @@ -1603,21 +1603,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { - raw_spin_lock_irq(&syslog_lock); + mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); - raw_spin_unlock_irq(&syslog_lock); -} - -/* Return a consistent copy of @syslog_seq. */ -static u64 read_syslog_seq_irq(void) -{ - u64 seq; - - raw_spin_lock_irq(&syslog_lock); - seq = syslog_seq; - raw_spin_unlock_irq(&syslog_lock); - - return seq; + mutex_unlock(&syslog_lock); } int do_syslog(int type, char __user *buf, int len, int source) @@ -1626,6 +1614,7 @@ int do_syslog(int type, char __user *buf, int len, int source) bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; + u64 seq; error = check_syslog_permissions(type, source); if (error) @@ -1644,8 +1633,12 @@ int do_syslog(int type, char __user *buf, int len, int source) if (!access_ok(buf, len)) return -EFAULT; - error = wait_event_interruptible(log_wait, - prb_read_valid(prb, read_syslog_seq_irq(), NULL)); + /* Get a consistent copy of @syslog_seq. */ + mutex_lock(&syslog_lock); + seq = syslog_seq; + mutex_unlock(&syslog_lock); + + error = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); if (error) return error; error = syslog_print(buf, len); @@ -1693,10 +1686,10 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - raw_spin_lock_irq(&syslog_lock); + mutex_lock(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ - raw_spin_unlock_irq(&syslog_lock); + mutex_unlock(&syslog_lock); return 0; } if (info.seq != syslog_seq) { @@ -1714,7 +1707,6 @@ int do_syslog(int type, char __user *buf, int len, int source) } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; - u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { @@ -1724,7 +1716,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } error -= syslog_partial; } - raw_spin_unlock_irq(&syslog_lock); + mutex_unlock(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -2929,7 +2921,6 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) */ void register_console(struct console *newcon) { - unsigned long flags; struct console *bcon = NULL; int err; @@ -3034,9 +3025,9 @@ void register_console(struct console *newcon) exclusive_console_stop_seq = console_seq; /* Get a consistent copy of @syslog_seq. */ - raw_spin_lock_irqsave(&syslog_lock, flags); + mutex_lock(&syslog_lock); console_seq = syslog_seq; - raw_spin_unlock_irqrestore(&syslog_lock, flags); + mutex_unlock(&syslog_lock); } console_unlock(); console_sysfs_notify(); -- cgit v1.2.3 From 8d909b2333f37e5da84a9e6a2cbe21f52be5f42a Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 15 Jul 2021 21:39:59 +0206 Subject: printk: syslog: close window between wait and read Syslog's SYSLOG_ACTION_READ is supposed to block until the next syslog record can be read, and then it should read that record. However, because @syslog_lock is not held between waking up and reading the record, another reader could read the record first, thus causing SYSLOG_ACTION_READ to return with a value of 0, never having read _anything_. By holding @syslog_lock between waking up and reading, it can be guaranteed that SYSLOG_ACTION_READ blocks until it successfully reads a syslog record (or a real error occurs). Signed-off-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210715193359.25946-7-john.ogness@linutronix.de --- kernel/printk/printk.c | 55 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 86f1258d08b0..65fffa6368c9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1480,12 +1480,14 @@ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, return seq; } +/* The caller is responsible for making sure @size is greater than 0. */ static int syslog_print(char __user *buf, int size) { struct printk_info info; struct printk_record r; char *text; int len = 0; + u64 seq; text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); if (!text) @@ -1493,15 +1495,35 @@ static int syslog_print(char __user *buf, int size) prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); - while (size > 0) { + mutex_lock(&syslog_lock); + + /* + * Wait for the @syslog_seq record to be available. @syslog_seq may + * change while waiting. + */ + do { + seq = syslog_seq; + + mutex_unlock(&syslog_lock); + len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); + mutex_lock(&syslog_lock); + + if (len) + goto out; + } while (syslog_seq != seq); + + /* + * Copy records that fit into the buffer. The above cycle makes sure + * that the first record is always available. + */ + do { size_t n; size_t skip; + int err; - mutex_lock(&syslog_lock); - if (!prb_read_valid(prb, syslog_seq, &r)) { - mutex_unlock(&syslog_lock); + if (!prb_read_valid(prb, syslog_seq, &r)) break; - } + if (r.info->seq != syslog_seq) { /* message is gone, move to next valid one */ syslog_seq = r.info->seq; @@ -1528,12 +1550,15 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - mutex_unlock(&syslog_lock); if (!n) break; - if (copy_to_user(buf, text + skip, n)) { + mutex_unlock(&syslog_lock); + err = copy_to_user(buf, text + skip, n); + mutex_lock(&syslog_lock); + + if (err) { if (!len) len = -EFAULT; break; @@ -1542,8 +1567,9 @@ static int syslog_print(char __user *buf, int size) len += n; size -= n; buf += n; - } - + } while (size); +out: + mutex_unlock(&syslog_lock); kfree(text); return len; } @@ -1614,7 +1640,6 @@ int do_syslog(int type, char __user *buf, int len, int source) bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; - u64 seq; error = check_syslog_permissions(type, source); if (error) @@ -1632,15 +1657,6 @@ int do_syslog(int type, char __user *buf, int len, int source) return 0; if (!access_ok(buf, len)) return -EFAULT; - - /* Get a consistent copy of @syslog_seq. */ - mutex_lock(&syslog_lock); - seq = syslog_seq; - mutex_unlock(&syslog_lock); - - error = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); - if (error) - return error; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ @@ -1707,6 +1723,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; + u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { -- cgit v1.2.3 From 0f3adb8a1e5f36e792598c1d77a2cfac9c90a4f9 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 20 Jul 2021 10:18:26 -0400 Subject: cgroup/cpuset: Miscellaneous code cleanup Use more descriptive variable names for update_prstate(), remove unnecessary code and fix some typos. There is no functional change. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index adb5190c4429..f5fef5516d99 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1114,7 +1114,7 @@ enum subparts_cmd { * cpus_allowed can be granted or an error code will be returned. * * For partcmd_disable, the cpuset is being transofrmed from a partition - * root back to a non-partition root. any CPUs in cpus_allowed that are in + * root back to a non-partition root. Any CPUs in cpus_allowed that are in * parent's subparts_cpus will be taken away from that cpumask and put back * into parent's effective_cpus. 0 should always be returned. * @@ -1225,7 +1225,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, /* * partcmd_update w/o newmask: * - * addmask = cpus_allowed & parent->effectiveb_cpus + * addmask = cpus_allowed & parent->effective_cpus * * Note that parent's subparts_cpus may have been * pre-shrunk in case there is a change in the cpu list. @@ -1365,12 +1365,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) case PRS_DISABLED: /* * If parent is not a partition root or an - * invalid partition root, clear the state - * state and the CS_CPU_EXCLUSIVE flag. + * invalid partition root, clear its state + * and its CS_CPU_EXCLUSIVE flag. */ WARN_ON_ONCE(cp->partition_root_state != PRS_ERROR); - cp->partition_root_state = 0; + cp->partition_root_state = PRS_DISABLED; /* * clear_bit() is an atomic operation and @@ -1937,30 +1937,28 @@ out: /* * update_prstate - update partititon_root_state - * cs: the cpuset to update - * val: 0 - disabled, 1 - enabled + * cs: the cpuset to update + * new_prs: new partition root state * * Call with cpuset_mutex held. */ -static int update_prstate(struct cpuset *cs, int val) +static int update_prstate(struct cpuset *cs, int new_prs) { int err; struct cpuset *parent = parent_cs(cs); - struct tmpmasks tmp; + struct tmpmasks tmpmask; - if ((val != 0) && (val != 1)) - return -EINVAL; - if (val == cs->partition_root_state) + if (new_prs == cs->partition_root_state) return 0; /* * Cannot force a partial or invalid partition root to a full * partition root. */ - if (val && cs->partition_root_state) + if (new_prs && (cs->partition_root_state < 0)) return -EINVAL; - if (alloc_cpumasks(NULL, &tmp)) + if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; err = -EINVAL; @@ -1978,7 +1976,7 @@ static int update_prstate(struct cpuset *cs, int val) goto out; err = update_parent_subparts_cpumask(cs, partcmd_enable, - NULL, &tmp); + NULL, &tmpmask); if (err) { update_flag(CS_CPU_EXCLUSIVE, cs, 0); goto out; @@ -1990,18 +1988,18 @@ static int update_prstate(struct cpuset *cs, int val) * CS_CPU_EXCLUSIVE bit. */ if (cs->partition_root_state == PRS_ERROR) { - cs->partition_root_state = 0; + cs->partition_root_state = PRS_DISABLED; update_flag(CS_CPU_EXCLUSIVE, cs, 0); err = 0; goto out; } err = update_parent_subparts_cpumask(cs, partcmd_disable, - NULL, &tmp); + NULL, &tmpmask); if (err) goto out; - cs->partition_root_state = 0; + cs->partition_root_state = PRS_DISABLED; /* Turning off CS_CPU_EXCLUSIVE will not return error */ update_flag(CS_CPU_EXCLUSIVE, cs, 0); @@ -2015,11 +2013,11 @@ static int update_prstate(struct cpuset *cs, int val) update_tasks_cpumask(parent); if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmp); + update_sibling_cpumasks(parent, cs, &tmpmask); rebuild_sched_domains_locked(); out: - free_cpumasks(NULL, &tmp); + free_cpumasks(NULL, &tmpmask); return err; } @@ -3060,7 +3058,7 @@ retry: goto retry; } - parent = parent_cs(cs); + parent = parent_cs(cs); compute_effective_cpumask(&new_cpus, cs, parent); nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); -- cgit v1.2.3 From 15d428e6fe77fffc3f4fff923336036f5496ef17 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 20 Jul 2021 10:18:27 -0400 Subject: cgroup/cpuset: Fix a partition bug with hotplug In cpuset_hotplug_workfn(), the detection of whether the cpu list has been changed is done by comparing the effective cpus of the top cpuset with the cpu_active_mask. However, in the rare case that just all the CPUs in the subparts_cpus are offlined, the detection fails and the partition states are not updated correctly. Fix it by forcing the cpus_updated flag to true in this particular case. Fixes: 4b842da276a8 ("cpuset: Make CPU hotplug work with partition") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f5fef5516d99..28a784bf64b1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3166,6 +3166,13 @@ static void cpuset_hotplug_workfn(struct work_struct *work) cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); + /* + * In the rare case that hotplug removes all the cpus in subparts_cpus, + * we assumed that cpus are updated. + */ + if (!cpus_updated && top_cpuset.nr_subparts_cpus) + cpus_updated = true; + /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { spin_lock_irq(&callback_lock); -- cgit v1.2.3 From 95f7f15461fa482a05237669507b4c9b06865b73 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Wed, 14 Jul 2021 11:26:20 +0530 Subject: kdb: Get rid of custom debug heap allocator Currently the only user for debug heap is kdbnearsym() which can be modified to rather use statically allocated buffer for symbol name as per it's current usage. So do that and hence remove custom debug heap allocator. Note that this change puts a restriction on kdbnearsym() callers to carefully use shared namebuf such that a caller should consume the symbol returned immediately prior to another call to fetch a different symbol. Also, this change uses standard KSYM_NAME_LEN macro for namebuf allocation instead of local variable: knt1_size which should avoid any conflicts caused by changes to KSYM_NAME_LEN macro value. This change has been tested using kgdbtest on arm64 which doesn't show any regressions. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210714055620.369915-1-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_debugger.c | 1 - kernel/debug/kdb/kdb_private.h | 5 - kernel/debug/kdb/kdb_support.c | 329 ++++------------------------------------ 3 files changed, 28 insertions(+), 307 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 0220afda3200..e91fc3e4edd5 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -140,7 +140,6 @@ int kdb_stub(struct kgdb_state *ks) */ kdb_common_deinit_state(); KDB_STATE_CLEAR(PAGER); - kdbnearsym_cleanup(); if (error == KDB_CMD_KGDB) { if (KDB_STATE(DOING_KGDB)) KDB_STATE_CLEAR(DOING_KGDB); diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 170c69aedebb..8dbc840113c9 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -109,7 +109,6 @@ extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, long *, char **); extern int kdbgetsymval(const char *, kdb_symtab_t *); extern int kdbnearsym(unsigned long, kdb_symtab_t *); -extern void kdbnearsym_cleanup(void); extern char *kdb_strdup(const char *str, gfp_t type); extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int); @@ -233,10 +232,6 @@ extern struct task_struct *kdb_curr_task(int); #define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL) -extern void *debug_kmalloc(size_t size, gfp_t flags); -extern void debug_kfree(void *); -extern void debug_kusage(void); - extern struct task_struct *kdb_current_task; extern struct pt_regs *kdb_current_regs; diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 9f50d22d68e6..c605b17b2a0d 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -52,48 +52,48 @@ int kdbgetsymval(const char *symname, kdb_symtab_t *symtab) } EXPORT_SYMBOL(kdbgetsymval); -static char *kdb_name_table[100]; /* arbitrary size */ - -/* - * kdbnearsym - Return the name of the symbol with the nearest address - * less than 'addr'. +/** + * kdbnearsym() - Return the name of the symbol with the nearest address + * less than @addr. + * @addr: Address to check for near symbol + * @symtab: Structure to receive results * - * Parameters: - * addr Address to check for symbol near - * symtab Structure to receive results - * Returns: - * 0 No sections contain this address, symtab zero filled - * 1 Address mapped to module/symbol/section, data in symtab - * Remarks: - * 2.6 kallsyms has a "feature" where it unpacks the name into a - * string. If that string is reused before the caller expects it - * then the caller sees its string change without warning. To - * avoid cluttering up the main kdb code with lots of kdb_strdup, - * tests and kfree calls, kdbnearsym maintains an LRU list of the - * last few unique strings. The list is sized large enough to - * hold active strings, no kdb caller of kdbnearsym makes more - * than ~20 later calls before using a saved value. + * WARNING: This function may return a pointer to a single statically + * allocated buffer (namebuf). kdb's unusual calling context (single + * threaded, all other CPUs halted) provides us sufficient locking for + * this to be safe. The only constraint imposed by the static buffer is + * that the caller must consume any previous reply prior to another call + * to lookup a new symbol. + * + * Note that, strictly speaking, some architectures may re-enter the kdb + * trap if the system turns out to be very badly damaged and this breaks + * the single-threaded assumption above. In these circumstances successful + * continuation and exit from the inner trap is unlikely to work and any + * user attempting this receives a prominent warning before being allowed + * to progress. In these circumstances we remain memory safe because + * namebuf[KSYM_NAME_LEN-1] will never change from '\0' although we do + * tolerate the possibility of garbled symbol display from the outer kdb + * trap. + * + * Return: + * * 0 - No sections contain this address, symtab zero filled + * * 1 - Address mapped to module/symbol/section, data in symtab */ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) { int ret = 0; unsigned long symbolsize = 0; unsigned long offset = 0; -#define knt1_size 128 /* must be >= kallsyms table size */ - char *knt1 = NULL; + static char namebuf[KSYM_NAME_LEN]; kdb_dbg_printf(AR, "addr=0x%lx, symtab=%px\n", addr, symtab); memset(symtab, 0, sizeof(*symtab)); if (addr < 4096) goto out; - knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC); - if (!knt1) { - kdb_func_printf("addr=0x%lx cannot kmalloc knt1\n", addr); - goto out; - } + symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset, - (char **)(&symtab->mod_name), knt1); + (char **)(&symtab->mod_name), namebuf); if (offset > 8*1024*1024) { symtab->sym_name = NULL; addr = offset = symbolsize = 0; @@ -102,63 +102,14 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) symtab->sym_end = symtab->sym_start + symbolsize; ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0'; - if (ret) { - int i; - /* Another 2.6 kallsyms "feature". Sometimes the sym_name is - * set but the buffer passed into kallsyms_lookup is not used, - * so it contains garbage. The caller has to work out which - * buffer needs to be saved. - * - * What was Rusty smoking when he wrote that code? - */ - if (symtab->sym_name != knt1) { - strncpy(knt1, symtab->sym_name, knt1_size); - knt1[knt1_size-1] = '\0'; - } - for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { - if (kdb_name_table[i] && - strcmp(kdb_name_table[i], knt1) == 0) - break; - } - if (i >= ARRAY_SIZE(kdb_name_table)) { - debug_kfree(kdb_name_table[0]); - memmove(kdb_name_table, kdb_name_table+1, - sizeof(kdb_name_table[0]) * - (ARRAY_SIZE(kdb_name_table)-1)); - } else { - debug_kfree(knt1); - knt1 = kdb_name_table[i]; - memmove(kdb_name_table+i, kdb_name_table+i+1, - sizeof(kdb_name_table[0]) * - (ARRAY_SIZE(kdb_name_table)-i-1)); - } - i = ARRAY_SIZE(kdb_name_table) - 1; - kdb_name_table[i] = knt1; - symtab->sym_name = kdb_name_table[i]; - knt1 = NULL; - } - if (symtab->mod_name == NULL) symtab->mod_name = "kernel"; kdb_dbg_printf(AR, "returns %d symtab->sym_start=0x%lx, symtab->mod_name=%px, symtab->sym_name=%px (%s)\n", ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name); - out: - debug_kfree(knt1); return ret; } -void kdbnearsym_cleanup(void) -{ - int i; - for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { - if (kdb_name_table[i]) { - debug_kfree(kdb_name_table[i]); - kdb_name_table[i] = NULL; - } - } -} - static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1]; /* @@ -656,230 +607,6 @@ unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask) return (mask & kdb_task_state_string(state)) != 0; } -/* Last ditch allocator for debugging, so we can still debug even when - * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned - * for space usage, not for speed. One smallish memory pool, the free - * chain is always in ascending address order to allow coalescing, - * allocations are done in brute force best fit. - */ - -struct debug_alloc_header { - u32 next; /* offset of next header from start of pool */ - u32 size; - void *caller; -}; - -/* The memory returned by this allocator must be aligned, which means - * so must the header size. Do not assume that sizeof(struct - * debug_alloc_header) is a multiple of the alignment, explicitly - * calculate the overhead of this header, including the alignment. - * The rest of this code must not use sizeof() on any header or - * pointer to a header. - */ -#define dah_align 8 -#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align) - -static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */ -static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned; -static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max; - -/* Locking is awkward. The debug code is called from all contexts, - * including non maskable interrupts. A normal spinlock is not safe - * in NMI context. Try to get the debug allocator lock, if it cannot - * be obtained after a second then give up. If the lock could not be - * previously obtained on this cpu then only try once. - * - * sparse has no annotation for "this function _sometimes_ acquires a - * lock", so fudge the acquire/release notation. - */ -static DEFINE_SPINLOCK(dap_lock); -static int get_dap_lock(void) - __acquires(dap_lock) -{ - static int dap_locked = -1; - int count; - if (dap_locked == smp_processor_id()) - count = 1; - else - count = 1000; - while (1) { - if (spin_trylock(&dap_lock)) { - dap_locked = -1; - return 1; - } - if (!count--) - break; - udelay(1000); - } - dap_locked = smp_processor_id(); - __acquire(dap_lock); - return 0; -} - -void *debug_kmalloc(size_t size, gfp_t flags) -{ - unsigned int rem, h_offset; - struct debug_alloc_header *best, *bestprev, *prev, *h; - void *p = NULL; - if (!get_dap_lock()) { - __release(dap_lock); /* we never actually got it */ - return NULL; - } - h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); - if (dah_first_call) { - h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead; - dah_first_call = 0; - } - size = ALIGN(size, dah_align); - prev = best = bestprev = NULL; - while (1) { - if (h->size >= size && (!best || h->size < best->size)) { - best = h; - bestprev = prev; - if (h->size == size) - break; - } - if (!h->next) - break; - prev = h; - h = (struct debug_alloc_header *)(debug_alloc_pool + h->next); - } - if (!best) - goto out; - rem = best->size - size; - /* The pool must always contain at least one header */ - if (best->next == 0 && bestprev == NULL && rem < dah_overhead) - goto out; - if (rem >= dah_overhead) { - best->size = size; - h_offset = ((char *)best - debug_alloc_pool) + - dah_overhead + best->size; - h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset); - h->size = rem - dah_overhead; - h->next = best->next; - } else - h_offset = best->next; - best->caller = __builtin_return_address(0); - dah_used += best->size; - dah_used_max = max(dah_used, dah_used_max); - if (bestprev) - bestprev->next = h_offset; - else - dah_first = h_offset; - p = (char *)best + dah_overhead; - memset(p, POISON_INUSE, best->size - 1); - *((char *)p + best->size - 1) = POISON_END; -out: - spin_unlock(&dap_lock); - return p; -} - -void debug_kfree(void *p) -{ - struct debug_alloc_header *h; - unsigned int h_offset; - if (!p) - return; - if ((char *)p < debug_alloc_pool || - (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) { - kfree(p); - return; - } - if (!get_dap_lock()) { - __release(dap_lock); /* we never actually got it */ - return; /* memory leak, cannot be helped */ - } - h = (struct debug_alloc_header *)((char *)p - dah_overhead); - memset(p, POISON_FREE, h->size - 1); - *((char *)p + h->size - 1) = POISON_END; - h->caller = NULL; - dah_used -= h->size; - h_offset = (char *)h - debug_alloc_pool; - if (h_offset < dah_first) { - h->next = dah_first; - dah_first = h_offset; - } else { - struct debug_alloc_header *prev; - unsigned int prev_offset; - prev = (struct debug_alloc_header *)(debug_alloc_pool + - dah_first); - while (1) { - if (!prev->next || prev->next > h_offset) - break; - prev = (struct debug_alloc_header *) - (debug_alloc_pool + prev->next); - } - prev_offset = (char *)prev - debug_alloc_pool; - if (prev_offset + dah_overhead + prev->size == h_offset) { - prev->size += dah_overhead + h->size; - memset(h, POISON_FREE, dah_overhead - 1); - *((char *)h + dah_overhead - 1) = POISON_END; - h = prev; - h_offset = prev_offset; - } else { - h->next = prev->next; - prev->next = h_offset; - } - } - if (h_offset + dah_overhead + h->size == h->next) { - struct debug_alloc_header *next; - next = (struct debug_alloc_header *) - (debug_alloc_pool + h->next); - h->size += dah_overhead + next->size; - h->next = next->next; - memset(next, POISON_FREE, dah_overhead - 1); - *((char *)next + dah_overhead - 1) = POISON_END; - } - spin_unlock(&dap_lock); -} - -void debug_kusage(void) -{ - struct debug_alloc_header *h_free, *h_used; -#ifdef CONFIG_IA64 - /* FIXME: using dah for ia64 unwind always results in a memory leak. - * Fix that memory leak first, then set debug_kusage_one_time = 1 for - * all architectures. - */ - static int debug_kusage_one_time; -#else - static int debug_kusage_one_time = 1; -#endif - if (!get_dap_lock()) { - __release(dap_lock); /* we never actually got it */ - return; - } - h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); - if (dah_first == 0 && - (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead || - dah_first_call)) - goto out; - if (!debug_kusage_one_time) - goto out; - debug_kusage_one_time = 0; - kdb_func_printf("debug_kmalloc memory leak dah_first %d\n", dah_first); - if (dah_first) { - h_used = (struct debug_alloc_header *)debug_alloc_pool; - kdb_func_printf("h_used %px size %d\n", h_used, h_used->size); - } - do { - h_used = (struct debug_alloc_header *) - ((char *)h_free + dah_overhead + h_free->size); - kdb_func_printf("h_used %px size %d caller %px\n", - h_used, h_used->size, h_used->caller); - h_free = (struct debug_alloc_header *) - (debug_alloc_pool + h_free->next); - } while (h_free->next); - h_used = (struct debug_alloc_header *) - ((char *)h_free + dah_overhead + h_free->size); - if ((char *)h_used - debug_alloc_pool != - sizeof(debug_alloc_pool_aligned)) - kdb_func_printf("h_used %px size %d caller %px\n", - h_used, h_used->size, h_used->caller); -out: - spin_unlock(&dap_lock); -} - /* Maintain a small stack of kdb_flags to allow recursion without disturbing * the global kdb state. */ -- cgit v1.2.3 From b39cded834154cf54442489b56b33d047edd6d8f Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:17 +0530 Subject: kdb: Rename struct defcmd_set to struct kdb_macro Rename struct defcmd_set to struct kdb_macro as that sounds more appropriate given its purpose. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-2-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index d8ee5647b732..5cf9867fa118 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -654,7 +654,7 @@ static void kdb_cmderror(int diag) * Returns: * zero for success, a kdb diagnostic if error */ -struct defcmd_set { +struct kdb_macro { int count; bool usable; char *name; @@ -662,8 +662,8 @@ struct defcmd_set { char *help; char **command; }; -static struct defcmd_set *defcmd_set; -static int defcmd_set_count; +static struct kdb_macro *kdb_macro; +static int kdb_macro_count; static bool defcmd_in_progress; /* Forward references */ @@ -671,7 +671,7 @@ static int kdb_exec_defcmd(int argc, const char **argv); static int kdb_defcmd2(const char *cmdstr, const char *argv0) { - struct defcmd_set *s = defcmd_set + defcmd_set_count - 1; + struct kdb_macro *s = kdb_macro + kdb_macro_count - 1; char **save_command = s->command; if (strcmp(argv0, "endefcmd") == 0) { defcmd_in_progress = false; @@ -704,7 +704,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) static int kdb_defcmd(int argc, const char **argv) { - struct defcmd_set *save_defcmd_set = defcmd_set, *s; + struct kdb_macro *save_kdb_macro = kdb_macro, *s; if (defcmd_in_progress) { kdb_printf("kdb: nested defcmd detected, assuming missing " "endefcmd\n"); @@ -712,7 +712,7 @@ static int kdb_defcmd(int argc, const char **argv) } if (argc == 0) { int i; - for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) { + for (s = kdb_macro; s < kdb_macro + kdb_macro_count; ++s) { kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name, s->usage, s->help); for (i = 0; i < s->count; ++i) @@ -727,13 +727,13 @@ static int kdb_defcmd(int argc, const char **argv) kdb_printf("Command only available during kdb_init()\n"); return KDB_NOTIMP; } - defcmd_set = kmalloc_array(defcmd_set_count + 1, sizeof(*defcmd_set), - GFP_KDB); - if (!defcmd_set) + kdb_macro = kmalloc_array(kdb_macro_count + 1, sizeof(*kdb_macro), + GFP_KDB); + if (!kdb_macro) goto fail_defcmd; - memcpy(defcmd_set, save_defcmd_set, - defcmd_set_count * sizeof(*defcmd_set)); - s = defcmd_set + defcmd_set_count; + memcpy(kdb_macro, save_kdb_macro, + kdb_macro_count * sizeof(*kdb_macro)); + s = kdb_macro + kdb_macro_count; memset(s, 0, sizeof(*s)); s->usable = true; s->name = kdb_strdup(argv[1], GFP_KDB); @@ -753,19 +753,19 @@ static int kdb_defcmd(int argc, const char **argv) strcpy(s->help, argv[3]+1); s->help[strlen(s->help)-1] = '\0'; } - ++defcmd_set_count; + ++kdb_macro_count; defcmd_in_progress = true; - kfree(save_defcmd_set); + kfree(save_kdb_macro); return 0; fail_help: kfree(s->usage); fail_usage: kfree(s->name); fail_name: - kfree(defcmd_set); + kfree(kdb_macro); fail_defcmd: - kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]); - defcmd_set = save_defcmd_set; + kdb_printf("Could not allocate new kdb_macro entry for %s\n", argv[1]); + kdb_macro = save_kdb_macro; return KDB_NOTIMP; } @@ -781,14 +781,14 @@ fail_defcmd: static int kdb_exec_defcmd(int argc, const char **argv) { int i, ret; - struct defcmd_set *s; + struct kdb_macro *s; if (argc != 0) return KDB_ARGCOUNT; - for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) { + for (s = kdb_macro, i = 0; i < kdb_macro_count; ++i, ++s) { if (strcmp(s->name, argv[0]) == 0) break; } - if (i == defcmd_set_count) { + if (i == kdb_macro_count) { kdb_printf("kdb_exec_defcmd: could not find commands for %s\n", argv[0]); return KDB_NOTIMP; -- cgit v1.2.3 From c25abcd625505f53b72dc156bac32b5120826742 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:18 +0530 Subject: kdb: Get rid of redundant kdb_register_flags() Commit e4f291b3f7bb ("kdb: Simplify kdb commands registration") allowed registration of pre-allocated kdb commands with pointer to struct kdbtab_t. Lets switch other users as well to register pre- allocated kdb commands via: - Changing prototype for kdb_register() to pass a pointer to struct kdbtab_t instead. - Embed kdbtab_t structure in kdb_macro_t rather than individual params. With these changes kdb_register_flags() becomes redundant and hence removed. Also, since we have switched all users to register pre-allocated commands, "is_dynamic" flag in struct kdbtab_t becomes redundant and hence removed as well. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Acked-by: Steven Rostedt (VMware) Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-3-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 167 +++++++++++++---------------------------- kernel/debug/kdb/kdb_private.h | 13 ---- kernel/trace/trace_kdb.c | 12 ++- 3 files changed, 62 insertions(+), 130 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5cf9867fa118..b2880fad26d4 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -657,9 +656,7 @@ static void kdb_cmderror(int diag) struct kdb_macro { int count; bool usable; - char *name; - char *usage; - char *help; + kdbtab_t cmd; char **command; }; static struct kdb_macro *kdb_macro; @@ -678,13 +675,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) if (!s->count) s->usable = false; if (s->usable) - /* macros are always safe because when executed each - * internal command re-enters kdb_parse() and is - * safety checked individually. - */ - kdb_register_flags(s->name, kdb_exec_defcmd, s->usage, - s->help, 0, - KDB_ENABLE_ALWAYS_SAFE); + kdb_register(&s->cmd); return 0; } if (!s->usable) @@ -705,6 +696,8 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) static int kdb_defcmd(int argc, const char **argv) { struct kdb_macro *save_kdb_macro = kdb_macro, *s; + kdbtab_t *mp; + if (defcmd_in_progress) { kdb_printf("kdb: nested defcmd detected, assuming missing " "endefcmd\n"); @@ -713,8 +706,8 @@ static int kdb_defcmd(int argc, const char **argv) if (argc == 0) { int i; for (s = kdb_macro; s < kdb_macro + kdb_macro_count; ++s) { - kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name, - s->usage, s->help); + kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->cmd.cmd_name, + s->cmd.cmd_usage, s->cmd.cmd_help); for (i = 0; i < s->count; ++i) kdb_printf("%s", s->command[i]); kdb_printf("endefcmd\n"); @@ -736,31 +729,36 @@ static int kdb_defcmd(int argc, const char **argv) s = kdb_macro + kdb_macro_count; memset(s, 0, sizeof(*s)); s->usable = true; - s->name = kdb_strdup(argv[1], GFP_KDB); - if (!s->name) + + mp = &s->cmd; + mp->cmd_func = kdb_exec_defcmd; + mp->cmd_minlen = 0; + mp->cmd_flags = KDB_ENABLE_ALWAYS_SAFE; + mp->cmd_name = kdb_strdup(argv[1], GFP_KDB); + if (!mp->cmd_name) goto fail_name; - s->usage = kdb_strdup(argv[2], GFP_KDB); - if (!s->usage) + mp->cmd_usage = kdb_strdup(argv[2], GFP_KDB); + if (!mp->cmd_usage) goto fail_usage; - s->help = kdb_strdup(argv[3], GFP_KDB); - if (!s->help) + mp->cmd_help = kdb_strdup(argv[3], GFP_KDB); + if (!mp->cmd_help) goto fail_help; - if (s->usage[0] == '"') { - strcpy(s->usage, argv[2]+1); - s->usage[strlen(s->usage)-1] = '\0'; + if (mp->cmd_usage[0] == '"') { + strcpy(mp->cmd_usage, argv[2]+1); + mp->cmd_usage[strlen(mp->cmd_usage)-1] = '\0'; } - if (s->help[0] == '"') { - strcpy(s->help, argv[3]+1); - s->help[strlen(s->help)-1] = '\0'; + if (mp->cmd_help[0] == '"') { + strcpy(mp->cmd_help, argv[3]+1); + mp->cmd_help[strlen(mp->cmd_help)-1] = '\0'; } ++kdb_macro_count; defcmd_in_progress = true; kfree(save_kdb_macro); return 0; fail_help: - kfree(s->usage); + kfree(mp->cmd_usage); fail_usage: - kfree(s->name); + kfree(mp->cmd_name); fail_name: kfree(kdb_macro); fail_defcmd: @@ -785,7 +783,7 @@ static int kdb_exec_defcmd(int argc, const char **argv) if (argc != 0) return KDB_ARGCOUNT; for (s = kdb_macro, i = 0; i < kdb_macro_count; ++i, ++s) { - if (strcmp(s->name, argv[0]) == 0) + if (strcmp(s->cmd.cmd_name, argv[0]) == 0) break; } if (i == kdb_macro_count) { @@ -797,7 +795,7 @@ static int kdb_exec_defcmd(int argc, const char **argv) /* Recursive use of kdb_parse, do not use argv after * this point */ argv = NULL; - kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]); + kdb_printf("[%s]kdb> %s\n", s->cmd.cmd_name, s->command[i]); ret = kdb_parse(s->command[i]); if (ret) return ret; @@ -2613,56 +2611,32 @@ static int kdb_grep_help(int argc, const char **argv) return 0; } -/* - * kdb_register_flags - This function is used to register a kernel - * debugger command. - * Inputs: - * cmd Command name - * func Function to execute the command - * usage A simple usage string showing arguments - * help A simple help string describing command - * repeat Does the command auto repeat on enter? - * Returns: - * zero for success, one if a duplicate command. +/** + * kdb_register() - This function is used to register a kernel debugger + * command. + * @cmd: pointer to kdb command + * + * Note that it's the job of the caller to keep the memory for the cmd + * allocated until unregister is called. */ -int kdb_register_flags(char *cmd, - kdb_func_t func, - char *usage, - char *help, - short minlen, - kdb_cmdflags_t flags) +int kdb_register(kdbtab_t *cmd) { kdbtab_t *kp; list_for_each_entry(kp, &kdb_cmds_head, list_node) { - if (strcmp(kp->cmd_name, cmd) == 0) { - kdb_printf("Duplicate kdb command registered: " - "%s, func %px help %s\n", cmd, func, help); + if (strcmp(kp->cmd_name, cmd->cmd_name) == 0) { + kdb_printf("Duplicate kdb cmd: %s, func %p help %s\n", + cmd->cmd_name, cmd->cmd_func, cmd->cmd_help); return 1; } } - kp = kmalloc(sizeof(*kp), GFP_KDB); - if (!kp) { - kdb_printf("Could not allocate new kdb_command table\n"); - return 1; - } - - kp->cmd_name = cmd; - kp->cmd_func = func; - kp->cmd_usage = usage; - kp->cmd_help = help; - kp->cmd_minlen = minlen; - kp->cmd_flags = flags; - kp->is_dynamic = true; - - list_add_tail(&kp->list_node, &kdb_cmds_head); - + list_add_tail(&cmd->list_node, &kdb_cmds_head); return 0; } -EXPORT_SYMBOL_GPL(kdb_register_flags); +EXPORT_SYMBOL_GPL(kdb_register); -/* +/** * kdb_register_table() - This function is used to register a kdb command * table. * @kp: pointer to kdb command table @@ -2676,55 +2650,15 @@ void kdb_register_table(kdbtab_t *kp, size_t len) } } -/* - * kdb_register - Compatibility register function for commands that do - * not need to specify a repeat state. Equivalent to - * kdb_register_flags with flags set to 0. - * Inputs: - * cmd Command name - * func Function to execute the command - * usage A simple usage string showing arguments - * help A simple help string describing command - * Returns: - * zero for success, one if a duplicate command. - */ -int kdb_register(char *cmd, - kdb_func_t func, - char *usage, - char *help, - short minlen) -{ - return kdb_register_flags(cmd, func, usage, help, minlen, 0); -} -EXPORT_SYMBOL_GPL(kdb_register); - -/* - * kdb_unregister - This function is used to unregister a kernel - * debugger command. It is generally called when a module which - * implements kdb commands is unloaded. - * Inputs: - * cmd Command name - * Returns: - * zero for success, one command not registered. +/** + * kdb_unregister() - This function is used to unregister a kernel debugger + * command. It is generally called when a module which + * implements kdb command is unloaded. + * @cmd: pointer to kdb command */ -int kdb_unregister(char *cmd) +void kdb_unregister(kdbtab_t *cmd) { - kdbtab_t *kp; - - /* - * find the command. - */ - list_for_each_entry(kp, &kdb_cmds_head, list_node) { - if (strcmp(kp->cmd_name, cmd) == 0) { - list_del(&kp->list_node); - if (kp->is_dynamic) - kfree(kp); - return 0; - } - } - - /* Couldn't find it. */ - return 1; + list_del(&cmd->list_node); } EXPORT_SYMBOL_GPL(kdb_unregister); @@ -2900,6 +2834,11 @@ static kdbtab_t maintab[] = { .cmd_func = kdb_defcmd, .cmd_usage = "name \"usage\" \"help\"", .cmd_help = "Define a set of commands, down to endefcmd", + /* + * Macros are always safe because when executed each + * internal command re-enters kdb_parse() and is safety + * checked individually. + */ .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, }, { .cmd_name = "kill", diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 8dbc840113c9..629590084a0d 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -164,19 +164,6 @@ typedef struct _kdb_bp { #ifdef CONFIG_KGDB_KDB extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */]; -/* The KDB shell command table */ -typedef struct _kdbtab { - char *cmd_name; /* Command name */ - kdb_func_t cmd_func; /* Function to execute command */ - char *cmd_usage; /* Usage String for this command */ - char *cmd_help; /* Help message for this command */ - short cmd_minlen; /* Minimum legal # command - * chars required */ - kdb_cmdflags_t cmd_flags; /* Command behaviour flags */ - struct list_head list_node; /* Command list */ - bool is_dynamic; /* Command table allocation type */ -} kdbtab_t; - extern void kdb_register_table(kdbtab_t *kp, size_t len); extern int kdb_bt(int, const char **); /* KDB display back trace */ diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 9da76104f7a2..6c4f92c79e43 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -147,11 +147,17 @@ static int kdb_ftdump(int argc, const char **argv) return 0; } +static kdbtab_t ftdump_cmd = { + .cmd_name = "ftdump", + .cmd_func = kdb_ftdump, + .cmd_usage = "[skip_#entries] [cpu]", + .cmd_help = "Dump ftrace log; -skip dumps last #entries", + .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, +}; + static __init int kdb_ftrace_register(void) { - kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]", - "Dump ftrace log; -skip dumps last #entries", 0, - KDB_ENABLE_ALWAYS_SAFE); + kdb_register(&ftdump_cmd); return 0; } -- cgit v1.2.3 From 9a5db530aa7d98b10c4f5104027565c98cca49e6 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:19 +0530 Subject: kdb: Simplify kdb_defcmd macro logic Switch to use a linked list instead of dynamic array which makes allocation of kdb macro and traversing the kdb macro commands list simpler. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-4-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 107 ++++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index b2880fad26d4..7c7a2ef834fc 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -654,13 +654,16 @@ static void kdb_cmderror(int diag) * zero for success, a kdb diagnostic if error */ struct kdb_macro { - int count; - bool usable; - kdbtab_t cmd; - char **command; + kdbtab_t cmd; /* Macro command */ + struct list_head statements; /* Associated statement list */ }; + +struct kdb_macro_statement { + char *statement; /* Statement text */ + struct list_head list_node; /* Statement list node */ +}; + static struct kdb_macro *kdb_macro; -static int kdb_macro_count; static bool defcmd_in_progress; /* Forward references */ @@ -668,34 +671,33 @@ static int kdb_exec_defcmd(int argc, const char **argv); static int kdb_defcmd2(const char *cmdstr, const char *argv0) { - struct kdb_macro *s = kdb_macro + kdb_macro_count - 1; - char **save_command = s->command; + struct kdb_macro_statement *kms; + + if (!kdb_macro) + return KDB_NOTIMP; + if (strcmp(argv0, "endefcmd") == 0) { defcmd_in_progress = false; - if (!s->count) - s->usable = false; - if (s->usable) - kdb_register(&s->cmd); + if (!list_empty(&kdb_macro->statements)) + kdb_register(&kdb_macro->cmd); return 0; } - if (!s->usable) - return KDB_NOTIMP; - s->command = kcalloc(s->count + 1, sizeof(*(s->command)), GFP_KDB); - if (!s->command) { - kdb_printf("Could not allocate new kdb_defcmd table for %s\n", + + kms = kmalloc(sizeof(*kms), GFP_KDB); + if (!kms) { + kdb_printf("Could not allocate new kdb macro command: %s\n", cmdstr); - s->usable = false; return KDB_NOTIMP; } - memcpy(s->command, save_command, s->count * sizeof(*(s->command))); - s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB); - kfree(save_command); + + kms->statement = kdb_strdup(cmdstr, GFP_KDB); + list_add_tail(&kms->list_node, &kdb_macro->statements); + return 0; } static int kdb_defcmd(int argc, const char **argv) { - struct kdb_macro *save_kdb_macro = kdb_macro, *s; kdbtab_t *mp; if (defcmd_in_progress) { @@ -704,13 +706,21 @@ static int kdb_defcmd(int argc, const char **argv) kdb_defcmd2("endefcmd", "endefcmd"); } if (argc == 0) { - int i; - for (s = kdb_macro; s < kdb_macro + kdb_macro_count; ++s) { - kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->cmd.cmd_name, - s->cmd.cmd_usage, s->cmd.cmd_help); - for (i = 0; i < s->count; ++i) - kdb_printf("%s", s->command[i]); - kdb_printf("endefcmd\n"); + kdbtab_t *kp; + struct kdb_macro *kmp; + struct kdb_macro_statement *kms; + + list_for_each_entry(kp, &kdb_cmds_head, list_node) { + if (kp->cmd_func == kdb_exec_defcmd) { + kdb_printf("defcmd %s \"%s\" \"%s\"\n", + kp->cmd_name, kp->cmd_usage, + kp->cmd_help); + kmp = container_of(kp, struct kdb_macro, cmd); + list_for_each_entry(kms, &kmp->statements, + list_node) + kdb_printf("%s", kms->statement); + kdb_printf("endefcmd\n"); + } } return 0; } @@ -720,17 +730,11 @@ static int kdb_defcmd(int argc, const char **argv) kdb_printf("Command only available during kdb_init()\n"); return KDB_NOTIMP; } - kdb_macro = kmalloc_array(kdb_macro_count + 1, sizeof(*kdb_macro), - GFP_KDB); + kdb_macro = kzalloc(sizeof(*kdb_macro), GFP_KDB); if (!kdb_macro) goto fail_defcmd; - memcpy(kdb_macro, save_kdb_macro, - kdb_macro_count * sizeof(*kdb_macro)); - s = kdb_macro + kdb_macro_count; - memset(s, 0, sizeof(*s)); - s->usable = true; - mp = &s->cmd; + mp = &kdb_macro->cmd; mp->cmd_func = kdb_exec_defcmd; mp->cmd_minlen = 0; mp->cmd_flags = KDB_ENABLE_ALWAYS_SAFE; @@ -751,9 +755,9 @@ static int kdb_defcmd(int argc, const char **argv) strcpy(mp->cmd_help, argv[3]+1); mp->cmd_help[strlen(mp->cmd_help)-1] = '\0'; } - ++kdb_macro_count; + + INIT_LIST_HEAD(&kdb_macro->statements); defcmd_in_progress = true; - kfree(save_kdb_macro); return 0; fail_help: kfree(mp->cmd_usage); @@ -763,7 +767,6 @@ fail_name: kfree(kdb_macro); fail_defcmd: kdb_printf("Could not allocate new kdb_macro entry for %s\n", argv[1]); - kdb_macro = save_kdb_macro; return KDB_NOTIMP; } @@ -778,25 +781,31 @@ fail_defcmd: */ static int kdb_exec_defcmd(int argc, const char **argv) { - int i, ret; - struct kdb_macro *s; + int ret; + kdbtab_t *kp; + struct kdb_macro *kmp; + struct kdb_macro_statement *kms; + if (argc != 0) return KDB_ARGCOUNT; - for (s = kdb_macro, i = 0; i < kdb_macro_count; ++i, ++s) { - if (strcmp(s->cmd.cmd_name, argv[0]) == 0) + + list_for_each_entry(kp, &kdb_cmds_head, list_node) { + if (strcmp(kp->cmd_name, argv[0]) == 0) break; } - if (i == kdb_macro_count) { + if (list_entry_is_head(kp, &kdb_cmds_head, list_node)) { kdb_printf("kdb_exec_defcmd: could not find commands for %s\n", argv[0]); return KDB_NOTIMP; } - for (i = 0; i < s->count; ++i) { - /* Recursive use of kdb_parse, do not use argv after - * this point */ + kmp = container_of(kp, struct kdb_macro, cmd); + list_for_each_entry(kms, &kmp->statements, list_node) { + /* + * Recursive use of kdb_parse, do not use argv after this point. + */ argv = NULL; - kdb_printf("[%s]kdb> %s\n", s->cmd.cmd_name, s->command[i]); - ret = kdb_parse(s->command[i]); + kdb_printf("[%s]kdb> %s\n", kmp->cmd.cmd_name, kms->statement); + ret = kdb_parse(kms->statement); if (ret) return ret; } -- cgit v1.2.3 From e868f0a3c4b9c1d7721f08b703142a876814a3f8 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 12 Jul 2021 19:16:20 +0530 Subject: kdb: Rename members of struct kdbtab_t Remove redundant prefix "cmd_" from name of members in struct kdbtab_t for better readibility. Suggested-by: Doug Anderson Signed-off-by: Sumit Garg Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20210712134620.276667-5-sumit.garg@linaro.org Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bp.c | 72 ++++---- kernel/debug/kdb/kdb_main.c | 404 ++++++++++++++++++++++---------------------- kernel/trace/trace_kdb.c | 10 +- 3 files changed, 242 insertions(+), 244 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 2168f8dacb99..372025cf1ca3 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -523,51 +523,51 @@ static int kdb_ss(int argc, const char **argv) } static kdbtab_t bptab[] = { - { .cmd_name = "bp", - .cmd_func = kdb_bp, - .cmd_usage = "[]", - .cmd_help = "Set/Display breakpoints", - .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, + { .name = "bp", + .func = kdb_bp, + .usage = "[]", + .help = "Set/Display breakpoints", + .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "bl", - .cmd_func = kdb_bp, - .cmd_usage = "[]", - .cmd_help = "Display breakpoints", - .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, + { .name = "bl", + .func = kdb_bp, + .usage = "[]", + .help = "Display breakpoints", + .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "bc", - .cmd_func = kdb_bc, - .cmd_usage = "", - .cmd_help = "Clear Breakpoint", - .cmd_flags = KDB_ENABLE_FLOW_CTRL, + { .name = "bc", + .func = kdb_bc, + .usage = "", + .help = "Clear Breakpoint", + .flags = KDB_ENABLE_FLOW_CTRL, }, - { .cmd_name = "be", - .cmd_func = kdb_bc, - .cmd_usage = "", - .cmd_help = "Enable Breakpoint", - .cmd_flags = KDB_ENABLE_FLOW_CTRL, + { .name = "be", + .func = kdb_bc, + .usage = "", + .help = "Enable Breakpoint", + .flags = KDB_ENABLE_FLOW_CTRL, }, - { .cmd_name = "bd", - .cmd_func = kdb_bc, - .cmd_usage = "", - .cmd_help = "Disable Breakpoint", - .cmd_flags = KDB_ENABLE_FLOW_CTRL, + { .name = "bd", + .func = kdb_bc, + .usage = "", + .help = "Disable Breakpoint", + .flags = KDB_ENABLE_FLOW_CTRL, }, - { .cmd_name = "ss", - .cmd_func = kdb_ss, - .cmd_usage = "", - .cmd_help = "Single Step", - .cmd_minlen = 1, - .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, + { .name = "ss", + .func = kdb_ss, + .usage = "", + .help = "Single Step", + .minlen = 1, + .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, }, }; static kdbtab_t bphcmd = { - .cmd_name = "bph", - .cmd_func = kdb_bp, - .cmd_usage = "[]", - .cmd_help = "[datar [length]|dataw [length]] Set hw brk", - .cmd_flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, + .name = "bph", + .func = kdb_bp, + .usage = "[]", + .help = "[datar [length]|dataw [length]] Set hw brk", + .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS, }; /* Initialize the breakpoint table and register breakpoint commands. */ diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 7c7a2ef834fc..fa6deda894a1 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -711,10 +711,9 @@ static int kdb_defcmd(int argc, const char **argv) struct kdb_macro_statement *kms; list_for_each_entry(kp, &kdb_cmds_head, list_node) { - if (kp->cmd_func == kdb_exec_defcmd) { + if (kp->func == kdb_exec_defcmd) { kdb_printf("defcmd %s \"%s\" \"%s\"\n", - kp->cmd_name, kp->cmd_usage, - kp->cmd_help); + kp->name, kp->usage, kp->help); kmp = container_of(kp, struct kdb_macro, cmd); list_for_each_entry(kms, &kmp->statements, list_node) @@ -735,34 +734,34 @@ static int kdb_defcmd(int argc, const char **argv) goto fail_defcmd; mp = &kdb_macro->cmd; - mp->cmd_func = kdb_exec_defcmd; - mp->cmd_minlen = 0; - mp->cmd_flags = KDB_ENABLE_ALWAYS_SAFE; - mp->cmd_name = kdb_strdup(argv[1], GFP_KDB); - if (!mp->cmd_name) + mp->func = kdb_exec_defcmd; + mp->minlen = 0; + mp->flags = KDB_ENABLE_ALWAYS_SAFE; + mp->name = kdb_strdup(argv[1], GFP_KDB); + if (!mp->name) goto fail_name; - mp->cmd_usage = kdb_strdup(argv[2], GFP_KDB); - if (!mp->cmd_usage) + mp->usage = kdb_strdup(argv[2], GFP_KDB); + if (!mp->usage) goto fail_usage; - mp->cmd_help = kdb_strdup(argv[3], GFP_KDB); - if (!mp->cmd_help) + mp->help = kdb_strdup(argv[3], GFP_KDB); + if (!mp->help) goto fail_help; - if (mp->cmd_usage[0] == '"') { - strcpy(mp->cmd_usage, argv[2]+1); - mp->cmd_usage[strlen(mp->cmd_usage)-1] = '\0'; + if (mp->usage[0] == '"') { + strcpy(mp->usage, argv[2]+1); + mp->usage[strlen(mp->usage)-1] = '\0'; } - if (mp->cmd_help[0] == '"') { - strcpy(mp->cmd_help, argv[3]+1); - mp->cmd_help[strlen(mp->cmd_help)-1] = '\0'; + if (mp->help[0] == '"') { + strcpy(mp->help, argv[3]+1); + mp->help[strlen(mp->help)-1] = '\0'; } INIT_LIST_HEAD(&kdb_macro->statements); defcmd_in_progress = true; return 0; fail_help: - kfree(mp->cmd_usage); + kfree(mp->usage); fail_usage: - kfree(mp->cmd_name); + kfree(mp->name); fail_name: kfree(kdb_macro); fail_defcmd: @@ -790,7 +789,7 @@ static int kdb_exec_defcmd(int argc, const char **argv) return KDB_ARGCOUNT; list_for_each_entry(kp, &kdb_cmds_head, list_node) { - if (strcmp(kp->cmd_name, argv[0]) == 0) + if (strcmp(kp->name, argv[0]) == 0) break; } if (list_entry_is_head(kp, &kdb_cmds_head, list_node)) { @@ -804,7 +803,7 @@ static int kdb_exec_defcmd(int argc, const char **argv) * Recursive use of kdb_parse, do not use argv after this point. */ argv = NULL; - kdb_printf("[%s]kdb> %s\n", kmp->cmd.cmd_name, kms->statement); + kdb_printf("[%s]kdb> %s\n", kmp->cmd.name, kms->statement); ret = kdb_parse(kms->statement); if (ret) return ret; @@ -1016,11 +1015,11 @@ int kdb_parse(const char *cmdstr) * If this command is allowed to be abbreviated, * check to see if this is it. */ - if (tp->cmd_minlen && (strlen(argv[0]) <= tp->cmd_minlen) && - (strncmp(argv[0], tp->cmd_name, tp->cmd_minlen) == 0)) + if (tp->minlen && (strlen(argv[0]) <= tp->minlen) && + (strncmp(argv[0], tp->name, tp->minlen) == 0)) break; - if (strcmp(argv[0], tp->cmd_name) == 0) + if (strcmp(argv[0], tp->name) == 0) break; } @@ -1031,8 +1030,7 @@ int kdb_parse(const char *cmdstr) */ if (list_entry_is_head(tp, &kdb_cmds_head, list_node)) { list_for_each_entry(tp, &kdb_cmds_head, list_node) { - if (strncmp(argv[0], tp->cmd_name, - strlen(tp->cmd_name)) == 0) + if (strncmp(argv[0], tp->name, strlen(tp->name)) == 0) break; } } @@ -1040,19 +1038,19 @@ int kdb_parse(const char *cmdstr) if (!list_entry_is_head(tp, &kdb_cmds_head, list_node)) { int result; - if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1)) + if (!kdb_check_flags(tp->flags, kdb_cmd_enabled, argc <= 1)) return KDB_NOPERM; KDB_STATE_SET(CMD); - result = (*tp->cmd_func)(argc-1, (const char **)argv); + result = (*tp->func)(argc-1, (const char **)argv); if (result && ignore_errors && result > KDB_CMD_GO) result = 0; KDB_STATE_CLEAR(CMD); - if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS) + if (tp->flags & KDB_REPEAT_WITH_ARGS) return result; - argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0; + argc = tp->flags & KDB_REPEAT_NO_ARGS ? 1 : 0; if (argv[argc]) *(argv[argc]) = '\0'; return result; @@ -2419,12 +2417,12 @@ static int kdb_help(int argc, const char **argv) char *space = ""; if (KDB_FLAG(CMD_INTERRUPT)) return 0; - if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true)) + if (!kdb_check_flags(kt->flags, kdb_cmd_enabled, true)) continue; - if (strlen(kt->cmd_usage) > 20) + if (strlen(kt->usage) > 20) space = "\n "; - kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, - kt->cmd_usage, space, kt->cmd_help); + kdb_printf("%-15.15s %-20s%s%s\n", kt->name, + kt->usage, space, kt->help); } return 0; } @@ -2633,9 +2631,9 @@ int kdb_register(kdbtab_t *cmd) kdbtab_t *kp; list_for_each_entry(kp, &kdb_cmds_head, list_node) { - if (strcmp(kp->cmd_name, cmd->cmd_name) == 0) { + if (strcmp(kp->name, cmd->name) == 0) { kdb_printf("Duplicate kdb cmd: %s, func %p help %s\n", - cmd->cmd_name, cmd->cmd_func, cmd->cmd_help); + cmd->name, cmd->func, cmd->help); return 1; } } @@ -2672,218 +2670,218 @@ void kdb_unregister(kdbtab_t *cmd) EXPORT_SYMBOL_GPL(kdb_unregister); static kdbtab_t maintab[] = { - { .cmd_name = "md", - .cmd_func = kdb_md, - .cmd_usage = "", - .cmd_help = "Display Memory Contents, also mdWcN, e.g. md8c1", - .cmd_minlen = 1, - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + { .name = "md", + .func = kdb_md, + .usage = "", + .help = "Display Memory Contents, also mdWcN, e.g. md8c1", + .minlen = 1, + .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "mdr", - .cmd_func = kdb_md, - .cmd_usage = " ", - .cmd_help = "Display Raw Memory", - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + { .name = "mdr", + .func = kdb_md, + .usage = " ", + .help = "Display Raw Memory", + .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "mdp", - .cmd_func = kdb_md, - .cmd_usage = " ", - .cmd_help = "Display Physical Memory", - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + { .name = "mdp", + .func = kdb_md, + .usage = " ", + .help = "Display Physical Memory", + .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "mds", - .cmd_func = kdb_md, - .cmd_usage = "", - .cmd_help = "Display Memory Symbolically", - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, + { .name = "mds", + .func = kdb_md, + .usage = "", + .help = "Display Memory Symbolically", + .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "mm", - .cmd_func = kdb_mm, - .cmd_usage = " ", - .cmd_help = "Modify Memory Contents", - .cmd_flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS, + { .name = "mm", + .func = kdb_mm, + .usage = " ", + .help = "Modify Memory Contents", + .flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS, }, - { .cmd_name = "go", - .cmd_func = kdb_go, - .cmd_usage = "[]", - .cmd_help = "Continue Execution", - .cmd_minlen = 1, - .cmd_flags = KDB_ENABLE_REG_WRITE | + { .name = "go", + .func = kdb_go, + .usage = "[]", + .help = "Continue Execution", + .minlen = 1, + .flags = KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS, }, - { .cmd_name = "rd", - .cmd_func = kdb_rd, - .cmd_usage = "", - .cmd_help = "Display Registers", - .cmd_flags = KDB_ENABLE_REG_READ, + { .name = "rd", + .func = kdb_rd, + .usage = "", + .help = "Display Registers", + .flags = KDB_ENABLE_REG_READ, }, - { .cmd_name = "rm", - .cmd_func = kdb_rm, - .cmd_usage = " ", - .cmd_help = "Modify Registers", - .cmd_flags = KDB_ENABLE_REG_WRITE, + { .name = "rm", + .func = kdb_rm, + .usage = " ", + .help = "Modify Registers", + .flags = KDB_ENABLE_REG_WRITE, }, - { .cmd_name = "ef", - .cmd_func = kdb_ef, - .cmd_usage = "", - .cmd_help = "Display exception frame", - .cmd_flags = KDB_ENABLE_MEM_READ, + { .name = "ef", + .func = kdb_ef, + .usage = "", + .help = "Display exception frame", + .flags = KDB_ENABLE_MEM_READ, }, - { .cmd_name = "bt", - .cmd_func = kdb_bt, - .cmd_usage = "[]", - .cmd_help = "Stack traceback", - .cmd_minlen = 1, - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS, + { .name = "bt", + .func = kdb_bt, + .usage = "[]", + .help = "Stack traceback", + .minlen = 1, + .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS, }, - { .cmd_name = "btp", - .cmd_func = kdb_bt, - .cmd_usage = "", - .cmd_help = "Display stack for process ", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "btp", + .func = kdb_bt, + .usage = "", + .help = "Display stack for process ", + .flags = KDB_ENABLE_INSPECT, }, - { .cmd_name = "bta", - .cmd_func = kdb_bt, - .cmd_usage = "[D|R|S|T|C|Z|E|U|I|M|A]", - .cmd_help = "Backtrace all processes matching state flag", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "bta", + .func = kdb_bt, + .usage = "[D|R|S|T|C|Z|E|U|I|M|A]", + .help = "Backtrace all processes matching state flag", + .flags = KDB_ENABLE_INSPECT, }, - { .cmd_name = "btc", - .cmd_func = kdb_bt, - .cmd_usage = "", - .cmd_help = "Backtrace current process on each cpu", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "btc", + .func = kdb_bt, + .usage = "", + .help = "Backtrace current process on each cpu", + .flags = KDB_ENABLE_INSPECT, }, - { .cmd_name = "btt", - .cmd_func = kdb_bt, - .cmd_usage = "", - .cmd_help = "Backtrace process given its struct task address", - .cmd_flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS, + { .name = "btt", + .func = kdb_bt, + .usage = "", + .help = "Backtrace process given its struct task address", + .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS, }, - { .cmd_name = "env", - .cmd_func = kdb_env, - .cmd_usage = "", - .cmd_help = "Show environment variables", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "env", + .func = kdb_env, + .usage = "", + .help = "Show environment variables", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "set", - .cmd_func = kdb_set, - .cmd_usage = "", - .cmd_help = "Set environment variables", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "set", + .func = kdb_set, + .usage = "", + .help = "Set environment variables", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "help", - .cmd_func = kdb_help, - .cmd_usage = "", - .cmd_help = "Display Help Message", - .cmd_minlen = 1, - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "help", + .func = kdb_help, + .usage = "", + .help = "Display Help Message", + .minlen = 1, + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "?", - .cmd_func = kdb_help, - .cmd_usage = "", - .cmd_help = "Display Help Message", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "?", + .func = kdb_help, + .usage = "", + .help = "Display Help Message", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "cpu", - .cmd_func = kdb_cpu, - .cmd_usage = "", - .cmd_help = "Switch to new cpu", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS, + { .name = "cpu", + .func = kdb_cpu, + .usage = "", + .help = "Switch to new cpu", + .flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS, }, - { .cmd_name = "kgdb", - .cmd_func = kdb_kgdb, - .cmd_usage = "", - .cmd_help = "Enter kgdb mode", - .cmd_flags = 0, + { .name = "kgdb", + .func = kdb_kgdb, + .usage = "", + .help = "Enter kgdb mode", + .flags = 0, }, - { .cmd_name = "ps", - .cmd_func = kdb_ps, - .cmd_usage = "[|A]", - .cmd_help = "Display active task list", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "ps", + .func = kdb_ps, + .usage = "[|A]", + .help = "Display active task list", + .flags = KDB_ENABLE_INSPECT, }, - { .cmd_name = "pid", - .cmd_func = kdb_pid, - .cmd_usage = "", - .cmd_help = "Switch to another task", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "pid", + .func = kdb_pid, + .usage = "", + .help = "Switch to another task", + .flags = KDB_ENABLE_INSPECT, }, - { .cmd_name = "reboot", - .cmd_func = kdb_reboot, - .cmd_usage = "", - .cmd_help = "Reboot the machine immediately", - .cmd_flags = KDB_ENABLE_REBOOT, + { .name = "reboot", + .func = kdb_reboot, + .usage = "", + .help = "Reboot the machine immediately", + .flags = KDB_ENABLE_REBOOT, }, #if defined(CONFIG_MODULES) - { .cmd_name = "lsmod", - .cmd_func = kdb_lsmod, - .cmd_usage = "", - .cmd_help = "List loaded kernel modules", - .cmd_flags = KDB_ENABLE_INSPECT, + { .name = "lsmod", + .func = kdb_lsmod, + .usage = "", + .help = "List loaded kernel modules", + .flags = KDB_ENABLE_INSPECT, }, #endif #if defined(CONFIG_MAGIC_SYSRQ) - { .cmd_name = "sr", - .cmd_func = kdb_sr, - .cmd_usage = "", - .cmd_help = "Magic SysRq key", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "sr", + .func = kdb_sr, + .usage = "", + .help = "Magic SysRq key", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, #endif #if defined(CONFIG_PRINTK) - { .cmd_name = "dmesg", - .cmd_func = kdb_dmesg, - .cmd_usage = "[lines]", - .cmd_help = "Display syslog buffer", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "dmesg", + .func = kdb_dmesg, + .usage = "[lines]", + .help = "Display syslog buffer", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, #endif - { .cmd_name = "defcmd", - .cmd_func = kdb_defcmd, - .cmd_usage = "name \"usage\" \"help\"", - .cmd_help = "Define a set of commands, down to endefcmd", + { .name = "defcmd", + .func = kdb_defcmd, + .usage = "name \"usage\" \"help\"", + .help = "Define a set of commands, down to endefcmd", /* * Macros are always safe because when executed each * internal command re-enters kdb_parse() and is safety * checked individually. */ - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "kill", - .cmd_func = kdb_kill, - .cmd_usage = "<-signal> ", - .cmd_help = "Send a signal to a process", - .cmd_flags = KDB_ENABLE_SIGNAL, + { .name = "kill", + .func = kdb_kill, + .usage = "<-signal> ", + .help = "Send a signal to a process", + .flags = KDB_ENABLE_SIGNAL, }, - { .cmd_name = "summary", - .cmd_func = kdb_summary, - .cmd_usage = "", - .cmd_help = "Summarize the system", - .cmd_minlen = 4, - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "summary", + .func = kdb_summary, + .usage = "", + .help = "Summarize the system", + .minlen = 4, + .flags = KDB_ENABLE_ALWAYS_SAFE, }, - { .cmd_name = "per_cpu", - .cmd_func = kdb_per_cpu, - .cmd_usage = " [] []", - .cmd_help = "Display per_cpu variables", - .cmd_minlen = 3, - .cmd_flags = KDB_ENABLE_MEM_READ, + { .name = "per_cpu", + .func = kdb_per_cpu, + .usage = " [] []", + .help = "Display per_cpu variables", + .minlen = 3, + .flags = KDB_ENABLE_MEM_READ, }, - { .cmd_name = "grephelp", - .cmd_func = kdb_grep_help, - .cmd_usage = "", - .cmd_help = "Display help on | grep", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + { .name = "grephelp", + .func = kdb_grep_help, + .usage = "", + .help = "Display help on | grep", + .flags = KDB_ENABLE_ALWAYS_SAFE, }, }; static kdbtab_t nmicmd = { - .cmd_name = "disable_nmi", - .cmd_func = kdb_disable_nmi, - .cmd_usage = "", - .cmd_help = "Disable NMI entry to KDB", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + .name = "disable_nmi", + .func = kdb_disable_nmi, + .usage = "", + .help = "Disable NMI entry to KDB", + .flags = KDB_ENABLE_ALWAYS_SAFE, }; /* Initialize the kdb command table. */ diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 6c4f92c79e43..59857a1ee44c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -148,11 +148,11 @@ static int kdb_ftdump(int argc, const char **argv) } static kdbtab_t ftdump_cmd = { - .cmd_name = "ftdump", - .cmd_func = kdb_ftdump, - .cmd_usage = "[skip_#entries] [cpu]", - .cmd_help = "Dump ftrace log; -skip dumps last #entries", - .cmd_flags = KDB_ENABLE_ALWAYS_SAFE, + .name = "ftdump", + .func = kdb_ftdump, + .usage = "[skip_#entries] [cpu]", + .help = "Dump ftrace log; -skip dumps last #entries", + .flags = KDB_ENABLE_ALWAYS_SAFE, }; static __init int kdb_ftrace_register(void) -- cgit v1.2.3 From 33b57e0cc78eb82f2921eb4c6d1c8fcaa733823b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 27 Jul 2021 15:23:35 -0700 Subject: bpf: Increase supported cgroup storage value size Current max cgroup storage value size is 4k (PAGE_SIZE). The other local storages accept up to 64k (BPF_LOCAL_STORAGE_MAX_VALUE_SIZE). Let's align max cgroup value size with the other storages. For percpu, the max is 32k (PCPU_MIN_UNIT_SIZE) because percpu allocator is not happy about larger values. netcnt test is extended to exercise those maximum values (non-percpu max size is close to, but not real max). v4: * remove inner union (Andrii Nakryiko) * keep net_cnt on the stack (Andrii Nakryiko) v3: * refine SIZEOF_BPF_LOCAL_STORAGE_ELEM comment (Yonghong Song) * anonymous struct in percpu_net_cnt & net_cnt (Yonghong Song) * reorder free (Yonghong Song) v2: * cap max_value_size instead of BUILD_BUG_ON (Martin KaFai Lau) Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210727222335.4029096-1-sdf@google.com --- kernel/bpf/local_storage.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 7ed2a14dc0de..035e9e3a7132 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -1,6 +1,7 @@ //SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -283,9 +284,17 @@ enoent: static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { + __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE; int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; + /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu + * is the same as other local storages. + */ + if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + max_value_size = min_t(__u32, max_value_size, + PCPU_MIN_UNIT_SIZE); + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && attr->key_size != sizeof(__u64)) return ERR_PTR(-EINVAL); @@ -293,7 +302,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->value_size == 0) return ERR_PTR(-EINVAL); - if (attr->value_size > PAGE_SIZE) + if (attr->value_size > max_value_size) return ERR_PTR(-E2BIG); if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || -- cgit v1.2.3 From 10102a890b543a8a08457dc69fa55bc032403c7d Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 27 Jul 2021 14:06:35 +0100 Subject: printk: Add printk.console_no_auto_verbose boot parameter console_verbose() increases console loglevel to CONSOLE_LOGLEVEL_MOTORMOUTH, which provides more information to debug a panic/oops. Unfortunately, in Arista we maintain some DUTs (Device Under Test) that are configured to have 9600 baud rate. While verbose console messages have their value to post-analyze crashes, on such setup they: - may prevent panic/oops messages being printed - take too long to flush on console resulting in watchdog reboot In all our setups we use kdump which saves dmesg buffer after panic, so in reality those extra messages on console provide no additional value, but rather add risk of not getting to __crash_kexec(). Provide printk.console_no_auto_verbose boot parameter, which allows to switch off printk being verbose on oops/panic/lockdep. Cc: Andrew Morton Cc: John Ogness Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt Signed-off-by: Dmitry Safonov Suggested-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Tested-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210727130635.675184-3-dima@arista.com --- kernel/printk/printk.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 142a58d124d9..a6b94c3c5ac5 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2404,6 +2404,18 @@ module_param_named(console_suspend, console_suspend_enabled, MODULE_PARM_DESC(console_suspend, "suspend console during suspend" " and hibernate operations"); +static bool printk_console_no_auto_verbose; + +void console_verbose(void) +{ + if (console_loglevel && !printk_console_no_auto_verbose) + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; +} +EXPORT_SYMBOL_GPL(console_verbose); + +module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644); +MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); + /** * suspend_console - suspend the console subsystem * -- cgit v1.2.3 From f728c4a9e8405caae69d4bc1232c54ff57b5d20f Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 22 Jul 2021 11:03:52 +0800 Subject: workqueue: Fix possible memory leaks in wq_numa_init() In error handling branch "if (WARN_ON(node == NUMA_NO_NODE))", the previously allocated memories are not released. Doing this before allocating memory eliminates memory leaks. tj: Note that the condition only occurs when the arch code is pretty broken and the WARN_ON might as well be BUG_ON(). Signed-off-by: Zhen Lei Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f148eacda55a..542c2d03dab6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5902,6 +5902,13 @@ static void __init wq_numa_init(void) return; } + for_each_possible_cpu(cpu) { + if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { + pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); + return; + } + } + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_unbound_numa_attrs_buf); @@ -5919,11 +5926,6 @@ static void __init wq_numa_init(void) for_each_possible_cpu(cpu) { node = cpu_to_node(cpu); - if (WARN_ON(node == NUMA_NO_NODE)) { - pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); - /* happens iff arch is bonkers, let's just proceed */ - return; - } cpumask_set_cpu(cpu, tbl[node]); } -- cgit v1.2.3 From d36216429ff3e69db4f6ea5e0c86b80010f5f30b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 28 Jul 2021 11:30:25 -0700 Subject: bpf: Emit better log message if bpf_iter ctx arg btf_id == 0 To avoid kernel build failure due to some missing .BTF-ids referenced functions/types, the patch ([1]) tries to fill btf_id 0 for these types. In bpf verifier, for percpu variable and helper returning btf_id cases, verifier already emitted proper warning with something like verbose(env, "Helper has invalid btf_id in R%d\n", regno); verbose(env, "invalid return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); But this is not the case for bpf_iter context arguments. I hacked resolve_btfids to encode btf_id 0 for struct task_struct. With `./test_progs -n 7/5`, I got, 0: (79) r2 = *(u64 *)(r1 +0) func 'bpf_iter_task' arg0 has btf_id 29739 type STRUCT 'bpf_iter_meta' ; struct seq_file *seq = ctx->meta->seq; 1: (79) r6 = *(u64 *)(r2 +0) ; struct task_struct *task = ctx->task; 2: (79) r7 = *(u64 *)(r1 +8) ; if (task == (void *)0) { 3: (55) if r7 != 0x0 goto pc+11 ... ; BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid); 26: (61) r1 = *(u32 *)(r7 +1372) Type '(anon)' is not a struct Basically, verifier will return btf_id 0 for task_struct. Later on, when the code tries to access task->tgid, the verifier correctly complains the type is '(anon)' and it is not a struct. Users still need to backtrace to find out what is going on. Let us catch the invalid btf_id 0 earlier and provide better message indicating btf_id is wrong. The new error message looks like below: R1 type=ctx expected=fp ; struct seq_file *seq = ctx->meta->seq; 0: (79) r2 = *(u64 *)(r1 +0) func 'bpf_iter_task' arg0 has btf_id 29739 type STRUCT 'bpf_iter_meta' ; struct seq_file *seq = ctx->meta->seq; 1: (79) r6 = *(u64 *)(r2 +0) ; struct task_struct *task = ctx->task; 2: (79) r7 = *(u64 *)(r1 +8) invalid btf_id for context argument offset 8 invalid bpf_context access off=8 size=8 [1] https://lore.kernel.org/bpf/20210727132532.2473636-1-hengqi.chen@gmail.com/ Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210728183025.1461750-1-yhs@fb.com --- kernel/bpf/btf.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7780131f710e..c395024610ed 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4825,6 +4825,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; if (ctx_arg_info->offset == off) { + if (!ctx_arg_info->btf_id) { + bpf_log(log,"invalid btf_id for context argument offset %u\n", off); + return false; + } + info->reg_type = ctx_arg_info->reg_type; info->btf = btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; -- cgit v1.2.3 From b61a28cf11d61f512172e673b8f8c4a6c789b425 Mon Sep 17 00:00:00 2001 From: Johan Almbladh Date: Wed, 28 Jul 2021 18:47:41 +0200 Subject: bpf: Fix off-by-one in tail call count limiting Before, the interpreter allowed up to MAX_TAIL_CALL_CNT + 1 tail calls. Now precisely MAX_TAIL_CALL_CNT is allowed, which is in line with the behavior of the x86 JITs. Signed-off-by: Johan Almbladh Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210728164741.350370-1-johan.almbladh@anyfinetworks.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b1a5fc04492b..fe807b203a6f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1562,7 +1562,7 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) + if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; -- cgit v1.2.3 From d2c8cce647f3022d5960a3bf2b50a2da341d9c8b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:13 +0200 Subject: PM: sleep: s2idle: Replace deprecated CPU-hotplug functions The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index d8cae434f9eb..eb75f394a059 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -96,7 +96,7 @@ static void s2idle_enter(void) s2idle_state = S2IDLE_STATE_ENTER; raw_spin_unlock_irq(&s2idle_lock); - get_online_cpus(); + cpus_read_lock(); cpuidle_resume(); /* Push all the CPUs into the idle loop. */ @@ -106,7 +106,7 @@ static void s2idle_enter(void) s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); - put_online_cpus(); + cpus_read_unlock(); raw_spin_lock_irq(&s2idle_lock); -- cgit v1.2.3 From 4fac49fd0a349aa3afb3ad7ec778a00592c7ab59 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 4 Aug 2021 12:44:07 +0200 Subject: PM: sleep: check RTC features instead of ops in suspend_test Test RTC_FEATURE_ALARM instead of relying on ops->set_alarm to know whether alarms are available. Signed-off-by: Alexandre Belloni Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index e1ed58adb69e..d20526c5be15 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -129,7 +129,7 @@ static int __init has_wakealarm(struct device *dev, const void *data) { struct rtc_device *candidate = to_rtc_device(dev); - if (!candidate->ops->set_alarm) + if (!test_bit(RTC_FEATURE_ALARM, candidate->features)) return 0; if (!device_may_wakeup(candidate->dev.parent)) return 0; -- cgit v1.2.3 From 7fcc17d0cb12938d2b3507973a6f93fc9ed2c7a1 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 3 Aug 2021 11:27:43 +0100 Subject: PM: EM: Increase energy calculation precision The Energy Model (EM) provides useful information about device power in each performance state to other subsystems like: Energy Aware Scheduler (EAS). The energy calculation in EAS does arithmetic operation based on the EM em_cpu_energy(). Current implementation of that function uses em_perf_state::cost as a pre-computed cost coefficient equal to: cost = power * max_frequency / frequency. The 'power' is expressed in milli-Watts (or in abstract scale). There are corner cases when the EAS energy calculation for two Performance Domains (PDs) return the same value. The EAS compares these values to choose smaller one. It might happen that this values are equal due to rounding error. In such scenario, we need better resolution, e.g. 1000 times better. To provide this possibility increase the resolution in the em_perf_state::cost for 64-bit architectures. The cost of increasing resolution on 32-bit is pretty high (64-bit division) and is not justified since there are no new 32bit big.LITTLE EAS systems expected which would benefit from this higher resolution. This patch allows to avoid the rounding to milli-Watt errors, which might occur in EAS energy estimation for each PD. The rounding error is common for small tasks which have small utilization value. There are two places in the code where it makes a difference: 1. In the find_energy_efficient_cpu() where we are searching for best_delta. We might suffer there when two PDs return the same result, like in the example below. Scenario: Low utilized system e.g. ~200 sum_util for PD0 and ~220 for PD1. There are quite a few small tasks ~10-15 util. These tasks would suffer for the rounding error. These utilization values are typical when running games on Android. One of our partners has reported 5..10mA less battery drain when running with increased resolution. Some details: We have two PDs: PD0 (big) and PD1 (little) Let's compare w/o patch set ('old') and w/ patch set ('new') We are comparing energy w/ task and w/o task placed in the PDs a) 'old' w/o patch set, PD0 task_util = 13 cost = 480 sum_util_w/o_task = 215 sum_util_w_task = 228 scale_cpu = 1024 energy_w/o_task = 480 * 215 / 1024 = 100.78 => 100 energy_w_task = 480 * 228 / 1024 = 106.87 => 106 energy_diff = 106 - 100 = 6 (this is equal to 'old' PD1's energy_diff in 'c)') b) 'new' w/ patch set, PD0 task_util = 13 cost = 480 * 1000 = 480000 sum_util_w/o_task = 215 sum_util_w_task = 228 energy_w/o_task = 480000 * 215 / 1024 = 100781 energy_w_task = 480000 * 228 / 1024 = 106875 energy_diff = 106875 - 100781 = 6094 (this is not equal to 'new' PD1's energy_diff in 'd)') c) 'old' w/o patch set, PD1 task_util = 13 cost = 160 sum_util_w/o_task = 283 sum_util_w_task = 293 scale_cpu = 355 energy_w/o_task = 160 * 283 / 355 = 127.55 => 127 energy_w_task = 160 * 296 / 355 = 133.41 => 133 energy_diff = 133 - 127 = 6 (this is equal to 'old' PD0's energy_diff in 'a)') d) 'new' w/ patch set, PD1 task_util = 13 cost = 160 * 1000 = 160000 sum_util_w/o_task = 283 sum_util_w_task = 293 scale_cpu = 355 energy_w/o_task = 160000 * 283 / 355 = 127549 energy_w_task = 160000 * 296 / 355 = 133408 energy_diff = 133408 - 127549 = 5859 (this is not equal to 'new' PD0's energy_diff in 'b)') 2. Difference in the 6% energy margin filter at the end of find_energy_efficient_cpu(). With this patch the margin comparison also has better resolution, so it's possible to have better task placement thanks to that. Fixes: 27871f7a8a341ef ("PM: Introduce an Energy Model management framework") Reported-by: CCJ Yeh Reviewed-by: Dietmar Eggemann Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0f4530b3a8cd..a332ccd829e2 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -170,7 +170,9 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* Compute the cost of each performance state. */ fmax = (u64) table[nr_states - 1].frequency; for (i = 0; i < nr_states; i++) { - table[i].cost = div64_u64(fmax * table[i].power, + unsigned long power_res = em_scale_power(table[i].power); + + table[i].cost = div64_u64(fmax * power_res, table[i].frequency); } -- cgit v1.2.3 From e5c6b312ce3cc97e90ea159446e6bfa06645364d Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 5 Aug 2021 15:29:17 +0800 Subject: cpufreq: schedutil: Use kobject release() method to free sugov_tunables The struct sugov_tunables is protected by the kobject, so we can't free it directly. Otherwise we would get a call trace like this: ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x30 WARNING: CPU: 3 PID: 720 at lib/debugobjects.c:505 debug_print_object+0xb8/0x100 Modules linked in: CPU: 3 PID: 720 Comm: a.sh Tainted: G W 5.14.0-rc1-next-20210715-yocto-standard+ #507 Hardware name: Marvell OcteonTX CN96XX board (DT) pstate: 40400009 (nZcv daif +PAN -UAO -TCO BTYPE=--) pc : debug_print_object+0xb8/0x100 lr : debug_print_object+0xb8/0x100 sp : ffff80001ecaf910 x29: ffff80001ecaf910 x28: ffff00011b10b8d0 x27: ffff800011043d80 x26: ffff00011a8f0000 x25: ffff800013cb3ff0 x24: 0000000000000000 x23: ffff80001142aa68 x22: ffff800011043d80 x21: ffff00010de46f20 x20: ffff800013c0c520 x19: ffff800011d8f5b0 x18: 0000000000000010 x17: 6e6968207473696c x16: 5f72656d6974203a x15: 6570797420746365 x14: 6a626f2029302065 x13: 303378302f307830 x12: 2b6e665f72656d69 x11: ffff8000124b1560 x10: ffff800012331520 x9 : ffff8000100ca6b0 x8 : 000000000017ffe8 x7 : c0000000fffeffff x6 : 0000000000000001 x5 : ffff800011d8c000 x4 : ffff800011d8c740 x3 : 0000000000000000 x2 : ffff0001108301c0 x1 : ab3c90eedf9c0f00 x0 : 0000000000000000 Call trace: debug_print_object+0xb8/0x100 __debug_check_no_obj_freed+0x1c0/0x230 debug_check_no_obj_freed+0x20/0x88 slab_free_freelist_hook+0x154/0x1c8 kfree+0x114/0x5d0 sugov_exit+0xbc/0xc0 cpufreq_exit_governor+0x44/0x90 cpufreq_set_policy+0x268/0x4a8 store_scaling_governor+0xe0/0x128 store+0xc0/0xf0 sysfs_kf_write+0x54/0x80 kernfs_fop_write_iter+0x128/0x1c0 new_sync_write+0xf0/0x190 vfs_write+0x2d4/0x478 ksys_write+0x74/0x100 __arm64_sys_write+0x24/0x30 invoke_syscall.constprop.0+0x54/0xe0 do_el0_svc+0x64/0x158 el0_svc+0x2c/0xb0 el0t_64_sync_handler+0xb0/0xb8 el0t_64_sync+0x198/0x19c irq event stamp: 5518 hardirqs last enabled at (5517): [] console_unlock+0x554/0x6c8 hardirqs last disabled at (5518): [] el1_dbg+0x28/0xa0 softirqs last enabled at (5504): [] __do_softirq+0x4d0/0x6c0 softirqs last disabled at (5483): [] irq_exit+0x1b0/0x1b8 So split the original sugov_tunables_free() into two functions, sugov_clear_global_tunables() is just used to clear the global_tunables and the new sugov_tunables_free() is used as kobj_type::release to release the sugov_tunables safely. Fixes: 9bdcb44e391d ("cpufreq: schedutil: New governor based on scheduler utilization data") Cc: 4.7+ # 4.7+ Signed-off-by: Kevin Hao Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 57124614363d..e7af18857371 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -537,9 +537,17 @@ static struct attribute *sugov_attrs[] = { }; ATTRIBUTE_GROUPS(sugov); +static void sugov_tunables_free(struct kobject *kobj) +{ + struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + + kfree(to_sugov_tunables(attr_set)); +} + static struct kobj_type sugov_tunables_ktype = { .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, + .release = &sugov_tunables_free, }; /********************** cpufreq governor interface *********************/ @@ -639,12 +647,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_free(struct sugov_tunables *tunables) +static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; - - kfree(tunables); } static int sugov_init(struct cpufreq_policy *policy) @@ -707,7 +713,7 @@ out: fail: kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); stop_kthread: sugov_kthread_stop(sg_policy); @@ -734,7 +740,7 @@ static void sugov_exit(struct cpufreq_policy *policy) count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; if (!count) - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); mutex_unlock(&global_tunables_lock); -- cgit v1.2.3 From 1d7db834a027ecfb5ac93be2e8bb45bfcbbf6eaa Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 22 Jul 2021 18:55:12 +0800 Subject: dma-debug: use memory_intersects() directly There is already a memory_intersects() helper in sections.h, use memory_intersects() directly instead of private overlap(). Signed-off-by: Kefeng Wang Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index dadae6255d05..fe6efd181614 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1064,20 +1064,10 @@ static void check_for_stack(struct device *dev, } } -static inline bool overlap(void *addr, unsigned long len, void *start, void *end) -{ - unsigned long a1 = (unsigned long)addr; - unsigned long b1 = a1 + len; - unsigned long a2 = (unsigned long)start; - unsigned long b2 = (unsigned long)end; - - return !(b1 <= a2 || a1 >= b2); -} - static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len) { - if (overlap(addr, len, _stext, _etext) || - overlap(addr, len, __start_rodata, __end_rodata)) + if (memory_intersects(_stext, _etext, addr, len) || + memory_intersects(__start_rodata, __end_rodata, addr, len)) err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len); } -- cgit v1.2.3 From 173735c346c412d9f084825ecb04f24ada0e2986 Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Thu, 22 Jul 2021 16:10:55 +0200 Subject: dma-debug: fix debugfs initialization order Due to link order, dma_debug_init is called before debugfs has a chance to initialize (via debugfs_init which also happens in the core initcall stage), so the directories for dma-debug are never created. Decouple dma_debug_fs_init from dma_debug_init and defer its init until core_initcall_sync (after debugfs has been initialized) while letting dma-debug initialization occur as soon as possible to catch any early mappings, as suggested in [1]. [1] https://lore.kernel.org/linux-iommu/YIgGa6yF%2Fadg8OSN@kroah.com/ Fixes: 15b28bbcd567 ("dma-debug: move initialization to common code") Signed-off-by: Anthony Iliopoulos Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index fe6efd181614..6c90c69e5311 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -792,7 +792,7 @@ static int dump_show(struct seq_file *seq, void *v) } DEFINE_SHOW_ATTRIBUTE(dump); -static void dma_debug_fs_init(void) +static int __init dma_debug_fs_init(void) { struct dentry *dentry = debugfs_create_dir("dma-api", NULL); @@ -805,7 +805,10 @@ static void dma_debug_fs_init(void) debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries); debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops); debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops); + + return 0; } +core_initcall_sync(dma_debug_fs_init); static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) { @@ -890,8 +893,6 @@ static int dma_debug_init(void) spin_lock_init(&dma_entry_hash[i].lock); } - dma_debug_fs_init(); - nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES); for (i = 0; i < nr_pages; ++i) dma_debug_create_entries(GFP_KERNEL); -- cgit v1.2.3 From fffe3cc8c2194f60c4af4fac7f27d25e8828f001 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 29 Jul 2021 14:15:19 -0600 Subject: dma-mapping: allow map_sg() ops to return negative error codes Allow dma_map_sgtable() to pass errors from the map_sg() ops. This will be required for returning appropriate error codes when mapping P2PDMA memory. Introduce __dma_map_sg_attrs() which will return the raw error code from the map_sg operation (whether it be negative or zero). Then add a dma_map_sg_attrs() wrapper to convert any negative errors to zero to satisfy the existing calling convention. dma_map_sgtable() defines three error codes that .map_sg implementations are allowed to return: -EINVAL, -ENOMEM and -EIO. The latter of which is a generic return for cases that are passing DMA_MAPPING_ERROR through. dma_map_sgtable() will convert a zero error return for old map_sg() ops into a -EIO return and return any negative errors as reported. This allows map_sg implementations to start returning multiple negative error codes. Legacy map_sg implementations can continue to return zero until they are all converted. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 74 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 2b06a809d0b9..21e550076be5 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -177,12 +177,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, } EXPORT_SYMBOL(dma_unmap_page_attrs); -/* - * dma_maps_sg_attrs returns 0 on error and > 0 on success. - * It should never return a value < 0. - */ -int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs) +static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); int ents; @@ -197,13 +193,83 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else ents = ops->map_sg(dev, sg, nents, dir, attrs); - BUG_ON(ents < 0); - debug_dma_map_sg(dev, sg, nents, ents, dir); + + if (ents > 0) + debug_dma_map_sg(dev, sg, nents, ents, dir); + else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && + ents != -EIO && ents != 0)) + return -EIO; return ents; } + +/** + * dma_map_sg_attrs - Map the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sg: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Maps a buffer described by a scatterlist passed in the sg argument with + * nents segments for the @dir DMA operation by the @dev device. + * + * Returns the number of mapped entries (which can be less than nents) + * on success. Zero is returned for any error. + * + * dma_unmap_sg_attrs() should be used to unmap the buffer with the + * original sg and original nents (not the value returned by this funciton). + */ +int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + int ret; + + ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs); + if (ret < 0) + return 0; + return ret; +} EXPORT_SYMBOL(dma_map_sg_attrs); +/** + * dma_map_sgtable - Map the given buffer for DMA + * @dev: The device for which to perform the DMA operation + * @sgt: The sg_table object describing the buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Maps a buffer described by a scatterlist stored in the given sg_table + * object for the @dir DMA operation by the @dev device. After success, the + * ownership for the buffer is transferred to the DMA domain. One has to + * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the + * ownership of the buffer back to the CPU domain before touching the + * buffer by the CPU. + * + * Returns 0 on success or a negative error code on error. The following + * error codes are supported with the given meaning: + * + * -EINVAL - An invalid argument, unaligned access or other error + * in usage. Will not succeed if retried. + * -ENOMEM - Insufficient resources (like memory or IOVA space) to + * complete the mapping. Should succeed if retried later. + * -EIO - Legacy error code with an unknown meaning. eg. this is + * returned if a lower level call returned DMA_MAPPING_ERROR. + */ +int dma_map_sgtable(struct device *dev, struct sg_table *sgt, + enum dma_data_direction dir, unsigned long attrs) +{ + int nents; + + nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); + if (nents == 0) + return -EIO; + if (nents < 0) + return nents; + sgt->nents = nents; + return 0; +} +EXPORT_SYMBOL_GPL(dma_map_sgtable); + void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) -- cgit v1.2.3 From c81be74e7d790f090d3c8a52e20a334dc2506a3f Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 29 Jul 2021 14:15:20 -0600 Subject: dma-direct: return appropriate error code from dma_direct_map_sg() Now that the map_sg() op expects error codes instead of return zero on error, convert dma_direct_map_sg() to return an error code. Per the documentation for dma_map_sgtable(), -EIO is returned due to an DMA_MAPPING_ERROR with unknown cause. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f737e3347059..f33ceb68aef2 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -411,7 +411,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return 0; + return -EIO; } dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, -- cgit v1.2.3 From 66ab63104f9cab09209851fef168f5972d791903 Mon Sep 17 00:00:00 2001 From: Martin Oliveira Date: Thu, 29 Jul 2021 14:15:38 -0600 Subject: dma-mapping: return error code from dma_dummy_map_sg() The .map_sg() op now expects an error code instead of zero on failure. The only errno to return is -EINVAL in the case when DMA is not supported. Signed-off-by: Martin Oliveira Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- kernel/dma/dummy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index eacd4c5b10bf..b492d59ac77e 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -22,7 +22,7 @@ static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir, unsigned long attrs) { - return 0; + return -EINVAL; } static int dma_dummy_supported(struct device *hwdev, u64 mask) -- cgit v1.2.3 From d03c54419274f96434e2ad74e59e67ec6d54ca86 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 9 Aug 2021 17:13:31 +0200 Subject: dma-mapping: disallow .map_sg operations from returning zero on error Now that all the .map_sg operations have been converted to returning proper error codes, drop the code to handle a zero return value, add a warning if a zero is returned. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 21e550076be5..967b62692102 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -197,7 +197,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, if (ents > 0) debug_dma_map_sg(dev, sg, nents, ents, dir); else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && - ents != -EIO && ents != 0)) + ents != -EIO)) return -EIO; return ents; @@ -261,8 +261,6 @@ int dma_map_sgtable(struct device *dev, struct sg_table *sgt, int nents; nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); - if (nents == 0) - return -EIO; if (nents < 0) return nents; sgt->nents = nents; -- cgit v1.2.3 From aeea1b86f9363f3feabb496534d886f082a89f21 Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Sat, 31 Jul 2021 05:57:35 +0000 Subject: bpf, devmap: Exclude XDP broadcast to master device If the ingress device is bond slave, do not broadcast back through it or the bond master. Signed-off-by: Jussi Maki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210731055738.16820-5-joamaki@gmail.com --- kernel/bpf/devmap.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 542e94fa30b4..f02d04540c0c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -534,10 +534,9 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); } -static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp, - int exclude_ifindex) +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp) { - if (!obj || obj->dev->ifindex == exclude_ifindex || + if (!obj || !obj->dev->netdev_ops->ndo_xdp_xmit) return false; @@ -562,17 +561,48 @@ static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, return 0; } +static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex) +{ + while (num_excluded--) { + if (ifindex == excluded[num_excluded]) + return true; + } + return false; +} + +/* Get ifindex of each upper device. 'indexes' must be able to hold at + * least MAX_NEST_DEV elements. + * Returns the number of ifindexes added. + */ +static int get_upper_ifindexes(struct net_device *dev, int *indexes) +{ + struct net_device *upper; + struct list_head *iter; + int n = 0; + + netdev_for_each_upper_dev_rcu(dev, upper, iter) { + indexes[n++] = upper->ifindex; + } + return n; +} + int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0; struct bpf_dtab_netdev *dst, *last_dst = NULL; + int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; struct xdp_frame *xdpf; + int num_excluded = 0; unsigned int i; int err; + if (exclude_ingress) { + num_excluded = get_upper_ifindexes(dev_rx, excluded_devices); + excluded_devices[num_excluded++] = dev_rx->ifindex; + } + xdpf = xdp_convert_buff_to_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -581,7 +611,10 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); - if (!is_valid_dst(dst, xdp, exclude_ifindex)) + if (!is_valid_dst(dst, xdp)) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ @@ -601,7 +634,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { - if (!is_valid_dst(dst, xdp, exclude_ifindex)) + if (!is_valid_dst(dst, xdp)) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, + dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ @@ -675,18 +712,27 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int exclude_ifindex = exclude_ingress ? dev->ifindex : 0; struct bpf_dtab_netdev *dst, *last_dst = NULL; + int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; struct hlist_node *next; + int num_excluded = 0; unsigned int i; int err; + if (exclude_ingress) { + num_excluded = get_upper_ifindexes(dev, excluded_devices); + excluded_devices[num_excluded++] = dev->ifindex; + } + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); - if (!dst || dst->dev->ifindex == exclude_ifindex) + if (!dst) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ @@ -700,12 +746,17 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, return err; last_dst = dst; + } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dst, next, head, index_hlist) { - if (!dst || dst->dev->ifindex == exclude_ifindex) + if (!dst) + continue; + + if (is_ifindex_excluded(excluded_devices, num_excluded, + dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ -- cgit v1.2.3 From 67dc8325370844ffce92aa59abe8b453aa6aa83c Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Sat, 31 Jul 2021 08:01:29 +0800 Subject: workqueue: Fix typo in comments Fix typo: *assing ==> assign *alloced ==> allocated *Retun ==> Return *excute ==> execute v1->v2: *reverse 'iff' *update changelog Signed-off-by: Cai Huoqing Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 542c2d03dab6..d3c35e47aa90 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -524,7 +524,7 @@ static inline void debug_work_deactivate(struct work_struct *work) { } #endif /** - * worker_pool_assign_id - allocate ID and assing it to @pool + * worker_pool_assign_id - allocate ID and assign it to @pool * @pool: the pool pointer of interest * * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned @@ -3763,7 +3763,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); } -/* initialize newly alloced @pwq which is associated with @wq and @pool */ +/* initialize newly allocated @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { @@ -5331,7 +5331,7 @@ static int workqueue_apply_unbound_cpumask(void) * the affinity of all unbound workqueues. This function check the @cpumask * and apply it to all unbound workqueues and updates all pwqs of them. * - * Retun: 0 - Success + * Return: 0 - Success * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ -- cgit v1.2.3 From e441b56fe438fd126b9eea7d30c57d3cd3f34e14 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 4 Aug 2021 11:50:36 +0800 Subject: workqueue: Replace deprecated ida_simple_*() with ida_alloc()/ida_free() Replace ida_simple_get() with ida_alloc() and ida_simple_remove() with ida_free(), the latter is more concise and intuitive. In addition, if ida_alloc() fails, NULL is returned directly. This eliminates unnecessary initialization of two local variables and an 'if' judgment. Signed-off-by: Zhen Lei Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d3c35e47aa90..29f049463d88 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1912,14 +1912,14 @@ static void worker_detach_from_pool(struct worker *worker) */ static struct worker *create_worker(struct worker_pool *pool) { - struct worker *worker = NULL; - int id = -1; + struct worker *worker; + int id; char id_buf[16]; /* ID is needed to determine kthread name */ - id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); + id = ida_alloc(&pool->worker_ida, GFP_KERNEL); if (id < 0) - goto fail; + return NULL; worker = alloc_worker(pool->node); if (!worker) @@ -1954,8 +1954,7 @@ static struct worker *create_worker(struct worker_pool *pool) return worker; fail: - if (id >= 0) - ida_simple_remove(&pool->worker_ida, id); + ida_free(&pool->worker_ida, id); kfree(worker); return NULL; } @@ -2378,7 +2377,7 @@ woke_up: set_pf_worker(false); set_task_comm(worker->task, "kworker/dying"); - ida_simple_remove(&pool->worker_ida, worker->id); + ida_free(&pool->worker_ida, worker->id); worker_detach_from_pool(worker); kfree(worker); return 0; -- cgit v1.2.3 From ffd8bea81fbb5abe6518bce8d6297a149b935cf7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:20 +0200 Subject: workqueue: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Tejun Heo Reviewed-by: Lai Jiangshan Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 29f049463d88..9bce39dba297 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3292,7 +3292,7 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); @@ -3304,7 +3304,7 @@ int schedule_on_each_cpu(work_func_t func) for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); + cpus_read_unlock(); free_percpu(works); return 0; } @@ -4015,14 +4015,14 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) static void apply_wqattrs_lock(void) { /* CPUs should stay stable across pwq creations and installations */ - get_online_cpus(); + cpus_read_lock(); mutex_lock(&wq_pool_mutex); } static void apply_wqattrs_unlock(void) { mutex_unlock(&wq_pool_mutex); - put_online_cpus(); + cpus_read_unlock(); } static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, @@ -4067,7 +4067,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * * Performs GFP_KERNEL allocations. * - * Assumes caller has CPU hotplug read exclusion, i.e. get_online_cpus(). + * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock(). * * Return: 0 on success and -errno on failure. */ @@ -4195,7 +4195,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return 0; } - get_online_cpus(); + cpus_read_lock(); if (wq->flags & __WQ_ORDERED) { ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); /* there should only be single pwq for ordering guarantee */ @@ -4205,7 +4205,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) } else { ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } - put_online_cpus(); + cpus_read_unlock(); return ret; } @@ -5167,10 +5167,10 @@ long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { long ret = -ENODEV; - get_online_cpus(); + cpus_read_lock(); if (cpu_online(cpu)) ret = work_on_cpu(cpu, fn, arg); - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(work_on_cpu_safe); @@ -5442,7 +5442,7 @@ static ssize_t wq_pool_ids_show(struct device *dev, const char *delim = ""; int node, written = 0; - get_online_cpus(); + cpus_read_lock(); rcu_read_lock(); for_each_node(node) { written += scnprintf(buf + written, PAGE_SIZE - written, @@ -5452,7 +5452,7 @@ static ssize_t wq_pool_ids_show(struct device *dev, } written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); rcu_read_unlock(); - put_online_cpus(); + cpus_read_unlock(); return written; } -- cgit v1.2.3 From 6ba34d3c73674e46d9e126e4f0cee79e5ef2481c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 20 Jul 2021 10:18:28 -0400 Subject: cgroup/cpuset: Fix violation of cpuset locking rule The cpuset fields that manage partition root state do not strictly follow the cpuset locking rule that update to cpuset has to be done with both the callback_lock and cpuset_mutex held. This is now fixed by making sure that the locking rule is upheld. Fixes: 3881b86128d0 ("cpuset: Add an error state to cpuset.sched.partition") Fixes: 4b842da276a8 ("cpuset: Make CPU hotplug work with partition") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 58 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 28a784bf64b1..13b5be6df4da 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1148,6 +1148,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpuset *parent = parent_cs(cpuset); int adding; /* Moving cpus from effective_cpus to subparts_cpus */ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ + int new_prs; bool part_error = false; /* Partition error? */ percpu_rwsem_assert_held(&cpuset_rwsem); @@ -1183,6 +1184,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * A cpumask update cannot make parent's effective_cpus become empty. */ adding = deleting = false; + new_prs = cpuset->partition_root_state; if (cmd == partcmd_enable) { cpumask_copy(tmp->addmask, cpuset->cpus_allowed); adding = true; @@ -1247,11 +1249,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, switch (cpuset->partition_root_state) { case PRS_ENABLED: if (part_error) - cpuset->partition_root_state = PRS_ERROR; + new_prs = PRS_ERROR; break; case PRS_ERROR: if (!part_error) - cpuset->partition_root_state = PRS_ENABLED; + new_prs = PRS_ENABLED; break; } /* @@ -1260,10 +1262,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, part_error = (prev_prs == PRS_ERROR); } - if (!part_error && (cpuset->partition_root_state == PRS_ERROR)) + if (!part_error && (new_prs == PRS_ERROR)) return 0; /* Nothing need to be done */ - if (cpuset->partition_root_state == PRS_ERROR) { + if (new_prs == PRS_ERROR) { /* * Remove all its cpus from parent's subparts_cpus. */ @@ -1272,7 +1274,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, parent->subparts_cpus); } - if (!adding && !deleting) + if (!adding && !deleting && (new_prs == cpuset->partition_root_state)) return 0; /* @@ -1299,6 +1301,9 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, } parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); + + if (cpuset->partition_root_state != new_prs) + cpuset->partition_root_state = new_prs; spin_unlock_irq(&callback_lock); return cmd == partcmd_update; @@ -1321,6 +1326,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) struct cpuset *cp; struct cgroup_subsys_state *pos_css; bool need_rebuild_sched_domains = false; + int new_prs; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { @@ -1360,7 +1366,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) * update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's subparts_cpus changes. */ - if ((cp != cs) && cp->partition_root_state) { + new_prs = cp->partition_root_state; + if ((cp != cs) && new_prs) { switch (parent->partition_root_state) { case PRS_DISABLED: /* @@ -1370,7 +1377,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) */ WARN_ON_ONCE(cp->partition_root_state != PRS_ERROR); - cp->partition_root_state = PRS_DISABLED; + new_prs = PRS_DISABLED; /* * clear_bit() is an atomic operation and @@ -1391,11 +1398,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) /* * When parent is invalid, it has to be too. */ - cp->partition_root_state = PRS_ERROR; - if (cp->nr_subparts_cpus) { - cp->nr_subparts_cpus = 0; - cpumask_clear(cp->subparts_cpus); - } + new_prs = PRS_ERROR; break; } } @@ -1407,8 +1410,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); - if (cp->nr_subparts_cpus && - (cp->partition_root_state != PRS_ENABLED)) { + if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) { cp->nr_subparts_cpus = 0; cpumask_clear(cp->subparts_cpus); } else if (cp->nr_subparts_cpus) { @@ -1435,6 +1437,10 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) = cpumask_weight(cp->subparts_cpus); } } + + if (new_prs != cp->partition_root_state) + cp->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); WARN_ON(!is_in_v2_mode() && @@ -1944,25 +1950,25 @@ out: */ static int update_prstate(struct cpuset *cs, int new_prs) { - int err; + int err, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; - if (new_prs == cs->partition_root_state) + if (old_prs == new_prs) return 0; /* * Cannot force a partial or invalid partition root to a full * partition root. */ - if (new_prs && (cs->partition_root_state < 0)) + if (new_prs && (old_prs == PRS_ERROR)) return -EINVAL; if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; err = -EINVAL; - if (!cs->partition_root_state) { + if (!old_prs) { /* * Turning on partition root requires setting the * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed @@ -1981,14 +1987,12 @@ static int update_prstate(struct cpuset *cs, int new_prs) update_flag(CS_CPU_EXCLUSIVE, cs, 0); goto out; } - cs->partition_root_state = PRS_ENABLED; } else { /* * Turning off partition root will clear the * CS_CPU_EXCLUSIVE bit. */ - if (cs->partition_root_state == PRS_ERROR) { - cs->partition_root_state = PRS_DISABLED; + if (old_prs == PRS_ERROR) { update_flag(CS_CPU_EXCLUSIVE, cs, 0); err = 0; goto out; @@ -1999,8 +2003,6 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (err) goto out; - cs->partition_root_state = PRS_DISABLED; - /* Turning off CS_CPU_EXCLUSIVE will not return error */ update_flag(CS_CPU_EXCLUSIVE, cs, 0); } @@ -2017,6 +2019,12 @@ static int update_prstate(struct cpuset *cs, int new_prs) rebuild_sched_domains_locked(); out: + if (!err) { + spin_lock_irq(&callback_lock); + cs->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); + } + free_cpumasks(NULL, &tmpmask); return err; } @@ -3080,8 +3088,10 @@ retry: if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || (parent->partition_root_state == PRS_ERROR))) { if (cs->nr_subparts_cpus) { + spin_lock_irq(&callback_lock); cs->nr_subparts_cpus = 0; cpumask_clear(cs->subparts_cpus); + spin_unlock_irq(&callback_lock); compute_effective_cpumask(&new_cpus, cs, parent); } @@ -3095,7 +3105,9 @@ retry: cpumask_empty(&new_cpus)) { update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp); + spin_lock_irq(&callback_lock); cs->partition_root_state = PRS_ERROR; + spin_unlock_irq(&callback_lock); } cpuset_force_rebuild(); } -- cgit v1.2.3 From c5c63b9a6a2e53757b598485b8adbafa56d6d9ee Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:07 +0200 Subject: cgroup: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Zefan Li Cc: Tejun Heo Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 13b5be6df4da..65258102e030 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -979,7 +979,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). + * Call with cpuset_mutex held. Takes cpus_read_lock(). */ static void rebuild_sched_domains_locked(void) { @@ -1040,11 +1040,11 @@ static void rebuild_sched_domains_locked(void) void rebuild_sched_domains(void) { - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); rebuild_sched_domains_locked(); percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); } /** @@ -2288,7 +2288,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -2326,7 +2326,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); return retval; } @@ -2337,7 +2337,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2352,7 +2352,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); return retval; } @@ -2391,7 +2391,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2417,7 +2417,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_cpuset(trialcs); out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2548,7 +2548,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, return -EINVAL; css_get(&cs->css); - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (!is_cpuset_online(cs)) goto out_unlock; @@ -2556,7 +2556,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, retval = update_prstate(cs, val); out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); css_put(&cs->css); return retval ?: nbytes; } @@ -2762,7 +2762,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); set_bit(CS_ONLINE, &cs->flags); @@ -2815,7 +2815,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) spin_unlock_irq(&callback_lock); out_unlock: percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -2834,7 +2834,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - get_online_cpus(); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); if (is_partition_root(cs)) @@ -2855,7 +2855,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) clear_bit(CS_ONLINE, &cs->flags); percpu_up_write(&cpuset_rwsem); - put_online_cpus(); + cpus_read_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) -- cgit v1.2.3 From 51be9e51a8000ffc6a33083ceca9da9303ed4dc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:33 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Fix mapping of passthrough interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI MSI interrupt numbers are now mapped in a PCI-MSI domain but the underlying calls handling the passthrough of the interrupt in the guest need a number in the XIVE IRQ domain. Use the IRQ data mapped in the XIVE IRQ domain and not the one in the PCI-MSI domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-16-clg@kaod.org --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 51c483ce2447..0eee4816edac 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -491,6 +491,7 @@ struct irq_domain *irq_get_default_host(void) { return irq_default_domain; } +EXPORT_SYMBOL_GPL(irq_get_default_host); static bool irq_domain_is_nomap(struct irq_domain *domain) { -- cgit v1.2.3 From b4cc61960879025665a46085cc9b3fa334b34cc8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Aug 2021 17:03:49 -0700 Subject: cgroup: cgroup-v1: clean up kernel-doc notation Fix kernel-doc warnings found in cgroup-v1.c: kernel/cgroup/cgroup-v1.c:55: warning: No description found for return value of 'cgroup_attach_task_all' kernel/cgroup/cgroup-v1.c:94: warning: expecting prototype for cgroup_trasnsfer_tasks(). Prototype was for cgroup_transfer_tasks() instead cgroup-v1.c:96: warning: No description found for return value of 'cgroup_transfer_tasks' kernel/cgroup/cgroup-v1.c:687: warning: No description found for return value of 'cgroupstats_build' Signed-off-by: Randy Dunlap Cc: Tejun Heo Cc: Zefan Li Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 8d6bf56ed77a..d25838995151 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -50,6 +50,8 @@ bool cgroup1_ssid_disabled(int ssid) * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task * @tsk: the task to be attached + * + * Return: %0 on success or a negative errno code on failure */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { @@ -80,7 +82,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /** - * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * cgroup_transfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved * @from: cgroup in which the tasks currently reside * @@ -89,6 +91,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all); * is guaranteed to be either visible in the source cgroup after the * parent's migration is complete or put into the target cgroup. No task * can slip out of migration through forking. + * + * Return: %0 on success or a negative errno code on failure */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { @@ -682,6 +686,8 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) * * Build and fill cgroupstats so that taskstats can export it to user * space. + * + * Return: %0 on success or a negative errno code on failure */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { -- cgit v1.2.3 From e7cc9888dc57927f0977d346dab64a5bdfc3da59 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 10 Aug 2021 23:06:02 -0400 Subject: cgroup/cpuset: Enable event notification when partition state changes A valid cpuset partition can become invalid if all its CPUs are offlined or somehow removed. This can happen through external events without "cpuset.cpus.partition" being touched at all. Users that rely on the property of a partition being present do not currently have a simple way to get such an event notified other than constant periodic polling which is both inefficient and cumbersome. To make life easier for those users, event notification is now enabled for "cpuset.cpus.partition" whenever its state changes. Suggested-by: Tejun Heo Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 65258102e030..fcc11f2d3b5b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -160,6 +160,9 @@ struct cpuset { */ int use_parent_ecpus; int child_ecpus_count; + + /* Handle for cpuset.cpus.partition */ + struct cgroup_file partition_file; }; /* @@ -263,6 +266,16 @@ static inline int is_partition_root(const struct cpuset *cs) return cs->partition_root_state > 0; } +/* + * Send notification event of whenever partition_root_state changes. + */ +static inline void notify_partition_change(struct cpuset *cs, + int old_prs, int new_prs) +{ + if (old_prs != new_prs) + cgroup_file_notify(&cs->partition_file); +} + static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), @@ -1148,7 +1161,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpuset *parent = parent_cs(cpuset); int adding; /* Moving cpus from effective_cpus to subparts_cpus */ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ - int new_prs; + int old_prs, new_prs; bool part_error = false; /* Partition error? */ percpu_rwsem_assert_held(&cpuset_rwsem); @@ -1184,7 +1197,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * A cpumask update cannot make parent's effective_cpus become empty. */ adding = deleting = false; - new_prs = cpuset->partition_root_state; + old_prs = new_prs = cpuset->partition_root_state; if (cmd == partcmd_enable) { cpumask_copy(tmp->addmask, cpuset->cpus_allowed); adding = true; @@ -1274,7 +1287,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, parent->subparts_cpus); } - if (!adding && !deleting && (new_prs == cpuset->partition_root_state)) + if (!adding && !deleting && (new_prs == old_prs)) return 0; /* @@ -1302,9 +1315,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); - if (cpuset->partition_root_state != new_prs) + if (old_prs != new_prs) cpuset->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); + notify_partition_change(cpuset, old_prs, new_prs); return cmd == partcmd_update; } @@ -1326,7 +1341,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) struct cpuset *cp; struct cgroup_subsys_state *pos_css; bool need_rebuild_sched_domains = false; - int new_prs; + int old_prs, new_prs; rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { @@ -1366,8 +1381,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) * update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's subparts_cpus changes. */ - new_prs = cp->partition_root_state; - if ((cp != cs) && new_prs) { + old_prs = new_prs = cp->partition_root_state; + if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { case PRS_DISABLED: /* @@ -1438,10 +1453,11 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) } } - if (new_prs != cp->partition_root_state) + if (new_prs != old_prs) cp->partition_root_state = new_prs; spin_unlock_irq(&callback_lock); + notify_partition_change(cp, old_prs, new_prs); WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); @@ -2023,6 +2039,7 @@ out: spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; spin_unlock_irq(&callback_lock); + notify_partition_change(cs, old_prs, new_prs); } free_cpumasks(NULL, &tmpmask); @@ -2708,6 +2725,7 @@ static struct cftype dfl_files[] = { .write = sched_partition_write, .private = FILE_PARTITION_ROOT, .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct cpuset, partition_file), }, { @@ -3103,11 +3121,17 @@ retry: */ if ((parent->partition_root_state == PRS_ERROR) || cpumask_empty(&new_cpus)) { + int old_prs; + update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp); - spin_lock_irq(&callback_lock); - cs->partition_root_state = PRS_ERROR; - spin_unlock_irq(&callback_lock); + old_prs = cs->partition_root_state; + if (old_prs != PRS_ERROR) { + spin_lock_irq(&callback_lock); + cs->partition_root_state = PRS_ERROR; + spin_unlock_irq(&callback_lock); + notify_partition_change(cs, old_prs, PRS_ERROR); + } } cpuset_force_rebuild(); } -- cgit v1.2.3 From 2863643fb8b92291a7e97ba46e342f1163595fa8 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Wed, 28 Jul 2021 00:26:29 -0700 Subject: set_user: add capability check when rlimit(RLIMIT_NPROC) exceeds in copy_process(): non root users but with capability CAP_SYS_RESOURCE or CAP_SYS_ADMIN will clean PF_NPROC_EXCEEDED flag even rlimit(RLIMIT_NPROC) exceeds. Add the same capability check logic here. Align the permission checks in copy_process() and set_user(). In copy_process() CAP_SYS_RESOURCE or CAP_SYS_ADMIN capable users will be able to circumvent and clear the PF_NPROC_EXCEEDED flag whereas they aren't able to the same in set_user(). There's no obvious logic to this and trying to unearth the reason in the thread didn't go anywhere. The gist seems to be that this code wants to make sure that a program can't successfully exec if it has gone through a set*id() transition while exceeding its RLIMIT_NPROC. A capable but non-INIT_USER caller getting PF_NPROC_EXCEEDED set during a set*id() transition wouldn't be able to exec right away if they still exceed their RLIMIT_NPROC at the time of exec. So their exec would fail in fs/exec.c: if ((current->flags & PF_NPROC_EXCEEDED) && is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } However, if the caller were to fork() right after the set*id() transition but before the exec while still exceeding their RLIMIT_NPROC then they would get PF_NPROC_EXCEEDED cleared (while the child would inherit it): retval = -EAGAIN; if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; } current->flags &= ~PF_NPROC_EXCEEDED; which means a subsequent exec by the capable caller would now succeed even though they could still exceed their RLIMIT_NPROC limit. This seems inconsistent. Allow a CAP_SYS_ADMIN or CAP_SYS_RESOURCE capable user to avoid PF_NPROC_EXCEEDED as they already can in copy_process(). Cc: peterz@infradead.org, tglx@linutronix.de, linux-kernel@vger.kernel.org, Ran Xiaokai , , , Link: https://lore.kernel.org/r/20210728072629.530435-1-ran.xiaokai@zte.com.cn Cc: Neil Brown Cc: Kees Cook Cc: Thomas Gleixner Cc: James Morris Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ran Xiaokai Signed-off-by: Christian Brauner --- kernel/sys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ef1a78f5d71c..72c7639e3c98 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -480,7 +480,8 @@ static int set_user(struct cred *new) * failure to the execve() stage. */ if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && - new_user != INIT_USER) + new_user != INIT_USER && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) current->flags |= PF_NPROC_EXCEEDED; else current->flags &= ~PF_NPROC_EXCEEDED; -- cgit v1.2.3 From ee9707e8593dfb9a375cf4793c3fd03d4142b463 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 11 Aug 2021 15:57:07 -0400 Subject: cgroup/cpuset: Enable memory migration for cpuset v2 When a user changes cpuset.cpus, each task in a v2 cpuset will be moved to one of the new cpus if it is not there already. For memory, however, they won't be migrated to the new nodes when cpuset.mems changes. This is an inconsistency in behavior. In cpuset v1, there is a memory_migrate control file to enable such behavior by setting the CS_MEMORY_MIGRATE flag. Make it the default for cpuset v2 so that we have a consistent set of behavior for both cpus and memory. There is certainly a cost to make memory migration the default, but it is a one time cost that shouldn't really matter as long as cpuset.mems isn't changed frequenty. Update the cgroup-v2.rst file to document the new behavior and recommend against changing cpuset.mems frequently. Since there won't be any concurrent access to the newly allocated cpuset structure in cpuset_css_alloc(), we can use the cheaper non-atomic __set_bit() instead of the more expensive atomic set_bit(). Signed-off-by: Waiman Long Acked-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index fcc11f2d3b5b..44d234b0df5e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2761,12 +2761,16 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); } - set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); nodes_clear(cs->mems_allowed); nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; + /* Set CS_MEMORY_MIGRATE for default hierarchy */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + __set_bit(CS_MEMORY_MIGRATE, &cs->flags); + return &cs->css; } -- cgit v1.2.3 From f1248dee954c2ddb0ece47a13591e5d55d422d22 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 13 Aug 2021 16:05:29 -0700 Subject: bpf: Allow bpf_get_netns_cookie in BPF_PROG_TYPE_CGROUP_SOCKOPT This is similar to existing BPF_PROG_TYPE_CGROUP_SOCK and BPF_PROG_TYPE_CGROUP_SOCK_ADDR. Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210813230530.333779-2-sdf@google.com --- kernel/bpf/cgroup.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b567ca46555c..9f6070369caa 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1846,11 +1846,29 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = { const struct bpf_prog_ops cg_sysctl_prog_ops = { }; +#ifdef CONFIG_NET +BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx) +{ + const struct net *net = ctx ? sock_net(ctx->sk) : &init_net; + + return net->net_cookie; +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = { + .func = bpf_get_netns_cookie_sockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; +#endif + static const struct bpf_func_proto * cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { #ifdef CONFIG_NET + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sockopt_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: -- cgit v1.2.3 From 2a047e0662aee1bd773e0415accd785ad26a9398 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 17:19:36 +0200 Subject: dma-mapping: return an unsigned int from dma_map_sg{,_attrs} These can only return 0 for failure or the number of entries, so turn the return value into an unsigned int. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe --- kernel/dma/mapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 967b62692102..7ee5284bff58 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -219,7 +219,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, * dma_unmap_sg_attrs() should be used to unmap the buffer with the * original sg and original nents (not the value returned by this funciton). */ -int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, +unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { int ret; -- cgit v1.2.3 From 3478cfcfcddff0f3aad82891be2992e51c4f7936 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 14 Aug 2021 10:57:16 +0900 Subject: bpf: Support "%c" in bpf_bprintf_prepare(). /proc/net/unix uses "%c" to print a single-byte character to escape '\0' in the name of the abstract UNIX domain socket. The following selftest uses it, so this patch adds support for "%c". Note that it does not support wide character ("%lc" and "%llc") for simplicity. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210814015718.42704-3-kuniyu@amazon.co.jp --- kernel/bpf/helpers.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 32761be48143..4e8540716187 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -907,6 +907,20 @@ fmt_str: tmp_buf += err; num_spec++; + continue; + } else if (fmt[i] == 'c') { + if (!tmp_buf) + goto nocopy_fmt; + + if (tmp_buf_end == tmp_buf) { + err = -ENOSPC; + goto out; + } + + *tmp_buf = raw_args[num_spec]; + tmp_buf++; + num_spec++; + continue; } -- cgit v1.2.3 From 2a14c9ae15a38148484a128b84bff7e9ffd90d68 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:33 -0700 Subject: params: lift param_set_uint_minmax to common code It is a useful helper hence move it to common code so others can enjoy it. Suggested-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- kernel/params.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 2daa2780a92c..8299bd764e42 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -243,6 +243,24 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); +int param_set_uint_minmax(const char *val, const struct kernel_param *kp, + unsigned int min, unsigned int max) +{ + unsigned int num; + int ret; + + if (!val) + return -EINVAL; + ret = kstrtouint(val, 0, &num); + if (ret) + return ret; + if (num < min || num > max) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; +} +EXPORT_SYMBOL_GPL(param_set_uint_minmax); + int param_set_charp(const char *val, const struct kernel_param *kp) { if (strlen(val) > 1024) { -- cgit v1.2.3 From 6fe7c745f2acb73e4cc961d7f91125eef5a8861f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:14 +0900 Subject: tracing/boot: Fix a hist trigger dependency for boot time tracing Fixes a build error when CONFIG_HIST_TRIGGERS=n with boot-time tracing. Since the trigger_process_regex() is defined only when CONFIG_HIST_TRIGGERS=y, if it is disabled, the 'actions' event option also must be disabled. Link: https://lkml.kernel.org/r/162856123376.203126.582144262622247352.stgit@devnote2 Fixes: 81a59555ff15 ("tracing/boot: Add per-event settings") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 94ef2d099e32..d713714cba67 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -205,12 +205,15 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, pr_err("Failed to apply filter: %s\n", buf); } - xbc_node_for_each_array_value(enode, "actions", anode, p) { - if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) - pr_err("action string is too long: %s\n", p); - else if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply an action: %s\n", buf); - } + if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) { + xbc_node_for_each_array_value(enode, "actions", anode, p) { + if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) + pr_err("action string is too long: %s\n", p); + else if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply an action: %s\n", buf); + } + } else if (xbc_node_find_value(enode, "actions", NULL)) + pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n"); if (xbc_node_find_value(enode, "enable", NULL)) { if (trace_event_enable_disable(file, 1, 0) < 0) -- cgit v1.2.3 From de9a48a360b70d5318061cf1237431d1869555e4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Jul 2021 17:36:24 -0400 Subject: tracing: Add linear buckets to histogram logic There's been several times I wished the histogram logic had a "grouping" feature for the buckets. Currently, each bucket has a size of one. That is, if you trace the amount of requested allocations, each allocation is its own bucket, even if you are interested in what allocates 100 bytes or less, 100 to 200, 200 to 300, etc. Also, without grouping, it fills up the allocated histogram buckets quickly. If you are tracking latency, and don't care if something is 200 microseconds off, or 201 microseconds off, but want to track them by say 10 microseconds each. This can not currently be done. There is a log2 but that grouping get's too big too fast for a lot of cases. Introduce a "buckets=SIZE" command to each field where it will record in a rounded number. For example: ># echo 'hist:keys=bytes_req.buckets=100:sort=bytes_req' > events/kmem/kmalloc/trigger ># cat events/kmem/kmalloc/hist # event histogram # # trigger info: hist:keys=bytes_req.buckets=100:vals=hitcount:sort=bytes_req.buckets=100:size=2048 [active] # { bytes_req: ~ 0-99 } hitcount: 3149 { bytes_req: ~ 100-199 } hitcount: 1468 { bytes_req: ~ 200-299 } hitcount: 39 { bytes_req: ~ 300-399 } hitcount: 306 { bytes_req: ~ 400-499 } hitcount: 364 { bytes_req: ~ 500-599 } hitcount: 32 { bytes_req: ~ 600-699 } hitcount: 69 { bytes_req: ~ 700-799 } hitcount: 37 { bytes_req: ~ 1200-1299 } hitcount: 16 { bytes_req: ~ 1400-1499 } hitcount: 30 { bytes_req: ~ 2000-2099 } hitcount: 6 { bytes_req: ~ 4000-4099 } hitcount: 2168 { bytes_req: ~ 5000-5099 } hitcount: 6 Totals: Hits: 7690 Entries: 13 Dropped: 0 Link: https://lkml.kernel.org/r/20210707213921.980359719@goodmis.org Acked-by: Namhyung Kim Reviewed-by: Tom Zanussi Reviewed-by: Masami Hiramatsu Tested-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 65 +++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a48aa2a2875b..8e87c4a429fd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -121,6 +121,7 @@ struct hist_field { unsigned int size; unsigned int offset; unsigned int is_signed; + unsigned long buckets; const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; @@ -219,6 +220,27 @@ static u64 hist_field_log2(struct hist_field *hist_field, return (u64) ilog2(roundup_pow_of_two(val)); } +static u64 hist_field_bucket(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand = hist_field->operands[0]; + unsigned long buckets = hist_field->buckets; + + u64 val = operand->fn(operand, elt, buffer, rbe, event); + + if (WARN_ON_ONCE(!buckets)) + return val; + + if (val >= LONG_MAX) + val = div64_ul(val, buckets); + else + val = (u64)((unsigned long)val / buckets); + return val * buckets; +} + static u64 hist_field_plus(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -318,6 +340,7 @@ enum hist_field_flags { HIST_FIELD_FL_VAR_REF = 1 << 14, HIST_FIELD_FL_CPU = 1 << 15, HIST_FIELD_FL_ALIAS = 1 << 16, + HIST_FIELD_FL_BUCKET = 1 << 17, }; struct var_defs { @@ -1109,7 +1132,8 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; else if (field->flags & HIST_FIELD_FL_LOG2 || - field->flags & HIST_FIELD_FL_ALIAS) + field->flags & HIST_FIELD_FL_ALIAS || + field->flags & HIST_FIELD_FL_BUCKET) field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_CPU) field_name = "common_cpu"; @@ -1470,6 +1494,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "syscall"; else if (hist_field->flags & HIST_FIELD_FL_LOG2) flags_str = "log2"; + else if (hist_field->flags & HIST_FIELD_FL_BUCKET) + flags_str = "buckets"; else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) flags_str = "usecs"; @@ -1658,9 +1684,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } - if (flags & HIST_FIELD_FL_LOG2) { - unsigned long fl = flags & ~HIST_FIELD_FL_LOG2; - hist_field->fn = hist_field_log2; + if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) { + unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET); + hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 : + hist_field_bucket; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL); @@ -1953,7 +1980,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, static struct ftrace_event_field * parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, - char *field_str, unsigned long *flags) + char *field_str, unsigned long *flags, unsigned long *buckets) { struct ftrace_event_field *field = NULL; char *field_name, *modifier, *str; @@ -1980,7 +2007,22 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_LOG2; else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; - else { + else if (strncmp(modifier, "bucket", 6) == 0) { + int ret; + + modifier += 6; + + if (*modifier == 's') + modifier++; + if (*modifier != '=') + goto error; + modifier++; + ret = kstrtoul(modifier, 0, buckets); + if (ret || !(*buckets)) + goto error; + *flags |= HIST_FIELD_FL_BUCKET; + } else { + error: hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); field = ERR_PTR(-EINVAL); goto out; @@ -2049,6 +2091,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str; struct ftrace_event_field *field = NULL; struct hist_field *hist_field = NULL; + unsigned long buckets = 0; int ret = 0; s = strchr(str, '.'); @@ -2086,7 +2129,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, } else str = s; - field = parse_field(hist_data, file, str, flags); + field = parse_field(hist_data, file, str, flags, &buckets); if (IS_ERR(field)) { ret = PTR_ERR(field); goto out; @@ -2097,6 +2140,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, ret = -ENOMEM; goto out; } + hist_field->buckets = buckets; return hist_field; out: @@ -4698,6 +4742,11 @@ static void hist_trigger_print_key(struct seq_file *m, } else if (key_field->flags & HIST_FIELD_FL_LOG2) { seq_printf(m, "%s: ~ 2^%-2llu", field_name, *(u64 *)(key + key_field->offset)); + } else if (key_field->flags & HIST_FIELD_FL_BUCKET) { + unsigned long buckets = key_field->buckets; + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: ~ %llu-%llu", field_name, + uval, uval + buckets -1); } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", field_name, (char *)(key + key_field->offset)); @@ -5137,6 +5186,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) seq_printf(m, ".%s", flags); } } + if (hist_field->buckets) + seq_printf(m, "=%ld", hist_field->buckets); } static int event_hist_trigger_print(struct seq_file *m, -- cgit v1.2.3 From 370364351926e4fcc7c1a486901bfaae0172b7d9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 7 Jul 2021 17:36:25 -0400 Subject: tracing/histogram: Update the documentation for the buckets modifier Update both the tracefs README file as well as the histogram.rst to include an explanation of what the buckets modifier is and how to use it. Include an example with the wakeup_latency example for both log2 and the buckets modifiers as there was no existing log2 example. Link: https://lkml.kernel.org/r/20210707213922.167218794@goodmis.org Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a1adb29ef5c1..be0169594de5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5654,6 +5654,7 @@ static const char readme_msg[] = "\t .execname display a common_pid as a program name\n" "\t .syscall display a syscall id as a syscall name\n" "\t .log2 display log2 value rather than raw number\n" + "\t .buckets=size display values in groups of size rather than raw number\n" "\t .usecs display a common_timestamp in microseconds\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" -- cgit v1.2.3 From 3347d80baa41c357cf263923f60aa8051a753d76 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Jul 2021 10:27:06 -0400 Subject: tracing: Have histogram types be constant when possible Instead of kstrdup("const", GFP_KERNEL), have the hist_field type simply assign the constant hist_field->type = "const"; And when the value passed to it is a variable, use "kstrdup_const(var, GFP_KERNEL);" which will just copy the value if the variable is already a constant. This saves on having to allocate when not needed. All frees of the hist_field->type will need to use kfree_const(). Link: https://lkml.kernel.org/r/20210722142837.280718447@goodmis.org Suggested-by: Masami Hiramatsu Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 8e87c4a429fd..bb466a82b938 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1616,7 +1616,9 @@ static void __destroy_hist_field(struct hist_field *hist_field) kfree(hist_field->var.name); kfree(hist_field->name); - kfree(hist_field->type); + + /* Can likely be a const */ + kfree_const(hist_field->type); kfree(hist_field->system); kfree(hist_field->event_name); @@ -1673,9 +1675,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (flags & HIST_FIELD_FL_HITCOUNT) { hist_field->fn = hist_field_counter; hist_field->size = sizeof(u64); - hist_field->type = kstrdup("u64", GFP_KERNEL); - if (!hist_field->type) - goto free; + hist_field->type = "u64"; goto out; } @@ -1690,7 +1690,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field_bucket; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; - hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL); + hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL); if (!hist_field->type) goto free; goto out; @@ -1699,18 +1699,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (flags & HIST_FIELD_FL_TIMESTAMP) { hist_field->fn = hist_field_timestamp; hist_field->size = sizeof(u64); - hist_field->type = kstrdup("u64", GFP_KERNEL); - if (!hist_field->type) - goto free; + hist_field->type = "u64"; goto out; } if (flags & HIST_FIELD_FL_CPU) { hist_field->fn = hist_field_cpu; hist_field->size = sizeof(int); - hist_field->type = kstrdup("unsigned int", GFP_KERNEL); - if (!hist_field->type) - goto free; + hist_field->type = "unsigned int"; goto out; } @@ -1723,7 +1719,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_STRING; hist_field->size = MAX_FILTER_STR_VAL; - hist_field->type = kstrdup(field->type, GFP_KERNEL); + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); if (!hist_field->type) goto free; @@ -1736,7 +1732,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } else { hist_field->size = field->size; hist_field->is_signed = field->is_signed; - hist_field->type = kstrdup(field->type, GFP_KERNEL); + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); if (!hist_field->type) goto free; @@ -1822,7 +1818,7 @@ static int init_var_ref(struct hist_field *ref_field, } } - ref_field->type = kstrdup(var_field->type, GFP_KERNEL); + ref_field->type = kstrdup_const(var_field->type, GFP_KERNEL); if (!ref_field->type) { err = -ENOMEM; goto free; @@ -2215,7 +2211,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, expr->operands[0] = operand1; expr->operator = FIELD_OP_UNARY_MINUS; expr->name = expr_str(expr, 0); - expr->type = kstrdup(operand1->type, GFP_KERNEL); + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); if (!expr->type) { ret = -ENOMEM; goto free; @@ -2355,7 +2351,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->operator = field_op; expr->name = expr_str(expr, 0); - expr->type = kstrdup(operand1->type, GFP_KERNEL); + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); if (!expr->type) { ret = -ENOMEM; goto free; @@ -2743,10 +2739,10 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data, var->var.hist_data = var->hist_data = hist_data; var->size = size; var->var.name = kstrdup(name, GFP_KERNEL); - var->type = kstrdup(type, GFP_KERNEL); + var->type = kstrdup_const(type, GFP_KERNEL); if (!var->var.name || !var->type) { + kfree_const(var->type); kfree(var->var.name); - kfree(var->type); kfree(var); var = ERR_PTR(-ENOMEM); } -- cgit v1.2.3 From ed2cf90735daf40ab8d938b4b6d3ca43c0f84466 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Jul 2021 10:27:07 -0400 Subject: tracing: Allow execnames to be passed as args for synthetic events Allow common_pid.execname to be saved in a variable in one histogram to be passed to another histogram that can pass it as a parameter to a synthetic event. ># echo 'hist:keys=pid:__arg__1=common_timestamp.usecs:arg2=common_pid.execname' \ > events/sched/sched_waking/trigger ># echo 'wakeup_lat s32 pid; u64 delta; char wake_comm[]' > synthetic_events ># echo 'hist:keys=next_pid:pid=next_pid,delta=common_timestamp.usecs-$__arg__1,exec=$arg2'\ ':onmatch(sched.sched_waking).trace(wakeup_lat,$pid,$delta,$exec)' \ > events/sched/sched_switch/trigger The above is a wake up latency synthetic event setup that passes the execname of the common_pid that woke the task to the scheduling of that task, which triggers a synthetic event that passes the original execname as a parameter to display it. ># echo 1 > events/synthetic/enable ># cat trace -0 [006] d..4 186.863801: wakeup_lat: pid=1306 delta=65 wake_comm=kworker/u16:3 -0 [000] d..4 186.863858: wakeup_lat: pid=163 delta=27 wake_comm= -0 [001] d..4 186.863903: wakeup_lat: pid=1307 delta=36 wake_comm=kworker/u16:4 -0 [000] d..4 186.863927: wakeup_lat: pid=163 delta=5 wake_comm= -0 [006] d..4 186.863957: wakeup_lat: pid=1306 delta=24 wake_comm=kworker/u16:3 sshd-1306 [006] d..4 186.864051: wakeup_lat: pid=61 delta=62 wake_comm= -0 [000] d..4 186.965030: wakeup_lat: pid=609 delta=18 wake_comm= -0 [006] d..4 186.987582: wakeup_lat: pid=1306 delta=65 wake_comm=kworker/u16:3 -0 [000] d..4 186.987639: wakeup_lat: pid=163 delta=27 wake_comm= Link: https://lkml.kernel.org/r/20210722142837.458596338@goodmis.org Reviewed-by: Tom Zanussi Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 46 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index bb466a82b938..9d91b1c06957 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1420,17 +1420,17 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) struct hist_trigger_data *hist_data = elt->map->private_data; unsigned int size = TASK_COMM_LEN; struct hist_elt_data *elt_data; - struct hist_field *key_field; + struct hist_field *hist_field; unsigned int i, n_str; elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); if (!elt_data) return -ENOMEM; - for_each_hist_key_field(i, hist_data) { - key_field = hist_data->fields[i]; + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; - if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + if (hist_field->flags & HIST_FIELD_FL_EXECNAME) { elt_data->comm = kzalloc(size, GFP_KERNEL); if (!elt_data->comm) { kfree(elt_data); @@ -3771,6 +3771,41 @@ static int create_val_field(struct hist_trigger_data *hist_data, return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); } +static const char *no_comm = "(no comm)"; + +static u64 hist_field_execname(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_elt_data *elt_data; + + if (WARN_ON_ONCE(!elt)) + return (u64)(unsigned long)no_comm; + + elt_data = elt->private_data; + + if (WARN_ON_ONCE(!elt_data->comm)) + return (u64)(unsigned long)no_comm; + + return (u64)(unsigned long)(elt_data->comm); +} + +/* Convert a var that points to common_pid.execname to a string */ +static void update_var_execname(struct hist_field *hist_field) +{ + hist_field->flags = HIST_FIELD_FL_STRING | HIST_FIELD_FL_VAR | + HIST_FIELD_FL_EXECNAME; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->is_signed = 0; + + kfree_const(hist_field->type); + hist_field->type = "char[]"; + + hist_field->fn = hist_field_execname; +} + static int create_var_field(struct hist_trigger_data *hist_data, unsigned int val_idx, struct trace_event_file *file, @@ -3795,6 +3830,9 @@ static int create_var_field(struct hist_trigger_data *hist_data, ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME) + update_var_execname(hist_data->fields[val_idx]); + if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; -- cgit v1.2.3 From de32951b29be3d6cc7a92bfbf366f48a9f4c4407 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 31 Jul 2021 14:22:31 +0900 Subject: tracing: Simplify the Kconfig dependency of FTRACE The entire FTRACE block is surrounded by 'if TRACING_SUPPORT' ... 'endif'. Using 'depends on' is a simpler way to guard FTRACE. Link: https://lkml.kernel.org/r/20210731052233.4703-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3ee23f4d437f..420ff4bc67fd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -135,10 +135,9 @@ config TRACING_SUPPORT depends on STACKTRACE_SUPPORT default y -if TRACING_SUPPORT - menuconfig FTRACE bool "Tracers" + depends on TRACING_SUPPORT default y if DEBUG_KERNEL help Enable the kernel tracing infrastructure. @@ -1037,6 +1036,3 @@ config HIST_TRIGGERS_DEBUG If unsure, say N. endif # FTRACE - -endif # TRACING_SUPPORT - -- cgit v1.2.3 From e66ed86ca6c52488249e95f7b3a6a3d7d6ab5e1e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:21 +0900 Subject: tracing/boot: Add per-event histogram action options Add a hist-trigger action syntax support to boot-time tracing. Currently, boot-time tracing supports per-event actions as option strings. However, for the histogram action, it has a special syntax and usually needs a long action definition. To make it readable and fit to the bootconfig syntax, this introduces a new options for histogram. Here are the histogram action options for boot-time tracing. ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { keys = [,...] values = [,...] sort = [,...] size = name = var { = ... } pause|continue|clear onmax|onchange { var = ; [= ] } onmatch { event = ; [= ] } filter = } Where is one of below; trace = , [, ...] save = [, ...] snapshot Link: https://lkml.kernel.org/r/162856124106.203126.10501871028479029087.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 231 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index d713714cba67..3d0e51368f51 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -171,6 +171,231 @@ trace_boot_add_synth_event(struct xbc_node *node, const char *event) } #endif +#ifdef CONFIG_HIST_TRIGGERS +static int __init __printf(3, 4) +append_printf(char **bufp, char *end, const char *fmt, ...) +{ + va_list args; + int ret; + + if (*bufp == end) + return -ENOSPC; + + va_start(args, fmt); + ret = vsnprintf(*bufp, end - *bufp, fmt, args); + if (ret < end - *bufp) { + *bufp += ret; + } else { + *bufp = end; + ret = -ERANGE; + } + va_end(args); + + return ret; +} + +static int __init +append_str_nospace(char **bufp, char *end, const char *str) +{ + char *p = *bufp; + int len; + + while (p < end - 1 && *str != '\0') { + if (!isspace(*str)) + *(p++) = *str; + str++; + } + *p = '\0'; + if (p == end - 1) { + *bufp = end; + return -ENOSPC; + } + len = p - *bufp; + *bufp = p; + return (int)len; +} + +static int __init +trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp, + char *end, const char *key) +{ + struct xbc_node *knode, *anode; + const char *p; + char sep; + + knode = xbc_node_find_child(hnode, key); + if (knode) { + anode = xbc_node_get_child(knode); + if (!anode) { + pr_err("hist.%s requires value(s).\n", key); + return -EINVAL; + } + + append_printf(bufp, end, ":%s", key); + sep = '='; + xbc_array_for_each_value(anode, p) { + append_printf(bufp, end, "%c%s", sep, p); + if (sep == '=') + sep = ','; + } + } else + return -ENOENT; + + return 0; +} + +static int __init +trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, + char *end, const char *param) +{ + struct xbc_node *knode, *anode; + const char *p; + char sep; + + /* Compose 'handler' parameter */ + p = xbc_node_find_value(hnode, param, NULL); + if (!p) { + pr_err("hist.%s requires '%s' option.\n", + xbc_node_get_data(hnode), param); + return -EINVAL; + } + append_printf(bufp, end, ":%s(%s)", xbc_node_get_data(hnode), p); + + /* Compose 'action' parameter */ + knode = xbc_node_find_child(hnode, "trace"); + if (!knode) + knode = xbc_node_find_child(hnode, "save"); + + if (knode) { + anode = xbc_node_get_child(knode); + if (!anode || !xbc_node_is_value(anode)) { + pr_err("hist.%s.%s requires value(s).\n", + xbc_node_get_data(hnode), + xbc_node_get_data(knode)); + return -EINVAL; + } + + append_printf(bufp, end, ".%s", xbc_node_get_data(knode)); + sep = '('; + xbc_array_for_each_value(anode, p) { + append_printf(bufp, end, "%c%s", sep, p); + if (sep == '(') + sep = ','; + } + append_printf(bufp, end, ")"); + } else if (xbc_node_find_child(hnode, "snapshot")) { + append_printf(bufp, end, ".snapshot()"); + } else { + pr_err("hist.%s requires an action.\n", + xbc_node_get_data(hnode)); + return -EINVAL; + } + + return 0; +} + +/* + * Histogram boottime tracing syntax. + * + * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { + * keys = [,...] + * values = [,...] + * sort = [,...] + * size = + * name = + * var { = ... } + * pause|continue|clear + * onmax|onchange { var = ; [= ] } + * onmatch { event = ; [= ] } + * filter = + * } + * + * Where are; + * + * trace = , [, ...] + * save = [, ...] + * snapshot + */ +static int __init +trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) +{ + struct xbc_node *node, *knode; + char *end = buf + size; + const char *p; + int ret = 0; + + append_printf(&buf, end, "hist"); + + ret = trace_boot_hist_add_array(hnode, &buf, end, "keys"); + if (ret < 0) { + if (ret == -ENOENT) + pr_err("hist requires keys.\n"); + return -EINVAL; + } + + ret = trace_boot_hist_add_array(hnode, &buf, end, "values"); + if (ret == -EINVAL) + return ret; + ret = trace_boot_hist_add_array(hnode, &buf, end, "sort"); + if (ret == -EINVAL) + return ret; + + p = xbc_node_find_value(hnode, "size", NULL); + if (p) + append_printf(&buf, end, ":size=%s", p); + + p = xbc_node_find_value(hnode, "name", NULL); + if (p) + append_printf(&buf, end, ":name=%s", p); + + node = xbc_node_find_child(hnode, "var"); + if (node) { + xbc_node_for_each_key_value(node, knode, p) { + /* Expression must not include spaces. */ + append_printf(&buf, end, ":%s=", + xbc_node_get_data(knode)); + append_str_nospace(&buf, end, p); + } + } + + /* Histogram control attributes (mutual exclusive) */ + if (xbc_node_find_child(hnode, "pause")) + append_printf(&buf, end, ":pause"); + else if (xbc_node_find_child(hnode, "continue")) + append_printf(&buf, end, ":continue"); + else if (xbc_node_find_child(hnode, "clear")) + append_printf(&buf, end, ":clear"); + + /* Histogram handler and actions */ + node = xbc_node_find_child(hnode, "onmax"); + if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + return -EINVAL; + node = xbc_node_find_child(hnode, "onchange"); + if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + return -EINVAL; + node = xbc_node_find_child(hnode, "onmatch"); + if (node && trace_boot_hist_add_handler(node, &buf, end, "event") < 0) + return -EINVAL; + + p = xbc_node_find_value(hnode, "filter", NULL); + if (p) + append_printf(&buf, end, " if %s", p); + + if (buf == end) { + pr_err("hist exceeds the max command length.\n"); + return -E2BIG; + } + + return 0; +} +#else +static int __init +trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) +{ + return -EOPNOTSUPP; +} +#endif + static void __init trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, struct xbc_node *enode) @@ -212,6 +437,12 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", buf); } + anode = xbc_node_find_child(enode, "hist"); + if (anode && + trace_boot_compose_hist_cmd(anode, buf, ARRAY_SIZE(buf)) == 0) { + if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply hist trigger: %s\n", buf); + } } else if (xbc_node_find_value(enode, "actions", NULL)) pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n"); -- cgit v1.2.3 From 8993665abcce793f00815b3504a486dce70cc2b9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:29 +0900 Subject: tracing/boot: Support multiple handlers for per-event histogram Support multiple handlers for per-event histogram in boot-time tracing. Since the histogram can register multiple same handler-actions with different parameters, this expands the syntax to support such cases. With this update, the 'onmax', 'onchange' and 'onmatch' handler subkeys under per-event histogram option will take a number subkeys optionally as below. (see [.N]) ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { onmax|onchange[.N] { var = ; [= ] } onmatch[.N] { event = ; [= ] } } The 'N' must be a digit (or digit started word). Thus user can add several handler-actions to the histogram, for example, ftrace.event.SOMEGROUP.SOMEEVENT.hist { keys = SOME_ID; lat = common_timestamp.usecs-$ts0 onmatch.1 { event = GROUP1.STARTEVENT1 trace = latency_event, SOME_ID, $lat } onmatch.2 { event = GROUP2.STARTEVENT2 trace = latency_event, SOME_ID, $lat } } Then, it can trace the elapsed time from GROUP1.STARTEVENT1 to SOMEGROUP.SOMEEVENT, and from GROUP2.STARTEVENT2 to SOMEGROUP.SOMEEVENT with SOME_ID key. Link: https://lkml.kernel.org/r/162856124905.203126.14913731908137885922.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 3d0e51368f51..f024f27b3602 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -245,8 +245,9 @@ trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp, } static int __init -trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, - char *end, const char *param) +trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, + char *end, const char *handler, + const char *param) { struct xbc_node *knode, *anode; const char *p; @@ -259,7 +260,7 @@ trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, xbc_node_get_data(hnode), param); return -EINVAL; } - append_printf(bufp, end, ":%s(%s)", xbc_node_get_data(hnode), p); + append_printf(bufp, end, ":%s(%s)", handler, p); /* Compose 'action' parameter */ knode = xbc_node_find_child(hnode, "trace"); @@ -294,6 +295,32 @@ trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, return 0; } +static int __init +trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, + char *end, const char *param) +{ + struct xbc_node *node; + const char *p, *handler; + int ret; + + handler = xbc_node_get_data(hnode); + + xbc_node_for_each_subkey(hnode, node) { + p = xbc_node_get_data(node); + if (!isdigit(p[0])) + continue; + /* All digit started node should be instances. */ + ret = trace_boot_hist_add_one_handler(node, bufp, end, handler, param); + if (ret < 0) + break; + } + + if (xbc_node_find_child(hnode, param)) + ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param); + + return ret; +} + /* * Histogram boottime tracing syntax. * @@ -305,8 +332,8 @@ trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp, * name = * var { = ... } * pause|continue|clear - * onmax|onchange { var = ; [= ] } - * onmatch { event = ; [= ] } + * onmax|onchange[.N] { var = ; [= ] } + * onmatch[.N] { event = ; [= ] } * filter = * } * @@ -368,13 +395,13 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) /* Histogram handler and actions */ node = xbc_node_find_child(hnode, "onmax"); - if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; node = xbc_node_find_child(hnode, "onchange"); - if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0) + if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; node = xbc_node_find_child(hnode, "onmatch"); - if (node && trace_boot_hist_add_handler(node, &buf, end, "event") < 0) + if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0) return -EINVAL; p = xbc_node_find_value(hnode, "filter", NULL); -- cgit v1.2.3 From 17abd7c36c77c393fa65cde462059395d6437dba Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:36 +0900 Subject: tracing/boot: Support multiple histograms for each event Add multiple histograms support for each event. This allows user to set multiple histograms to an event. ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] { ... } The 'N' is a digit started string and it can be omitted for the default histogram. For example, multiple hist triggers example in the Documentation/trace/histogram.rst can be written as below; ftrace.event.net.netif_receive_skb.hist { 1 { keys = skbaddr.hex values = len filter = len < 0 } 2 { keys = skbaddr.hex values = len filter = len > 4096 } 3 { keys = skbaddr.hex values = len filter = len == 256 } 4 { keys = skbaddr.hex values = len } 5 { keys = len values = common_preempt_count } } Link: https://lkml.kernel.org/r/162856125628.203126.15846930277378572120.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index f024f27b3602..1a2b270e9cda 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -324,7 +324,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, /* * Histogram boottime tracing syntax. * - * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist { + * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] { * keys = [,...] * values = [,...] * sort = [,...] @@ -415,11 +415,37 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) return 0; } + +static void __init +trace_boot_init_histograms(struct trace_event_file *file, + struct xbc_node *hnode, char *buf, size_t size) +{ + struct xbc_node *node; + const char *p; + + xbc_node_for_each_subkey(hnode, node) { + p = xbc_node_get_data(node); + if (!isdigit(p[0])) + continue; + /* All digit started node should be instances. */ + if (trace_boot_compose_hist_cmd(node, buf, size) == 0) { + if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply hist trigger: %s\n", buf); + } + } + + if (xbc_node_find_child(hnode, "keys")) { + if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) + if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply hist trigger: %s\n", buf); + } +} #else -static int __init -trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) +static void __init +trace_boot_init_histograms(struct trace_event_file *file, + struct xbc_node *hnode, char *buf, size_t size) { - return -EOPNOTSUPP; + /* do nothing */ } #endif @@ -465,11 +491,8 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, pr_err("Failed to apply an action: %s\n", buf); } anode = xbc_node_find_child(enode, "hist"); - if (anode && - trace_boot_compose_hist_cmd(anode, buf, ARRAY_SIZE(buf)) == 0) { - if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply hist trigger: %s\n", buf); - } + if (anode) + trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf)); } else if (xbc_node_find_value(enode, "actions", NULL)) pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n"); -- cgit v1.2.3 From 64dc7f6958ef56512137ed6ec228127cef7724e9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 10 Aug 2021 11:07:44 +0900 Subject: tracing/boot: Show correct histogram error command Since trigger_process_regex() modifies given trigger actions while parsing, the error message couldn't show what command was passed to the trigger_process_regex() when it returns an error. To fix that, show the backed up trigger action command instead of parsed buffer. Link: https://lkml.kernel.org/r/162856126413.203126.9465564928450701424.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 1a2b270e9cda..1060b0446032 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -422,6 +422,7 @@ trace_boot_init_histograms(struct trace_event_file *file, { struct xbc_node *node; const char *p; + char *tmp; xbc_node_for_each_subkey(hnode, node) { p = xbc_node_get_data(node); @@ -429,15 +430,20 @@ trace_boot_init_histograms(struct trace_event_file *file, continue; /* All digit started node should be instances. */ if (trace_boot_compose_hist_cmd(node, buf, size) == 0) { + tmp = kstrdup(buf, GFP_KERNEL); if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply hist trigger: %s\n", buf); + pr_err("Failed to apply hist trigger: %s\n", tmp); + kfree(tmp); } } if (xbc_node_find_child(hnode, "keys")) { - if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) + if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) { + tmp = kstrdup(buf, GFP_KERNEL); if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply hist trigger: %s\n", buf); + pr_err("Failed to apply hist trigger: %s\n", tmp); + kfree(tmp); + } } } #else @@ -488,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf)) pr_err("action string is too long: %s\n", p); else if (trigger_process_regex(file, buf) < 0) - pr_err("Failed to apply an action: %s\n", buf); + pr_err("Failed to apply an action: %s\n", p); } anode = xbc_node_find_child(enode, "hist"); if (anode) -- cgit v1.2.3 From bd74095389b378f2292ae32397b23f4b97ff23c8 Mon Sep 17 00:00:00 2001 From: zhaoxiao Date: Mon, 16 Aug 2021 13:24:30 +0800 Subject: tracepoint: Fix kerneldoc comments Fix function name in tracepoint.c kernel-doc comment to remove a warning found by clang_w1. kernel/tracepoint.c:589: warning: expecting prototype for register_tracepoint_notifier(). Prototype was for register_tracepoint_module_notifier() instead kernel/tracepoint.c:613: warning: expecting prototype for unregister_tracepoint_notifier(). Prototype was for unregister_tracepoint_module_notifier() instead Link: https://lkml.kernel.org/r/20210816052430.16539-1-zhaoxiao@uniontech.com Acked-by: Mathieu Desnoyers Signed-off-by: zhaoxiao Signed-off-by: Steven Rostedt (VMware) --- kernel/tracepoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index efd14c79fab4..64ea283f2f86 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -577,7 +577,7 @@ bool trace_module_has_bad_taint(struct module *mod) static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); /** - * register_tracepoint_notifier - register tracepoint coming/going notifier + * register_tracepoint_module_notifier - register tracepoint coming/going notifier * @nb: notifier block * * Notifiers registered with this function are called on module @@ -603,7 +603,7 @@ end: EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier); /** - * unregister_tracepoint_notifier - unregister tracepoint coming/going notifier + * unregister_tracepoint_module_notifier - unregister tracepoint coming/going notifier * @nb: notifier block * * The notifier block callback should expect a "struct tp_module" data -- cgit v1.2.3 From dbcfa7156f48ebb72dfc8f4e5d702af8ca1d4b3a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Aug 2021 18:44:42 -0700 Subject: PM: sleep: unmark 'state' functions as kernel-doc Fix kernel-doc warnings in kernel/power/main.c by unmarking the comment block as kernel-doc notation. This eliminates the following kernel-doc warnings: kernel/power/main.c:593: warning: expecting prototype for state(). Prototype was for state_show() instead kernel/power/main.c:593: warning: Function parameter or member 'kobj' not described in 'state_show' kernel/power/main.c:593: warning: Function parameter or member 'attr' not described in 'state_show' kernel/power/main.c:593: warning: Function parameter or member 'buf' not described in 'state_show' Signed-off-by: Randy Dunlap Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 12c7e1bb442f..44169f3081fd 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -577,7 +577,7 @@ static inline void pm_print_times_init(void) {} struct kobject *power_kobj; -/** +/* * state - control system sleep states. * * show() returns available sleep state labels, which may be "mem", "standby", -- cgit v1.2.3 From b2f6662ac08d0e7c25574ce53623c71bdae9dd78 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Aug 2021 21:14:31 +0100 Subject: PM: cpu: Make notifier chain use a raw_spinlock_t Invoking atomic_notifier_chain_notify() requires acquiring a spinlock_t, which can block under CONFIG_PREEMPT_RT. Notifications for members of the cpu_pm notification chain will be issued by the idle task, which can never block. Making *all* atomic_notifiers use a raw_spinlock is too big of a hammer, as only notifications issued by the idle task are problematic. Special-case cpu_pm_notifier_chain by kludging a raw_notifier and raw_spinlock_t together, matching the atomic_notifier behavior with a raw_spinlock_t. Fixes: 70d932985757 ("notifier: Fix broken error handling pattern") Signed-off-by: Valentin Schneider Acked-by: Sebastian Andrzej Siewior Signed-off-by: Rafael J. Wysocki --- kernel/cpu_pm.c | 50 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index f7e1d0eccdbc..246efc74e3f3 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -13,19 +13,32 @@ #include #include -static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain); +/* + * atomic_notifiers use a spinlock_t, which can block under PREEMPT_RT. + * Notifications for cpu_pm will be issued by the idle task itself, which can + * never block, IOW it requires using a raw_spinlock_t. + */ +static struct { + struct raw_notifier_head chain; + raw_spinlock_t lock; +} cpu_pm_notifier = { + .chain = RAW_NOTIFIER_INIT(cpu_pm_notifier.chain), + .lock = __RAW_SPIN_LOCK_UNLOCKED(cpu_pm_notifier.lock), +}; static int cpu_pm_notify(enum cpu_pm_event event) { int ret; /* - * atomic_notifier_call_chain has a RCU read critical section, which - * could be disfunctional in cpu idle. Copy RCU_NONIDLE code to let - * RCU know this. + * This introduces a RCU read critical section, which could be + * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know + * this. */ rcu_irq_enter_irqson(); - ret = atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL); + rcu_read_lock(); + ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL); + rcu_read_unlock(); rcu_irq_exit_irqson(); return notifier_to_errno(ret); @@ -33,10 +46,13 @@ static int cpu_pm_notify(enum cpu_pm_event event) static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down) { + unsigned long flags; int ret; rcu_irq_enter_irqson(); - ret = atomic_notifier_call_chain_robust(&cpu_pm_notifier_chain, event_up, event_down, NULL); + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); rcu_irq_exit_irqson(); return notifier_to_errno(ret); @@ -49,12 +65,17 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev * Add a driver to a list of drivers that are notified about * CPU and CPU cluster low power entry and exit. * - * This function may sleep, and has the same return conditions as - * raw_notifier_chain_register. + * This function has the same return conditions as raw_notifier_chain_register. */ int cpu_pm_register_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_chain_register(&cpu_pm_notifier.chain, nb); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); + return ret; } EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); @@ -64,12 +85,17 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); * * Remove a driver from the CPU PM notifier list. * - * This function may sleep, and has the same return conditions as - * raw_notifier_chain_unregister. + * This function has the same return conditions as raw_notifier_chain_unregister. */ int cpu_pm_unregister_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); + ret = raw_notifier_chain_unregister(&cpu_pm_notifier.chain, nb); + raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); + return ret; } EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); -- cgit v1.2.3 From 15538a20579fa4047912508b03b39bcdeec26b05 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 11 Aug 2021 21:14:32 +0100 Subject: notifier: Remove atomic_notifier_call_chain_robust() This now has no more users, remove it. Suggested-by: Sebastian Andrzej Siewior Signed-off-by: Valentin Schneider Acked-by: Sebastian Andrzej Siewior Signed-off-by: Rafael J. Wysocki --- kernel/notifier.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 1b019cbca594..b8251dc0bc0f 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -172,25 +172,6 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); -int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, - unsigned long val_up, unsigned long val_down, void *v) -{ - unsigned long flags; - int ret; - - /* - * Musn't use RCU; because then the notifier list can - * change between the up and down traversal. - */ - spin_lock_irqsave(&nh->lock, flags); - ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); - spin_unlock_irqrestore(&nh->lock, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(atomic_notifier_call_chain_robust); -NOKPROBE_SYMBOL(atomic_notifier_call_chain_robust); - /** * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain -- cgit v1.2.3 From fb7dd8bca0139fd73d3f4a6cd257b11731317ded Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:54 -0700 Subject: bpf: Refactor BPF_PROG_RUN into a function Turn BPF_PROG_RUN into a proper always inlined function. No functional and performance changes are intended, but it makes it much easier to understand what's going on with how BPF programs are actually get executed. It's more obvious what types and callbacks are expected. Also extra () around input parameters can be dropped, as well as `__` variable prefixes intended to avoid naming collisions, which makes the code simpler to read and write. This refactoring also highlighted one extra issue. BPF_PROG_RUN is both a macro and an enum value (BPF_PROG_RUN == BPF_PROG_TEST_RUN). Turning BPF_PROG_RUN into a function causes naming conflict compilation error. So rename BPF_PROG_RUN into lower-case bpf_prog_run(), similar to bpf_prog_run_xdp(), bpf_prog_run_pin_on_cpu(), etc. All existing callers of BPF_PROG_RUN, the macro, are switched to bpf_prog_run() explicitly. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-2-andrii@kernel.org --- kernel/bpf/bpf_iter.c | 2 +- kernel/bpf/cgroup.c | 16 ++++++++-------- kernel/bpf/core.c | 2 +- kernel/bpf/trampoline.c | 2 +- kernel/bpf/verifier.c | 2 +- kernel/events/core.c | 2 +- kernel/trace/bpf_trace.c | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 2e9d47bb40ff..b2ee45064e06 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -686,7 +686,7 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) rcu_read_lock(); migrate_disable(); - ret = BPF_PROG_RUN(prog, ctx); + ret = bpf_prog_run(prog, ctx); migrate_enable(); rcu_read_unlock(); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9f6070369caa..16dc467adfa0 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1043,7 +1043,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1091,7 +1091,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, - BPF_PROG_RUN, flags); + bpf_prog_run, flags); return ret == 1 ? 0 : -EPERM; } @@ -1121,7 +1121,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int ret; ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, - BPF_PROG_RUN); + bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1140,7 +1140,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, - BPF_PROG_RUN); + bpf_prog_run); rcu_read_unlock(); return !allow; @@ -1271,7 +1271,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, bpf_prog_run); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1386,7 +1386,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], - &ctx, BPF_PROG_RUN); + &ctx, bpf_prog_run); release_sock(sk); if (!ret) { @@ -1496,7 +1496,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, BPF_PROG_RUN); + &ctx, bpf_prog_run); release_sock(sk); if (!ret) { @@ -1557,7 +1557,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, */ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, BPF_PROG_RUN); + &ctx, bpf_prog_run); if (!ret) return -EPERM; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 82af6279992d..5ee2ec27c3d4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1879,7 +1879,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. - * The BPF program will be executed via BPF_PROG_RUN() macro. + * The BPF program will be executed via bpf_prog_run() function. * * Return: the &fp argument along with &err set to 0 for success or * a negative errno code on failure diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index b2535acfe9db..fe1e857324e6 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -548,7 +548,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog) u64_stats_update_end(&stats->syncp); } -/* The logic is similar to BPF_PROG_RUN, but with an explicit +/* The logic is similar to bpf_prog_run(), but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into * call __bpf_prog_enter diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5ea2238a6656..f5a0077c9981 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12383,7 +12383,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; - /* BPF_PROG_RUN doesn't call subprogs directly, + /* bpf_prog_run() doesn't call subprogs directly, * hence main prog stats include the runtime of subprogs. * subprogs don't have IDs and not reachable via prog_get_next_id * func[i]->stats will never be accessed and stays NULL diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cb1f9b8392e..7d20743b48e1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9913,7 +9913,7 @@ static void bpf_overflow_handler(struct perf_event *event, if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); - ret = BPF_PROG_RUN(event->prog, &ctx); + ret = bpf_prog_run(event->prog, &ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0da94e1d6af9..05a5a556671d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -124,7 +124,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ - ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, bpf_prog_run); out: __this_cpu_dec(bpf_prog_active); @@ -1816,7 +1816,7 @@ void __bpf_trace_run(struct bpf_prog *prog, u64 *args) { cant_sleep(); rcu_read_lock(); - (void) BPF_PROG_RUN(prog, args); + (void) bpf_prog_run(prog, args); rcu_read_unlock(); } -- cgit v1.2.3 From 7d08c2c9117113fee118487425ed55efa50cbfa9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:55 -0700 Subject: bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions Similar to BPF_PROG_RUN, turn BPF_PROG_RUN_ARRAY macros into proper functions with all the same readability and maintainability benefits. Making them into functions required shuffling around bpf_set_run_ctx/bpf_reset_run_ctx functions. Also, explicitly specifying the type of the BPF prog run callback required adjusting __bpf_prog_run_save_cb() to accept const void *, casted internally to const struct sk_buff. Further, split out a cgroup-specific BPF_PROG_RUN_ARRAY_CG and BPF_PROG_RUN_ARRAY_CG_FLAGS from the more generic BPF_PROG_RUN_ARRAY due to the differences in bpf_run_ctx used for those two different use cases. I think BPF_PROG_RUN_ARRAY_CG would benefit from further refactoring to accept struct cgroup and enum bpf_attach_type instead of bpf_prog_array, fetching cgrp->bpf.effective[type] and RCU-dereferencing it internally. But that required including include/linux/cgroup-defs.h, which I wasn't sure is ok with everyone. The remaining generic BPF_PROG_RUN_ARRAY function will be extended to pass-through user-provided context value in the next patch. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-3-andrii@kernel.org --- kernel/bpf/cgroup.c | 32 ++++++++++++++++---------------- kernel/trace/bpf_trace.c | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 16dc467adfa0..a1dedba4c174 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1012,8 +1012,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); } else { - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - __bpf_prog_run_save_cb); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], skb, + __bpf_prog_run_save_cb); ret = (ret == 1 ? 0 : -EPERM); } bpf_restore_data_end(skb, saved_data_end); @@ -1043,7 +1043,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sk, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1090,8 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, - bpf_prog_run, flags); + ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[type], &ctx, + bpf_prog_run, flags); return ret == 1 ? 0 : -EPERM; } @@ -1120,8 +1120,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, - bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sock_ops, + bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1139,8 +1139,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, - bpf_prog_run); + allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, + bpf_prog_run); rcu_read_unlock(); return !allow; @@ -1271,7 +1271,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, bpf_prog_run); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1385,8 +1385,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], - &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + &ctx, bpf_prog_run); release_sock(sk); if (!ret) { @@ -1495,8 +1495,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, bpf_prog_run); release_sock(sk); if (!ret) { @@ -1556,8 +1556,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, bpf_prog_run); if (!ret) return -EPERM; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 05a5a556671d..91867b14b222 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -124,7 +124,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ - ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run); out: __this_cpu_dec(bpf_prog_active); -- cgit v1.2.3 From 652c1b17b85b9c195978c051aa283027529db1fe Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:56 -0700 Subject: bpf: Refactor perf_event_set_bpf_prog() to use struct bpf_prog input Make internal perf_event_set_bpf_prog() use struct bpf_prog pointer as an input argument, which makes it easier to re-use for other internal uses (coming up for BPF link in the next patch). BPF program FD is not as convenient and in some cases it's not available. So switch to struct bpf_prog, move out refcounting outside and let caller do bpf_prog_put() in case of an error. This follows the approach of most of the other BPF internal functions. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-4-andrii@kernel.org --- kernel/events/core.c | 61 ++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7d20743b48e1..2f07718bd41c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5574,7 +5574,7 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); +static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); @@ -5637,7 +5637,22 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon return perf_event_set_filter(event, (void __user *)arg); case PERF_EVENT_IOC_SET_BPF: - return perf_event_set_bpf_prog(event, arg); + { + struct bpf_prog *prog; + int err; + + prog = bpf_prog_get(arg); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = perf_event_set_bpf_prog(event, prog); + if (err) { + bpf_prog_put(prog); + return err; + } + + return 0; + } case PERF_EVENT_IOC_PAUSE_OUTPUT: { struct perf_buffer *rb; @@ -9923,10 +9938,8 @@ out: event->orig_overflow_handler(event, data, regs); } -static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) { - struct bpf_prog *prog; - if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ return -EINVAL; @@ -9934,9 +9947,8 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) if (event->prog) return -EEXIST; - prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); - if (IS_ERR(prog)) - return PTR_ERR(prog); + if (prog->type != BPF_PROG_TYPE_PERF_EVENT) + return -EINVAL; if (event->attr.precise_ip && prog->call_get_stack && @@ -9952,7 +9964,6 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) * attached to perf_sample_data, do not allow attaching BPF * program that calls bpf_get_[stack|stackid]. */ - bpf_prog_put(prog); return -EPROTO; } @@ -9974,7 +9985,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event) bpf_prog_put(prog); } #else -static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) { return -EOPNOTSUPP; } @@ -10002,14 +10013,12 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { bool is_kprobe, is_tracepoint, is_syscall_tp; - struct bpf_prog *prog; - int ret; if (!perf_event_is_tracing(event)) - return perf_event_set_bpf_handler(event, prog_fd); + return perf_event_set_bpf_handler(event, prog); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; @@ -10018,38 +10027,24 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; - prog = bpf_prog_get(prog_fd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || - (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { - /* valid fd, but invalid bpf program type */ - bpf_prog_put(prog); + (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) return -EINVAL; - } /* Kprobe override only works for kprobes, not uprobes. */ if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { - bpf_prog_put(prog); + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) return -EINVAL; - } if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); - if (prog->aux->max_ctx_offset > off) { - bpf_prog_put(prog); + if (prog->aux->max_ctx_offset > off) return -EACCES; - } } - ret = perf_event_attach_bpf_prog(event, prog); - if (ret) - bpf_prog_put(prog); - return ret; + return perf_event_attach_bpf_prog(event, prog); } static void perf_event_free_bpf_prog(struct perf_event *event) @@ -10071,7 +10066,7 @@ static void perf_event_free_filter(struct perf_event *event) { } -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { return -ENOENT; } -- cgit v1.2.3 From b89fbfbb854c9afc3047e8273cc3a694650b802e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:57 -0700 Subject: bpf: Implement minimal BPF perf link Introduce a new type of BPF link - BPF perf link. This brings perf_event-based BPF program attachments (perf_event, tracepoints, kprobes, and uprobes) into the common BPF link infrastructure, allowing to list all active perf_event based attachments, auto-detaching BPF program from perf_event when link's FD is closed, get generic BPF link fdinfo/get_info functionality. BPF_LINK_CREATE command expects perf_event's FD as target_fd. No extra flags are currently supported. Force-detaching and atomic BPF program updates are not yet implemented, but with perf_event-based BPF links we now have common framework for this without the need to extend ioctl()-based perf_event interface. One interesting consideration is a new value for bpf_attach_type, which BPF_LINK_CREATE command expects. Generally, it's either 1-to-1 mapping from bpf_attach_type to bpf_prog_type, or many-to-1 mapping from a subset of bpf_attach_types to one bpf_prog_type (e.g., see BPF_PROG_TYPE_SK_SKB or BPF_PROG_TYPE_CGROUP_SOCK). In this case, though, we have three different program types (KPROBE, TRACEPOINT, PERF_EVENT) using the same perf_event-based mechanism, so it's many bpf_prog_types to one bpf_attach_type. I chose to define a single BPF_PERF_EVENT attach type for all of them and adjust link_create()'s logic for checking correspondence between attach type and program type. The alternative would be to define three new attach types (e.g., BPF_KPROBE, BPF_TRACEPOINT, and BPF_PERF_EVENT), but that seemed like unnecessary overkill and BPF_KPROBE will cause naming conflicts with BPF_KPROBE() macro, defined by libbpf. I chose to not do this to avoid unnecessary proliferation of bpf_attach_type enum values and not have to deal with naming conflicts. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-5-andrii@kernel.org --- kernel/bpf/syscall.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++---- kernel/events/core.c | 10 ++--- 2 files changed, 102 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9a2068e39d23..80c03bedd6e6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2906,6 +2906,79 @@ static const struct bpf_link_ops bpf_raw_tp_link_lops = { .fill_link_info = bpf_raw_tp_link_fill_link_info, }; +#ifdef CONFIG_PERF_EVENTS +struct bpf_perf_link { + struct bpf_link link; + struct file *perf_file; +}; + +static void bpf_perf_link_release(struct bpf_link *link) +{ + struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); + struct perf_event *event = perf_link->perf_file->private_data; + + perf_event_free_bpf_prog(event); + fput(perf_link->perf_file); +} + +static void bpf_perf_link_dealloc(struct bpf_link *link) +{ + struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); + + kfree(perf_link); +} + +static const struct bpf_link_ops bpf_perf_link_lops = { + .release = bpf_perf_link_release, + .dealloc = bpf_perf_link_dealloc, +}; + +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_perf_link *link; + struct perf_event *event; + struct file *perf_file; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + perf_file = perf_event_get(attr->link_create.target_fd); + if (IS_ERR(perf_file)) + return PTR_ERR(perf_file); + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_file; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); + link->perf_file = perf_file; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_file; + } + + event = perf_file->private_data; + err = perf_event_set_bpf_prog(event, prog); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_file; + } + /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ + bpf_prog_inc(prog); + + return bpf_link_settle(&link_primer); + +out_put_file: + fput(perf_file); + return err; +} +#endif /* CONFIG_PERF_EVENTS */ + #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) @@ -4147,15 +4220,26 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) if (ret) goto out; - if (prog->type == BPF_PROG_TYPE_EXT) { + switch (prog->type) { + case BPF_PROG_TYPE_EXT: ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; - } - - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { - ret = -EINVAL; - goto out; + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + if (attr->link_create.attach_type != BPF_PERF_EVENT) { + ret = -EINVAL; + goto out; + } + ptype = prog->type; + break; + default: + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { + ret = -EINVAL; + goto out; + } + break; } switch (ptype) { @@ -4179,6 +4263,13 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; +#endif +#ifdef CONFIG_PERF_EVENTS + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_KPROBE: + ret = bpf_perf_link_attach(attr, prog); + break; #endif default: ret = -EINVAL; diff --git a/kernel/events/core.c b/kernel/events/core.c index 2f07718bd41c..9fd65667bcb2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4697,7 +4697,6 @@ errout: } static void perf_event_free_filter(struct perf_event *event); -static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -5574,7 +5573,6 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); @@ -10013,7 +10011,7 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { bool is_kprobe, is_tracepoint, is_syscall_tp; @@ -10047,7 +10045,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *pr return perf_event_attach_bpf_prog(event, prog); } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); @@ -10066,12 +10064,12 @@ static void perf_event_free_filter(struct perf_event *event) { } -static int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { return -ENOENT; } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ -- cgit v1.2.3 From 82e6b1eee6a8875ef4eacfd60711cce6965c6b04 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:58 -0700 Subject: bpf: Allow to specify user-provided bpf_cookie for BPF perf links Add ability for users to specify custom u64 value (bpf_cookie) when creating BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event, tracepoints). This is useful for cases when the same BPF program is used for attaching and processing invocation of different tracepoints/kprobes/uprobes in a generic fashion, but such that each invocation is distinguished from each other (e.g., BPF program can look up additional information associated with a specific kernel function without having to rely on function IP lookups). This enables new use cases to be implemented simply and efficiently that previously were possible only through code generation (and thus multiple instances of almost identical BPF program) or compilation at runtime (BCC-style) on target hosts (even more expensive resource-wise). For uprobes it is not even possible in some cases to know function IP before hand (e.g., when attaching to shared library without PID filtering, in which case base load address is not known for a library). This is done by storing u64 bpf_cookie in struct bpf_prog_array_item, corresponding to each attached and run BPF program. Given cgroup BPF programs already use two 8-byte pointers for their needs and cgroup BPF programs don't have (yet?) support for bpf_cookie, reuse that space through union of cgroup_storage and new bpf_cookie field. Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx. This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF program execution code, which luckily is now also split from BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper giving access to this user-provided cookie value from inside a BPF program. Generic perf_event BPF programs will access this value from perf_event itself through passed in BPF program context. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org --- kernel/bpf/core.c | 29 ++++++++++++++++++----------- kernel/bpf/syscall.c | 2 +- kernel/events/core.c | 21 ++++++++++++++------- kernel/trace/bpf_trace.c | 8 +++++--- 4 files changed, 38 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5ee2ec27c3d4..91f24c7b38a1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2119,13 +2119,13 @@ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, + u64 bpf_cookie, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; - struct bpf_prog_array_item *existing; + struct bpf_prog_array_item *existing, *new; struct bpf_prog_array *array; bool found_exclude = false; - int new_prog_idx = 0; /* Figure out how many existing progs we need to carry over to * the new array. @@ -2162,20 +2162,27 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); if (!array) return -ENOMEM; + new = array->items; /* Fill in the new prog array */ if (carry_prog_cnt) { existing = old_array->items; - for (; existing->prog; existing++) - if (existing->prog != exclude_prog && - existing->prog != &dummy_bpf_prog.prog) { - array->items[new_prog_idx++].prog = - existing->prog; - } + for (; existing->prog; existing++) { + if (existing->prog == exclude_prog || + existing->prog == &dummy_bpf_prog.prog) + continue; + + new->prog = existing->prog; + new->bpf_cookie = existing->bpf_cookie; + new++; + } } - if (include_prog) - array->items[new_prog_idx++].prog = include_prog; - array->items[new_prog_idx].prog = NULL; + if (include_prog) { + new->prog = include_prog; + new->bpf_cookie = bpf_cookie; + new++; + } + new->prog = NULL; *new_array = array; return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 80c03bedd6e6..7420e1334ab2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2963,7 +2963,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro } event = perf_file->private_data; - err = perf_event_set_bpf_prog(event, prog); + err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); if (err) { bpf_link_cleanup(&link_primer); goto out_put_file; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9fd65667bcb2..2d1e63dd97f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5643,7 +5643,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon if (IS_ERR(prog)) return PTR_ERR(prog); - err = perf_event_set_bpf_prog(event, prog); + err = perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; @@ -9936,7 +9936,9 @@ out: event->orig_overflow_handler(event, data, regs); } -static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { if (event->overflow_handler_context) /* hw breakpoint or kernel counter */ @@ -9966,6 +9968,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog } event->prog = prog; + event->bpf_cookie = bpf_cookie; event->orig_overflow_handler = READ_ONCE(event->overflow_handler); WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); return 0; @@ -9983,7 +9986,9 @@ static void perf_event_free_bpf_handler(struct perf_event *event) bpf_prog_put(prog); } #else -static int perf_event_set_bpf_handler(struct perf_event *event, struct bpf_prog *prog) +static int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { return -EOPNOTSUPP; } @@ -10011,12 +10016,13 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, + u64 bpf_cookie) { bool is_kprobe, is_tracepoint, is_syscall_tp; if (!perf_event_is_tracing(event)) - return perf_event_set_bpf_handler(event, prog); + return perf_event_set_bpf_handler(event, prog, bpf_cookie); is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; @@ -10042,7 +10048,7 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) return -EACCES; } - return perf_event_attach_bpf_prog(event, prog); + return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } void perf_event_free_bpf_prog(struct perf_event *event) @@ -10064,7 +10070,8 @@ static void perf_event_free_filter(struct perf_event *event) { } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, + u64 bpf_cookie) { return -ENOENT; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 91867b14b222..57879d28f824 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1675,7 +1675,8 @@ static DEFINE_MUTEX(bpf_event_mutex); #define BPF_TRACE_MAX_PROGS 64 int perf_event_attach_bpf_prog(struct perf_event *event, - struct bpf_prog *prog) + struct bpf_prog *prog, + u64 bpf_cookie) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; @@ -1702,12 +1703,13 @@ int perf_event_attach_bpf_prog(struct perf_event *event, goto unlock; } - ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array); if (ret < 0) goto unlock; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; + event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free(old_array); @@ -1728,7 +1730,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); - ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); if (ret == -ENOENT) goto unlock; if (ret < 0) { -- cgit v1.2.3 From 7adfc6c9b315e174cf8743b21b7b691c8766791b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:59 -0700 Subject: bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs to get access to a user-provided bpf_cookie value, specified during BPF program attachment (BPF link creation) time. Naming is hard, though. With the concept being named "BPF cookie", I've considered calling the helper: - bpf_get_cookie() -- seems too unspecific and easily mistaken with socket cookie; - bpf_get_bpf_cookie() -- too much tautology; - bpf_get_link_cookie() -- would be ok, but while we create a BPF link to attach BPF program to BPF hook, it's still an "attachment" and the bpf_cookie is associated with BPF program attachment to a hook, not a BPF link itself. Technically, we could support bpf_cookie with old-style cgroup programs.So I ultimately rejected it in favor of bpf_get_attach_cookie(). Currently all perf_event-backed BPF program types support bpf_get_attach_cookie() helper. Follow-up patches will add support for fentry/fexit programs as well. While at it, mark bpf_tracing_func_proto() as static to make it obvious that it's only used from within the kernel/trace/bpf_trace.c. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org --- kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 57879d28f824..cbc73c08c4a4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -975,7 +975,34 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { .arg1_type = ARG_PTR_TO_CTX, }; -const struct bpf_func_proto * +BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = { + .func = bpf_get_attach_cookie_trace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx) +{ + return ctx->event->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { + .func = bpf_get_attach_cookie_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { @@ -1109,6 +1136,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_kprobe; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1219,6 +1248,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -1326,6 +1357,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: return &bpf_read_branch_records_proto; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_pe; default: return bpf_tracing_func_proto(func_id, prog); } -- cgit v1.2.3 From a2071573d6346819cc4e5787b4206f2184985160 Mon Sep 17 00:00:00 2001 From: Jia He Date: Tue, 3 Aug 2021 12:59:36 +0200 Subject: sysctl: introduce new proc handler proc_dobool This is to let bool variable could be correctly displayed in big/little endian sysctl procfs. sizeof(bool) is arch dependent, proc_dobool should work in all arches. Suggested-by: Pan Xinhui Signed-off-by: Jia He [thuth: rebased the patch to the current kernel version] Signed-off-by: Thomas Huth Signed-off-by: Chuck Lever --- kernel/sysctl.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 272f4a272f8c..25e49b4d8049 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -536,6 +536,21 @@ static void proc_put_char(void **buf, size_t *size, char c) } } +static int do_proc_dobool_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *(bool *)valp = *lvalp; + } else { + int val = *(bool *)valp; + + *lvalp = (unsigned long)val; + *negp = false; + } + return 0; +} + static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) @@ -798,6 +813,26 @@ static int do_proc_douintvec(struct ctl_table *table, int write, buffer, lenp, ppos, conv, data); } +/** + * proc_dobool - read/write a bool + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dobool(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dobool_conv, NULL); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -1630,6 +1665,12 @@ int proc_dostring(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dobool(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3425,6 +3466,7 @@ int __init sysctl_init(void) * No sense putting this after each symbol definition, twice, * exception granted :-) */ +EXPORT_SYMBOL(proc_dobool); EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); -- cgit v1.2.3 From f97a4a1a3f8769e3452885967955e21c88f3f263 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:34 +0800 Subject: workqueue: Rename "delayed" (delayed by active management) to "inactive" There are two kinds of "delayed" work items in workqueue subsystem. One is for timer-delayed work items which are visible to workqueue users. The other kind is for work items delayed by active management which can not be directly visible to workqueue users. We mixed the word "delayed" for both kinds and caused somewhat ambiguity. This patch renames the later one (delayed by active management) to "inactive", because it is used for workqueue active management and most of its related symbols are named with "active" or "activate". All "delayed" and "DELAYED" are carefully checked and renamed one by one to avoid accidentally changing the name of the other kind for timer-delayed. No functional change intended. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 58 +++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9bce39dba297..9a00ba096032 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -207,7 +207,7 @@ struct pool_workqueue { /* L: nr of in_flight works */ int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ - struct list_head delayed_works; /* L: delayed works */ + struct list_head inactive_works; /* L: inactive works */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ @@ -1136,7 +1136,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) } } -static void pwq_activate_delayed_work(struct work_struct *work) +static void pwq_activate_inactive_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -1144,16 +1144,16 @@ static void pwq_activate_delayed_work(struct work_struct *work) if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); + __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work)); pwq->nr_active++; } -static void pwq_activate_first_delayed(struct pool_workqueue *pwq) +static void pwq_activate_first_inactive(struct pool_workqueue *pwq) { - struct work_struct *work = list_first_entry(&pwq->delayed_works, + struct work_struct *work = list_first_entry(&pwq->inactive_works, struct work_struct, entry); - pwq_activate_delayed_work(work); + pwq_activate_inactive_work(work); } /** @@ -1176,10 +1176,10 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) pwq->nr_in_flight[color]--; pwq->nr_active--; - if (!list_empty(&pwq->delayed_works)) { - /* one down, submit a delayed one */ + if (!list_empty(&pwq->inactive_works)) { + /* one down, submit an inactive one */ if (pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); + pwq_activate_first_inactive(pwq); } /* is flush in progress and are we at the flushing tip? */ @@ -1281,14 +1281,14 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, debug_work_deactivate(work); /* - * A delayed work item cannot be grabbed directly because + * An inactive work item cannot be grabbed directly because * it might have linked NO_COLOR work items which, if left - * on the delayed_list, will confuse pwq->nr_active + * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Make sure the work * item is activated before grabbing. */ - if (*work_data_bits(work) & WORK_STRUCT_DELAYED) - pwq_activate_delayed_work(work); + if (*work_data_bits(work) & WORK_STRUCT_INACTIVE) + pwq_activate_inactive_work(work); list_del_init(&work->entry); pwq_dec_nr_in_flight(pwq, get_work_color(work)); @@ -1490,8 +1490,8 @@ retry: if (list_empty(worklist)) pwq->pool->watchdog_ts = jiffies; } else { - work_flags |= WORK_STRUCT_DELAYED; - worklist = &pwq->delayed_works; + work_flags |= WORK_STRUCT_INACTIVE; + worklist = &pwq->inactive_works; } debug_work_activate(work); @@ -2530,7 +2530,7 @@ repeat: /* * The above execution of rescued work items could * have created more to rescue through - * pwq_activate_first_delayed() or chained + * pwq_activate_first_inactive() or chained * queueing. Let's put @pwq back on mayday list so * that such back-to-back work items, which may be * being used to relieve memory pressure, don't @@ -2956,7 +2956,7 @@ reflush: bool drained; raw_spin_lock_irq(&pwq->pool->lock); - drained = !pwq->nr_active && list_empty(&pwq->delayed_works); + drained = !pwq->nr_active && list_empty(&pwq->inactive_works); raw_spin_unlock_irq(&pwq->pool->lock); if (drained) @@ -3712,7 +3712,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * @pwq: target pool_workqueue * * If @pwq isn't freezing, set @pwq->max_active to the associated - * workqueue's saved_max_active and activate delayed work items + * workqueue's saved_max_active and activate inactive work items * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. */ static void pwq_adjust_max_active(struct pool_workqueue *pwq) @@ -3741,9 +3741,9 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = wq->saved_max_active; - while (!list_empty(&pwq->delayed_works) && + while (!list_empty(&pwq->inactive_works) && pwq->nr_active < pwq->max_active) { - pwq_activate_first_delayed(pwq); + pwq_activate_first_inactive(pwq); kick = true; } @@ -3774,7 +3774,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; - INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); @@ -4361,7 +4361,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) return true; - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) return true; return false; @@ -4557,7 +4557,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) else pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - ret = !list_empty(&pwq->delayed_works); + ret = !list_empty(&pwq->inactive_works); preempt_enable(); rcu_read_unlock(); @@ -4753,11 +4753,11 @@ static void show_pwq(struct pool_workqueue *pwq) pr_cont("\n"); } - if (!list_empty(&pwq->delayed_works)) { + if (!list_empty(&pwq->inactive_works)) { bool comma = false; - pr_info(" delayed:"); - list_for_each_entry(work, &pwq->delayed_works, entry) { + pr_info(" inactive:"); + list_for_each_entry(work, &pwq->inactive_works, entry) { pr_cont_work(comma, work); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } @@ -4787,7 +4787,7 @@ void show_workqueue_state(void) bool idle = true; for_each_pwq(pwq, wq) { - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { idle = false; break; } @@ -4799,7 +4799,7 @@ void show_workqueue_state(void) for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) show_pwq(pwq); raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* @@ -5182,7 +5182,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu_safe); * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their delayed_works list instead of + * workqueues will queue new works to their inactive_works list instead of * pool->worklist. * * CONTEXT: -- cgit v1.2.3 From c4560c2c88a4c809800ba8e76faabaf80bf6ee89 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:35 +0800 Subject: workqueue: Change arguement of pwq_dec_nr_in_flight() Make pwq_dec_nr_in_flight() use work_data rather just work_color. Prepare for later patch to get WORK_STRUCT_INACTIVE bit from work_data in pwq_dec_nr_in_flight(). No functional change intended. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9a00ba096032..41796000f3eb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -579,9 +579,9 @@ static unsigned int work_color_to_flags(int color) return color << WORK_STRUCT_COLOR_SHIFT; } -static int get_work_color(struct work_struct *work) +static int get_work_color(unsigned long work_data) { - return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + return (work_data >> WORK_STRUCT_COLOR_SHIFT) & ((1 << WORK_STRUCT_COLOR_BITS) - 1); } @@ -1159,7 +1159,7 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq) /** * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight * @pwq: pwq of interest - * @color: color of work which left the queue + * @work_data: work_data of work which left the queue * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. @@ -1167,8 +1167,10 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq) * CONTEXT: * raw_spin_lock_irq(pool->lock). */ -static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) +static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data) { + int color = get_work_color(work_data); + /* uncolored work items don't participate in flushing or nr_active */ if (color == WORK_NO_COLOR) goto out_put; @@ -1291,7 +1293,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, pwq_activate_inactive_work(work); list_del_init(&work->entry); - pwq_dec_nr_in_flight(pwq, get_work_color(work)); + pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); /* work->data points to pwq iff queued, point to pool */ set_work_pool_and_keep_pending(work, pool->id); @@ -2172,7 +2174,7 @@ __acquires(&pool->lock) struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; - int work_color; + unsigned long work_data; struct worker *collision; #ifdef CONFIG_LOCKDEP /* @@ -2208,7 +2210,7 @@ __acquires(&pool->lock) worker->current_work = work; worker->current_func = work->func; worker->current_pwq = pwq; - work_color = get_work_color(work); + work_data = *work_data_bits(work); /* * Record wq name for cmdline and debug reporting, may get @@ -2314,7 +2316,7 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; - pwq_dec_nr_in_flight(pwq, work_color); + pwq_dec_nr_in_flight(pwq, work_data); } /** -- cgit v1.2.3 From d21cece0dbb424ad3ff9e49bde6954632b8efede Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:36 +0800 Subject: workqueue: Change the code of calculating work_flags in insert_wq_barrier() Add a local var @work_flags to calculate work_flags step by step, so that we don't need to squeeze several flags in only the last line of code. Parepare for next patch to add a bit to barrier work item's flag. Not squshing this to next patch makes it clear that what it will have changed. No functional change intended. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41796000f3eb..84fd2a8f56aa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2659,8 +2659,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { + unsigned int work_flags = work_color_to_flags(WORK_NO_COLOR); struct list_head *head; - unsigned int linked = 0; /* * debugobject calls are safe here even with pool->lock locked @@ -2686,13 +2686,12 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, head = target->entry.next; /* there can already be other linked works, inherit and set */ - linked = *bits & WORK_STRUCT_LINKED; + work_flags |= *bits & WORK_STRUCT_LINKED; __set_bit(WORK_STRUCT_LINKED_BIT, bits); } debug_work_activate(&barr->work); - insert_work(pwq, &barr->work, head, - work_color_to_flags(WORK_NO_COLOR) | linked); + insert_work(pwq, &barr->work, head, work_flags); } /** -- cgit v1.2.3 From 018f3a13dd6300701103f268b6bfec0a56beea57 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:37 +0800 Subject: workqueue: Mark barrier work with WORK_STRUCT_INACTIVE Currently, WORK_NO_COLOR has two meanings: Not participate in flushing Not participate in nr_active And only non-barrier work items are marked with WORK_STRUCT_INACTIVE when they are in inactive_works list. The barrier work items are not marked INACTIVE even linked in inactive_works list since these tail items are always moved together with the head work item. These definitions are simple, clean and practical. (Except a small blemish that only the first meaning of WORK_NO_COLOR is documented in include/linux/workqueue.h while both meanings are in workqueue.c) But dual-purpose WORK_NO_COLOR used for barrier work items has proven to be problematical[1]. Only the second purpose is obligatory. So we plan to make barrier work items participate in flushing but keep them still not participating in nr_active. So the plan is to mark barrier work items inactive without using WORK_NO_COLOR in this patch so that we can assign a flushing color to them in next patch. The reasonable way is to add or reuse a bit in work data of the work item. But adding a bit will double the size of pool_workqueue. Currently, WORK_STRUCT_INACTIVE is only used in try_to_grab_pending() for user-queued work items and try_to_grab_pending() can't work for barrier work items. So we extend WORK_STRUCT_INACTIVE to also mark barrier work items no matter which list they are in because we don't need to determind which list a barrier work item is in. So the meaning of WORK_STRUCT_INACTIVE becomes just "the work items don't participate in nr_active" (no matter whether it is a barrier work item or a user-queued work item). And WORK_STRUCT_INACTIVE for user-queued work items means they are in inactive_works list. This patch does it by setting WORK_STRUCT_INACTIVE for barrier work items in insert_wq_barrier() and checking WORK_STRUCT_INACTIVE first in pwq_dec_nr_in_flight(). And the meaning of WORK_NO_COLOR is reduced to only "not participating in flushing". There is no functionality change intended in this patch. Because WORK_NO_COLOR+WORK_STRUCT_INACTIVE represents the previous WORK_NO_COLOR in meaning and try_to_grab_pending() doesn't use for barrier work items and avoids being confused by this extended WORK_STRUCT_INACTIVE. A bunch of comment for nr_active & WORK_STRUCT_INACTIVE is also added for documenting how WORK_STRUCT_INACTIVE works in nr_active management. [1]: https://lore.kernel.org/lkml/20210812083814.32453-1-lizhe.67@bytedance.com/ Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 84fd2a8f56aa..6657bcbd2064 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -205,6 +205,23 @@ struct pool_workqueue { int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ + + /* + * nr_active management and WORK_STRUCT_INACTIVE: + * + * When pwq->nr_active >= max_active, new work item is queued to + * pwq->inactive_works instead of pool->worklist and marked with + * WORK_STRUCT_INACTIVE. + * + * All work items marked with WORK_STRUCT_INACTIVE do not participate + * in pwq->nr_active and all work items in pwq->inactive_works are + * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE + * work items are in pwq->inactive_works. Some of them are ready to + * run in pool->worklist or worker->scheduled. Those work itmes are + * only struct wq_barrier which is used for flush_work() and should + * not participate in pwq->nr_active. For non-barrier work item, it + * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. + */ int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head inactive_works; /* L: inactive works */ @@ -1171,19 +1188,21 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_ { int color = get_work_color(work_data); - /* uncolored work items don't participate in flushing or nr_active */ + if (!(work_data & WORK_STRUCT_INACTIVE)) { + pwq->nr_active--; + if (!list_empty(&pwq->inactive_works)) { + /* one down, submit an inactive one */ + if (pwq->nr_active < pwq->max_active) + pwq_activate_first_inactive(pwq); + } + } + + /* uncolored work items don't participate in flushing */ if (color == WORK_NO_COLOR) goto out_put; pwq->nr_in_flight[color]--; - pwq->nr_active--; - if (!list_empty(&pwq->inactive_works)) { - /* one down, submit an inactive one */ - if (pwq->nr_active < pwq->max_active) - pwq_activate_first_inactive(pwq); - } - /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) goto out_put; @@ -1283,6 +1302,10 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, debug_work_deactivate(work); /* + * A cancelable inactive work item must be in the + * pwq->inactive_works since a queued barrier can't be + * canceled (see the comments in insert_wq_barrier()). + * * An inactive work item cannot be grabbed directly because * it might have linked NO_COLOR work items which, if left * on the inactive_works list, will confuse pwq->nr_active @@ -2675,6 +2698,9 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, barr->task = current; + /* The barrier work item does not participate in pwq->nr_active. */ + work_flags |= WORK_STRUCT_INACTIVE; + /* * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. -- cgit v1.2.3 From d812796eb3906424cd3c0eea530983961e2f88bd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Aug 2021 09:32:38 +0800 Subject: workqueue: Assign a color to barrier work items There was no strong reason to or not to flush barrier work items in flush_workqueue(). And we have to make barrier work items not participate in nr_active so we had been using WORK_NO_COLOR for them which also makes them can't be flushed by flush_workqueue(). And the users of flush_workqueue() often do not intend to wait barrier work items issued by flush_work(). That made the choice sound perfect. But barrier work items have reference to internal structure (pool_workqueue) and the worker thread[s] is/are still busy for the workqueue user when the barrrier work items are not done. So it is reasonable to make flush_workqueue() also watch for flush_work() to make it more robust. And a problem[1] reported by Li Zhe shows that we need such robustness. The warning logs are listed below: WARNING: CPU: 0 PID: 19336 at kernel/workqueue.c:4430 destroy_workqueue+0x11a/0x2f0 ***** destroy_workqueue: test_workqueue9 has the following busy pwq pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=0/1 refcnt=2 in-flight: 5658:wq_barrier_func Showing busy workqueues and worker pools: ***** It shows that even after drain_workqueue() returns, the barrier work item is still in flight and the pwq (and a worker) is still busy on it. The problem is caused by flush_workqueue() not watching flush_work(): Thread A Worker /* normal work item with linked */ process_scheduled_works() destroy_workqueue() process_one_work() drain_workqueue() /* run normal work item */ /-- pwq_dec_nr_in_flight() flush_workqueue() <---/ /* the last normal work item is done */ sanity_check process_one_work() /-- raw_spin_unlock_irq(&pool->lock) raw_spin_lock_irq(&pool->lock) <-/ /* maybe preempt */ *WARNING* wq_barrier_func() /* maybe preempt by cond_resched() */ Thread A can get the pool lock after the Worker unlocks the pool lock before running wq_barrier_func(). And if there is any preemption happen around wq_barrier_func(), destroy_workqueue()'s sanity check is more likely to get the lock and catch it. (Note: preemption is not necessary to cause the bug, the unlocking is enough to possibly trigger the WARNING.) A simple solution might be just executing all linked barrier work items once without releasing pool lock after the head work item's pwq_dec_nr_in_flight(). But this solution has two problems: 1) the head work item might also be barrier work item when the user-queued work item is cancelled. For example: thread 1: thread 2: queue_work(wq, &my_work) flush_work(&my_work) cancel_work_sync(&my_work); /* Neiter my_work nor the barrier work is scheduled. */ destroy_workqueue(wq); /* This is an easier way to catch the WARNING. */ 2) there might be too much linked barrier work items and running them all once without releasing pool lock just causes trouble. The only solution is to make flush_workqueue() aslo watch barrier work items. So we have to assign a color to these barrier work items which is the color of the head (user-queued) work item. Assigning a color doesn't cause any problem in ative management, because the prvious patch made barrier work items not participate in nr_active via WORK_STRUCT_INACTIVE rather than reliance on the (old) WORK_NO_COLOR. [1]: https://lore.kernel.org/lkml/20210812083814.32453-1-lizhe.67@bytedance.com/ Reported-by: Li Zhe Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 20 ++++++++++++-------- kernel/workqueue_internal.h | 3 ++- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6657bcbd2064..33a6b4a2443d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1197,10 +1197,6 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_ } } - /* uncolored work items don't participate in flushing */ - if (color == WORK_NO_COLOR) - goto out_put; - pwq->nr_in_flight[color]--; /* is flush in progress and are we at the flushing tip? */ @@ -1307,7 +1303,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, * canceled (see the comments in insert_wq_barrier()). * * An inactive work item cannot be grabbed directly because - * it might have linked NO_COLOR work items which, if left + * it might have linked barrier work items which, if left * on the inactive_works list, will confuse pwq->nr_active * management later on and cause stall. Make sure the work * item is activated before grabbing. @@ -2234,6 +2230,7 @@ __acquires(&pool->lock) worker->current_func = work->func; worker->current_pwq = pwq; work_data = *work_data_bits(work); + worker->current_color = get_work_color(work_data); /* * Record wq name for cmdline and debug reporting, may get @@ -2339,6 +2336,7 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; + worker->current_color = INT_MAX; pwq_dec_nr_in_flight(pwq, work_data); } @@ -2682,7 +2680,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, struct wq_barrier *barr, struct work_struct *target, struct worker *worker) { - unsigned int work_flags = work_color_to_flags(WORK_NO_COLOR); + unsigned int work_flags = 0; + unsigned int work_color; struct list_head *head; /* @@ -2705,17 +2704,22 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, * If @target is currently being executed, schedule the * barrier to the worker; otherwise, put it after @target. */ - if (worker) + if (worker) { head = worker->scheduled.next; - else { + work_color = worker->current_color; + } else { unsigned long *bits = work_data_bits(target); head = target->entry.next; /* there can already be other linked works, inherit and set */ work_flags |= *bits & WORK_STRUCT_LINKED; + work_color = get_work_color(*bits); __set_bit(WORK_STRUCT_LINKED_BIT, bits); } + pwq->nr_in_flight[work_color]++; + work_flags |= work_color_to_flags(work_color); + debug_work_activate(&barr->work); insert_work(pwq, &barr->work, head, work_flags); } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 498de0e909a4..e00b1204a8e9 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -30,7 +30,8 @@ struct worker { struct work_struct *current_work; /* L: work being processed */ work_func_t current_func; /* L: current_work's fn */ - struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + unsigned int current_color; /* L: current_work's color */ struct list_head scheduled; /* L: scheduled works */ /* 64 bytes boundary on 64bit, 32 on 32bit */ -- cgit v1.2.3 From 99c37d1a63eafcd3673302a7953df760b46d0f6f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:19 +0200 Subject: tracing: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Link: https://lkml.kernel.org/r/20210803141621.780504-37-bigeasy@linutronix.de Cc: Peter Zijlstra Cc: Ingo Molnar Acked-by: Daniel Bristot de Oliveira Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 8 ++++---- kernel/trace/trace_hwlat.c | 28 ++++++++++++++-------------- kernel/trace/trace_osnoise.c | 16 ++++++++-------- 3 files changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e592d1df6f88..c5a3fbf19617 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2111,7 +2111,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } } - get_online_cpus(); + cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary @@ -2143,7 +2143,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cpu_buffer->nr_pages_to_update = 0; } - put_online_cpus(); + cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; @@ -2171,7 +2171,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, goto out_err; } - get_online_cpus(); + cpus_read_lock(); /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) @@ -2183,7 +2183,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } cpu_buffer->nr_pages_to_update = 0; - put_online_cpus(); + cpus_read_unlock(); } out: diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 14f46aae1981..1b83d75eb103 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -325,10 +325,10 @@ static void move_to_next_cpu(void) if (!cpumask_equal(current_mask, current->cpus_ptr)) goto change_mode; - get_online_cpus(); + cpus_read_lock(); cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); next_cpu = cpumask_next(raw_smp_processor_id(), current_mask); - put_online_cpus(); + cpus_read_unlock(); if (next_cpu >= nr_cpu_ids) next_cpu = cpumask_first(current_mask); @@ -398,7 +398,7 @@ static void stop_single_kthread(void) struct hwlat_kthread_data *kdata = get_cpu_data(); struct task_struct *kthread; - get_online_cpus(); + cpus_read_lock(); kthread = kdata->kthread; if (!kthread) @@ -408,7 +408,7 @@ static void stop_single_kthread(void) kdata->kthread = NULL; out_put_cpus: - put_online_cpus(); + cpus_read_unlock(); } @@ -425,14 +425,14 @@ static int start_single_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; - get_online_cpus(); + cpus_read_lock(); if (kdata->kthread) goto out_put_cpus; kthread = kthread_create(kthread_fn, NULL, "hwlatd"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); - put_online_cpus(); + cpus_read_unlock(); return -ENOMEM; } @@ -452,7 +452,7 @@ static int start_single_kthread(struct trace_array *tr) wake_up_process(kthread); out_put_cpus: - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -479,10 +479,10 @@ static void stop_per_cpu_kthreads(void) { unsigned int cpu; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) stop_cpu_kthread(cpu); - put_online_cpus(); + cpus_read_unlock(); } /* @@ -515,7 +515,7 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) mutex_lock(&trace_types_lock); mutex_lock(&hwlat_data.lock); - get_online_cpus(); + cpus_read_lock(); if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU) goto out_unlock; @@ -526,7 +526,7 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) start_cpu_kthread(cpu); out_unlock: - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&hwlat_data.lock); mutex_unlock(&trace_types_lock); } @@ -582,7 +582,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr) unsigned int cpu; int retval; - get_online_cpus(); + cpus_read_lock(); /* * Run only on CPUs in which hwlat is allowed to run. */ @@ -596,12 +596,12 @@ static int start_per_cpu_kthreads(struct trace_array *tr) if (retval) goto out_error; } - put_online_cpus(); + cpus_read_unlock(); return 0; out_error: - put_online_cpus(); + cpus_read_unlock(); stop_per_cpu_kthreads(); return retval; } diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index b61eefe5ccf5..65b08b8e5bf8 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1498,12 +1498,12 @@ static void stop_per_cpu_kthreads(void) { int cpu; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) stop_kthread(cpu); - put_online_cpus(); + cpus_read_unlock(); } /* @@ -1551,7 +1551,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr) int retval; int cpu; - get_online_cpus(); + cpus_read_lock(); /* * Run only on CPUs in which trace and osnoise are allowed to run. */ @@ -1572,7 +1572,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr) } } - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -1590,7 +1590,7 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) goto out_unlock_trace; mutex_lock(&interface_lock); - get_online_cpus(); + cpus_read_lock(); if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) goto out_unlock; @@ -1601,7 +1601,7 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) start_kthread(cpu); out_unlock: - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&interface_lock); out_unlock_trace: mutex_unlock(&trace_types_lock); @@ -1743,11 +1743,11 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, /* * osnoise_cpumask is read by CPU hotplug operations. */ - get_online_cpus(); + cpus_read_lock(); cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&interface_lock); if (running) -- cgit v1.2.3 From 8cacfc85b615cc0bae01241593c4b25da6570efc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Aug 2021 18:08:42 +0100 Subject: bpf: Remove redundant initialization of variable allow The variable allow is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210817170842.495440-1-colin.king@canonical.com --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a1dedba4c174..9f35928bab0a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1135,7 +1135,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, .major = major, .minor = minor, }; - int allow = 1; + int allow; rcu_read_lock(); cgrp = task_dfl_cgroup(current); -- cgit v1.2.3 From faf4ef823ac5f3b6a34a73b76c52895dee3dce55 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jun 2021 14:21:16 +0200 Subject: dma-direct: add support for dma_coherent_default_memory Add an option to allocate uncached memory for dma_alloc_coherent from the global dma_coherent_default_memory. This will allow to move arm-nommu (and eventually other platforms) to use generic code for allocating uncached memory from a pre-populated pool. Note that this is a different pool from the one that platforms that can remap at runtime use for GFP_ATOMIC allocations for now, although there might be opportunities to eventually end up with a common codebase for the two use cases. Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- kernel/dma/Kconfig | 4 ++++ kernel/dma/direct.c | 15 +++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 77b405508743..725cfd51762b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -93,6 +93,10 @@ config DMA_COHERENT_POOL select GENERIC_ALLOCATOR bool +config DMA_GLOBAL_POOL + select DMA_DECLARE_COHERENT + bool + config DMA_REMAP bool depends on MMU diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f33ceb68aef2..8dca4f97d12d 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -156,9 +156,14 @@ void *dma_direct_alloc(struct device *dev, size_t size, if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && !dev_is_dma_coherent(dev)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && + !dev_is_dma_coherent(dev)) + return dma_alloc_from_global_coherent(dev, size, dma_handle); + /* * Remapping or decrypting memory may block. If either is required and * we can't block, allocate the memory from the atomic pools. @@ -255,11 +260,19 @@ void dma_direct_free(struct device *dev, size_t size, if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && !dev_is_dma_coherent(dev)) { arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); return; } + if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && + !dev_is_dma_coherent(dev)) { + if (!dma_release_from_global_coherent(page_order, cpu_addr)) + WARN_ON_ONCE(1); + return; + } + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) @@ -462,6 +475,8 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; + if (dma_mmap_from_global_coherent(vma, cpu_addr, size, &ret)) + return ret; if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff) return -ENXIO; -- cgit v1.2.3 From 70d6aa0ecfed253a2b14659a6c77359af6d9b3ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 19:38:19 +0200 Subject: dma-mapping: allow using the global coherent pool for !ARM Switch an ifdef so that the global coherent pool is initialized for any architecture that selects the DMA_GLOBAL_POOL symbol insted of hardcoding ARM. Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- kernel/dma/coherent.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 794e76b03b34..67b126afac5a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -361,7 +361,9 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem) pr_err("Reserved memory: regions without no-map are not yet supported\n"); return -EINVAL; } +#endif +#ifdef CONFIG_DMA_GLOBAL_POOL if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) { WARN(dma_reserved_default_memory, "Reserved memory: region for default DMA coherent area is redefined\n"); -- cgit v1.2.3 From a6933571f34a9aee843fff2aa4b96949c57d6274 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jun 2021 16:00:04 +0200 Subject: dma-mapping: simplify dma_init_coherent_memory Return the allocated dma_coherent_mem structure, set the use_dma_pfn_offset and print the failure warning inside of dma_init_coherent_memory instead of leaving that to the callers. Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- kernel/dma/coherent.c | 78 ++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 67b126afac5a..ab397ebfd5ad 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -37,51 +37,44 @@ static inline dma_addr_t dma_get_device_base(struct device *dev, return mem->device_base; } -static int dma_init_coherent_memory(phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, - struct dma_coherent_mem **mem) +static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, bool use_dma_pfn_offset) { - struct dma_coherent_mem *dma_mem = NULL; - void *mem_base = NULL; + struct dma_coherent_mem *dma_mem; int pages = size >> PAGE_SHIFT; int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - int ret; + void *mem_base; - if (!size) { - ret = -EINVAL; - goto out; - } + if (!size) + return ERR_PTR(-EINVAL); mem_base = memremap(phys_addr, size, MEMREMAP_WC); - if (!mem_base) { - ret = -EINVAL; - goto out; - } + if (!mem_base) + return ERR_PTR(-EINVAL); + dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dma_mem) { - ret = -ENOMEM; - goto out; - } + if (!dma_mem) + goto out_unmap_membase; dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dma_mem->bitmap) { - ret = -ENOMEM; - goto out; - } + if (!dma_mem->bitmap) + goto out_free_dma_mem; dma_mem->virt_base = mem_base; dma_mem->device_base = device_addr; dma_mem->pfn_base = PFN_DOWN(phys_addr); dma_mem->size = pages; + dma_mem->use_dev_dma_pfn_offset = use_dma_pfn_offset; spin_lock_init(&dma_mem->spinlock); - *mem = dma_mem; - return 0; + return dma_mem; -out: +out_free_dma_mem: kfree(dma_mem); - if (mem_base) - memunmap(mem_base); - return ret; +out_unmap_membase: + memunmap(mem_base); + pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %zd MiB\n", + &phys_addr, size / SZ_1M); + return ERR_PTR(-ENOMEM); } static void dma_release_coherent_memory(struct dma_coherent_mem *mem) @@ -130,9 +123,9 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, struct dma_coherent_mem *mem; int ret; - ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem); - if (ret) - return ret; + mem = dma_init_coherent_memory(phys_addr, device_addr, size, false); + if (IS_ERR(mem)) + return PTR_ERR(mem); ret = dma_assign_coherent_memory(dev, mem); if (ret) @@ -319,21 +312,16 @@ static struct reserved_mem *dma_reserved_default_memory __initdata; static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) { - struct dma_coherent_mem *mem = rmem->priv; - int ret; - - if (!mem) { - ret = dma_init_coherent_memory(rmem->base, rmem->base, - rmem->size, &mem); - if (ret) { - pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", - &rmem->base, (unsigned long)rmem->size / SZ_1M); - return ret; - } + if (!rmem->priv) { + struct dma_coherent_mem *mem; + + mem = dma_init_coherent_memory(rmem->base, rmem->base, + rmem->size, true); + if (IS_ERR(mem)) + return PTR_ERR(mem); + rmem->priv = mem; } - mem->use_dev_dma_pfn_offset = true; - rmem->priv = mem; - dma_assign_coherent_memory(dev, mem); + dma_assign_coherent_memory(dev, rmem->priv); return 0; } -- cgit v1.2.3 From 39a2d3506b2d53c569a6db13d65b2f3728c4feec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jun 2021 16:05:05 +0200 Subject: dma-mapping: add a dma_init_global_coherent helper Add a new helper to initialize the global coherent pool. This both cleans up the existing initialization which indirects through the reserved_mem_ops that are normally only used for struct device, and also allows using the global pool for non-devicetree architectures. Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- kernel/dma/coherent.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index ab397ebfd5ad..160d4e246ecb 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -300,6 +300,18 @@ int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, vaddr, size, ret); } +int dma_init_global_coherent(phys_addr_t phys_addr, size_t size) +{ + struct dma_coherent_mem *mem; + + mem = dma_init_coherent_memory(phys_addr, phys_addr, size, true); + if (IS_ERR(mem)) + return PTR_ERR(mem); + dma_coherent_default_memory = mem; + pr_info("DMA: default coherent area is set\n"); + return 0; +} + /* * Support for reserved memory regions defined in device tree */ @@ -367,26 +379,10 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem) static int __init dma_init_reserved_memory(void) { - const struct reserved_mem_ops *ops; - int ret; - if (!dma_reserved_default_memory) return -ENOMEM; - - ops = dma_reserved_default_memory->ops; - - /* - * We rely on rmem_dma_device_init() does not propagate error of - * dma_assign_coherent_memory() for "NULL" device. - */ - ret = ops->device_init(dma_reserved_default_memory, NULL); - - if (!ret) { - dma_coherent_default_memory = dma_reserved_default_memory->priv; - pr_info("DMA: default coherent area is set\n"); - } - - return ret; + return dma_init_global_coherent(dma_reserved_default_memory->base, + dma_reserved_default_memory->size); } core_initcall(dma_init_reserved_memory); -- cgit v1.2.3 From 8b0e6c744fef6462382041b30878c91f15069fc6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:56 -0400 Subject: tracing: Add DYNAMIC flag for dynamic events To differentiate between static and dynamic events, add a new flag DYNAMIC to the event flags that all dynamic events have set. This will allow to differentiate when attaching to a dynamic event from a static event. Static events have a mod pointer that references the module they were created in (or NULL for core kernel). This can be incremented when the event has something attached to it. But there exists no such mechanism for dynamic events. This is dangerous as the dynamic events may now disappear without the "attachment" knowing that it no longer exists. To enforce the dynamic flag, change dyn_event_add() to pass the event that is being created such that it can set the DYNAMIC flag of the event. This helps make sure that no location that creates a dynamic event misses setting this flag. Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035026.936958254@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_dynevent.h | 4 +++- kernel/trace/trace_events_synth.c | 2 +- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_uprobe.c | 4 ++-- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 7754936b57ee..936477a111d3 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -76,13 +76,15 @@ int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops) return 0; } -static inline int dyn_event_add(struct dyn_event *ev) +static inline int dyn_event_add(struct dyn_event *ev, + struct trace_event_call *call) { lockdep_assert_held(&event_mutex); if (!ev || !ev->ops) return -EINVAL; + call->flags |= TRACE_EVENT_FL_DYNAMIC; list_add_tail(&ev->list, &dyn_event_list); return 0; } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 9315fc03e303..f4f5489e1e28 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1298,7 +1298,7 @@ static int __create_synth_event(const char *name, const char *raw_fields) } ret = register_synth_event(event); if (!ret) - dyn_event_add(&event->devent); + dyn_event_add(&event->devent, &event->call); else free_synth_event(event); out: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ea6178cb5e33..bfef43bfce37 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -618,7 +618,7 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to) if (ret) trace_probe_unlink(&tk->tp); else - dyn_event_add(&tk->devent); + dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); return ret; } @@ -661,7 +661,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) if (ret < 0) unregister_kprobe_event(tk); else - dyn_event_add(&tk->devent); + dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); end: mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9b50869a5ddb..50eca53b8d22 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -455,7 +455,7 @@ static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to) /* Append to existing event */ ret = trace_probe_append(&tu->tp, &to->tp); if (!ret) - dyn_event_add(&tu->devent); + dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); return ret; } @@ -518,7 +518,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) goto end; } - dyn_event_add(&tu->devent); + dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); end: mutex_unlock(&event_mutex); -- cgit v1.2.3 From 1d18538e6a09265003a0a94ca779d7a6127cb76c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:57 -0400 Subject: tracing: Have dynamic events have a ref counter As dynamic events are not created by modules, if something is attached to one, calling "try_module_get()" on its "mod" field, is not going to keep the dynamic event from going away. Since dynamic events do not need the "mod" pointer of the event structure, make a union out of it in order to save memory (there's one structure for each of the thousand+ events in the kernel), and have any event with the DYNAMIC flag set to use a ref counter instead. Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035027.174869074@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_dynevent.c | 38 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace_event_perf.c | 6 +++--- kernel/trace/trace_events.c | 22 +++++++++++++-------- kernel/trace/trace_events_synth.c | 19 ++++++++++++------- kernel/trace/trace_events_trigger.c | 6 +++--- kernel/trace/trace_kprobe.c | 4 ++++ kernel/trace/trace_uprobe.c | 4 ++++ 8 files changed, 80 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be0169594de5..8425c3d70895 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3697,11 +3697,11 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str) return false; event = container_of(trace_event, struct trace_event_call, event); - if (!event->mod) + if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module) return false; /* Would rather have rodata, but this will suffice */ - if (within_module_core(addr, event->mod)) + if (within_module_core(addr, event->module)) return true; return false; diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index e57cc0870892..1110112e55bd 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -13,11 +13,49 @@ #include #include "trace.h" +#include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" static DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); +bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) +{ + struct trace_event_call *call; + bool ret = false; + + if (WARN_ON_ONCE(!(dyn_call->flags & TRACE_EVENT_FL_DYNAMIC))) + return false; + + down_read(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { + if (call == dyn_call) { + atomic_inc(&dyn_call->refcnt); + ret = true; + } + } + up_read(&trace_event_sem); + return ret; +} + +void trace_event_dyn_put_ref(struct trace_event_call *call) +{ + if (WARN_ON_ONCE(!(call->flags & TRACE_EVENT_FL_DYNAMIC))) + return; + + if (WARN_ON_ONCE(atomic_read(&call->refcnt) <= 0)) { + atomic_set(&call->refcnt, 0); + return; + } + + atomic_dec(&call->refcnt); +} + +bool trace_event_dyn_busy(struct trace_event_call *call) +{ + return atomic_read(&call->refcnt) != 0; +} + int dyn_event_register(struct dyn_event_operations *ops) { if (!ops || !ops->create || !ops->show || !ops->is_busy || diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 03be4435d103..6aed10e2f7ce 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -177,7 +177,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event) } } out: - module_put(tp_event->mod); + trace_event_put_ref(tp_event); } static int perf_trace_event_open(struct perf_event *p_event) @@ -224,10 +224,10 @@ int perf_trace_init(struct perf_event *p_event) list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && tp_event->class && tp_event->class->reg && - try_module_get(tp_event->mod)) { + trace_event_try_get_ref(tp_event)) { ret = perf_trace_event_init(tp_event, p_event); if (ret) - module_put(tp_event->mod); + trace_event_put_ref(tp_event); break; } } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 80e96989770e..1349b6de5eeb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2525,7 +2525,10 @@ __register_event(struct trace_event_call *call, struct module *mod) return ret; list_add(&call->list, &ftrace_events); - call->mod = mod; + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + atomic_set(&call->refcnt, 0); + else + call->module = mod; return 0; } @@ -2839,7 +2842,9 @@ static void trace_module_remove_events(struct module *mod) down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { - if (call->mod == mod) + if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module) + continue; + if (call->module == mod) __trace_remove_event_call(call); } up_write(&trace_event_sem); @@ -2982,7 +2987,7 @@ struct trace_event_file *trace_get_event_file(const char *instance, } /* Don't let event modules unload while in use */ - ret = try_module_get(file->event_call->mod); + ret = trace_event_try_get_ref(file->event_call); if (!ret) { trace_array_put(tr); ret = -EBUSY; @@ -3012,7 +3017,7 @@ EXPORT_SYMBOL_GPL(trace_get_event_file); void trace_put_event_file(struct trace_event_file *file) { mutex_lock(&event_mutex); - module_put(file->event_call->mod); + trace_event_put_ref(file->event_call); mutex_unlock(&event_mutex); trace_array_put(file->tr); @@ -3147,7 +3152,7 @@ static int free_probe_data(void *data) if (!edata->ref) { /* Remove the SOFT_MODE flag */ __ftrace_event_enable_disable(edata->file, 0, 1); - module_put(edata->file->event_call->mod); + trace_event_put_ref(edata->file->event_call); kfree(edata); } return 0; @@ -3280,7 +3285,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, out_reg: /* Don't let event modules unload while probe registered */ - ret = try_module_get(file->event_call->mod); + ret = trace_event_try_get_ref(file->event_call); if (!ret) { ret = -EBUSY; goto out_free; @@ -3310,7 +3315,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, out_disable: __ftrace_event_enable_disable(file, 0, 1); out_put: - module_put(file->event_call->mod); + trace_event_put_ref(file->event_call); out_free: kfree(data); goto out; @@ -3376,7 +3381,8 @@ void __trace_early_add_events(struct trace_array *tr) list_for_each_entry(call, &ftrace_events, list) { /* Early boot up should not have any modules loaded */ - if (WARN_ON_ONCE(call->mod)) + if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) && + WARN_ON_ONCE(call->module)) continue; ret = __trace_early_add_new_event(call, tr); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index f4f5489e1e28..d54094b7a9d7 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1369,13 +1369,15 @@ static int destroy_synth_event(struct synth_event *se) int ret; if (se->ref) - ret = -EBUSY; - else { - ret = unregister_synth_event(se); - if (!ret) { - dyn_event_remove(&se->devent); - free_synth_event(se); - } + return -EBUSY; + + if (trace_event_dyn_busy(&se->call)) + return -EBUSY; + + ret = unregister_synth_event(se); + if (!ret) { + dyn_event_remove(&se->devent); + free_synth_event(se); } return ret; @@ -2102,6 +2104,9 @@ static int synth_event_release(struct dyn_event *ev) if (event->ref) return -EBUSY; + if (trace_event_dyn_busy(&event->call)) + return -EBUSY; + ret = unregister_synth_event(event); if (ret) return ret; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cf84d0f6583a..6b11e335a62e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1334,7 +1334,7 @@ void event_enable_trigger_free(struct event_trigger_ops *ops, if (!data->ref) { /* Remove the SOFT_MODE flag */ trace_event_enable_disable(enable_data->file, 0, 1); - module_put(enable_data->file->event_call->mod); + trace_event_put_ref(enable_data->file->event_call); trigger_data_free(data); kfree(enable_data); } @@ -1481,7 +1481,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, out_reg: /* Don't let event modules unload while probe registered */ - ret = try_module_get(event_enable_file->event_call->mod); + ret = trace_event_try_get_ref(event_enable_file->event_call); if (!ret) { ret = -EBUSY; goto out_free; @@ -1510,7 +1510,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, out_disable: trace_event_enable_disable(event_enable_file, 0, 1); out_put: - module_put(event_enable_file->event_call->mod); + trace_event_put_ref(event_enable_file->event_call); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bfef43bfce37..82c3b86013b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -543,6 +543,10 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk) if (trace_probe_is_enabled(&tk->tp)) return -EBUSY; + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tk->tp))) + return -EBUSY; + /* Will fail if probe is being used by ftrace or perf */ if (unregister_kprobe_event(tk)) return -EBUSY; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 50eca53b8d22..1e2a92e7607d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -393,6 +393,10 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) if (trace_probe_has_sibling(&tu->tp)) goto unreg; + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tu->tp))) + return -EBUSY; + ret = unregister_uprobe_event(tu); if (ret) return ret; -- cgit v1.2.3 From fcd9db51df8e219e3a61b14e9b8c5ee67d39d37c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:58 -0400 Subject: tracing/probe: Have traceprobe_parse_probe_arg() take a const arg The two places that call traceprobe_parse_probe_arg() allocate a temporary buffer to copy the argv[i] into, because argv[i] is constant and the traceprobe_parse_probe_arg() will modify it to do the parsing. These two places allocate this buffer and then free it right after calling this function, leaving the onus of this allocation to the caller. As there's about to be a third user of this function that will have to do the same thing, instead of having the caller allocate the temporary buffer, simply move that allocation into the traceprobe_parse_probe_arg() itself, which will simplify the code of the callers. Link: https://lkml.kernel.org/r/20210817035027.385422828@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +-------- kernel/trace/trace_probe.c | 47 +++++++++++++++++++++++++++------------------ kernel/trace/trace_probe.h | 2 +- kernel/trace/trace_uprobe.c | 9 +-------- 4 files changed, 31 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 82c3b86013b2..ed1e3c2087ab 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -873,15 +873,8 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - tmp = kstrdup(argv[i], GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto error; - } - trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); - kfree(tmp); + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); if (ret) goto error; /* This can be -ENOMEM */ } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 15413ad7cef2..ef717b373443 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -540,26 +540,34 @@ static int __parse_bitfield_probe_arg(const char *bf, } /* String length checking wrapper */ -static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, +static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, struct probe_arg *parg, unsigned int flags, int offset) { struct fetch_insn *code, *scode, *tmp = NULL; char *t, *t2, *t3; + char *arg; int ret, len; + arg = kstrdup(argv, GFP_KERNEL); + if (!arg) + return -ENOMEM; + + ret = -EINVAL; len = strlen(arg); if (len > MAX_ARGSTR_LEN) { trace_probe_log_err(offset, ARG_TOO_LONG); - return -EINVAL; + goto out; } else if (len == 0) { trace_probe_log_err(offset, NO_ARG_BODY); - return -EINVAL; + goto out; } + ret = -ENOMEM; parg->comm = kstrdup(arg, GFP_KERNEL); if (!parg->comm) - return -ENOMEM; + goto out; + ret = -EINVAL; t = strchr(arg, ':'); if (t) { *t = '\0'; @@ -571,22 +579,22 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, offset += t2 + strlen(t2) - arg; trace_probe_log_err(offset, ARRAY_NO_CLOSE); - return -EINVAL; + goto out; } else if (t3[1] != '\0') { trace_probe_log_err(offset + t3 + 1 - arg, BAD_ARRAY_SUFFIX); - return -EINVAL; + goto out; } *t3 = '\0'; if (kstrtouint(t2, 0, &parg->count) || !parg->count) { trace_probe_log_err(offset + t2 - arg, BAD_ARRAY_NUM); - return -EINVAL; + goto out; } if (parg->count > MAX_ARRAY_LEN) { trace_probe_log_err(offset + t2 - arg, ARRAY_TOO_BIG); - return -EINVAL; + goto out; } } } @@ -598,29 +606,30 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) - return -EINVAL; + goto out; parg->type = find_fetch_type("string"); } else parg->type = find_fetch_type(t); if (!parg->type) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); - return -EINVAL; + goto out; } parg->offset = *size; *size += parg->type->size * (parg->count ?: 1); + ret = -ENOMEM; if (parg->count) { len = strlen(parg->type->fmttype) + 6; parg->fmt = kmalloc(len, GFP_KERNEL); if (!parg->fmt) - return -ENOMEM; + goto out; snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, parg->count); } code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) - return -ENOMEM; + goto out; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], @@ -628,6 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (ret) goto fail; + ret = -EINVAL; /* Store operation */ if (!strcmp(parg->type->name, "string") || !strcmp(parg->type->name, "ustring")) { @@ -636,7 +646,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code->op != FETCH_OP_DATA) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); - ret = -EINVAL; goto fail; } if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || @@ -650,7 +659,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } } @@ -672,7 +680,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } code->op = FETCH_OP_ST_RAW; @@ -687,6 +694,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, goto fail; } } + ret = -EINVAL; /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && @@ -694,13 +702,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, scode->op != FETCH_OP_ST_USTRING) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); - ret = -EINVAL; goto fail; } code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -709,6 +715,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; code->op = FETCH_OP_END; + ret = 0; /* Shrink down the code buffer */ parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); if (!parg->code) @@ -724,6 +731,8 @@ fail: kfree(code->data); } kfree(tmp); +out: + kfree(arg); return ret; } @@ -745,11 +754,11 @@ static int traceprobe_conflict_field_name(const char *name, return 0; } -int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, +int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, unsigned int flags) { struct probe_arg *parg = &tp->args[i]; - char *body; + const char *body; /* Increment count for freeing args in error case */ tp->nr_args++; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 227d518e5ba5..42aa084902fa 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -354,7 +354,7 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char #define TPARG_FL_MASK GENMASK(2, 0) extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, - char *arg, unsigned int flags); + const char *argv, unsigned int flags); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 1e2a92e7607d..93ff96541971 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -684,16 +684,9 @@ static int __trace_uprobe_create(int argc, const char **argv) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - tmp = kstrdup(argv[i], GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto error; - } - trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], is_return ? TPARG_FL_RETURN : 0); - kfree(tmp); if (ret) goto error; } -- cgit v1.2.3 From bc1b973455fd5d84dac4a094da44202f2d8a98ef Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:59 -0400 Subject: tracing/probes: Allow for dot delimiter as well as slash for system names Kprobe and uprobe events can add a "system" to the events that are created via the kprobe_events and uprobe_events files respectively. If they do not include a "system" in the name, then the default "kprobes" or "uprobes" is used. The current notation to specify a system for one of these probe events is to add a '/' delimiter in the name, where the content before the '/' will be the system to use, and the content after will be the event name. echo 'p:my_system/my_event' > kprobe_events But this is inconsistent with the way histogram triggers separate their system / event names. The histogram triggers use a '.' delimiter, which can be confusing. To allow this to be more consistent, as well as keep backward compatibility, allow the kprobe and uprobe events to denote a system name with either a '/' or a '.'. That is: echo 'p:my_system/my_event' > kprobe_events is equivalent to: echo 'p:my_system.my_event' > kprobe_events Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035027.580493202@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ef717b373443..0916a9964719 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -233,6 +233,9 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, int len; slash = strchr(event, '/'); + if (!slash) + slash = strchr(event, '.'); + if (slash) { if (slash == event) { trace_probe_log_err(offset, NO_GROUP_NAME); -- cgit v1.2.3 From 845cbf3e11acc263ec7a46a89097d88e7e50a9ae Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:43:00 -0400 Subject: tracing/probes: Use struct_size() instead of defining custom macros Remove SIZEOF_TRACE_KPROBE() and SIZEOF_TRACE_UPROBE() and use struct_size() as that's what it is made for. No need to have custom macros. Especially since struct_size() has some extra memory checks for correctness. Link: https://lkml.kernel.org/r/20210817035027.795000217@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 +----- kernel/trace/trace_uprobe.c | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ed1e3c2087ab..ca726c9d0859 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -80,10 +80,6 @@ static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev) for_each_dyn_event(dpos) \ if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos))) -#define SIZEOF_TRACE_KPROBE(n) \ - (offsetof(struct trace_kprobe, tp.args) + \ - (sizeof(struct probe_arg) * (n))) - static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; @@ -265,7 +261,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, struct trace_kprobe *tk; int ret = -ENOMEM; - tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL); + tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL); if (!tk) return ERR_PTR(ret); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 93ff96541971..590bb9a02f8d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -83,10 +83,6 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) for_each_dyn_event(dpos) \ if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos))) -#define SIZEOF_TRACE_UPROBE(n) \ - (offsetof(struct trace_uprobe, tp.args) + \ - (sizeof(struct probe_arg) * (n))) - static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); @@ -340,7 +336,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) struct trace_uprobe *tu; int ret; - tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); + tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL); if (!tu) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 39f75da7bcc829ddc4d40bb60d0e95520de7898b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 2 Aug 2021 23:40:31 +0300 Subject: isystem: trim/fixup stdarg.h and other headers Delete/fixup few includes in anticipation of global -isystem compile option removal. Note: crypto/aegis128-neon-inner.c keeps due to redefinition of uintptr_t error (one definition comes from , another from ). Signed-off-by: Alexey Dobriyan Signed-off-by: Masahiro Yamada --- kernel/debug/kdb/kdb_support.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 9f50d22d68e6..4f9950678e7b 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -10,7 +10,6 @@ * 03/02/13 added new 2.5 kallsyms */ -#include #include #include #include -- cgit v1.2.3 From 22f9feb49950885cdb6e37513f134d154175e743 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 19:37:00 +0200 Subject: dma-mapping: make the global coherent pool conditional Only build the code to support the global coherent pool if support for it is enabled. Signed-off-by: Christoph Hellwig Tested-by: Dillon Min --- kernel/dma/coherent.c | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 160d4e246ecb..25fc85a7aebe 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -20,8 +20,6 @@ struct dma_coherent_mem { bool use_dev_dma_pfn_offset; }; -static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init; - static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev) { if (dev && dev->dma_mem) @@ -191,16 +189,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, return 1; } -void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle) -{ - if (!dma_coherent_default_memory) - return NULL; - - return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size, - dma_handle); -} - static int __dma_release_from_coherent(struct dma_coherent_mem *mem, int order, void *vaddr) { @@ -236,15 +224,6 @@ int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr) return __dma_release_from_coherent(mem, order, vaddr); } -int dma_release_from_global_coherent(int order, void *vaddr) -{ - if (!dma_coherent_default_memory) - return 0; - - return __dma_release_from_coherent(dma_coherent_default_memory, order, - vaddr); -} - static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem, struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) { @@ -290,6 +269,28 @@ int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret); } +#ifdef CONFIG_DMA_GLOBAL_POOL +static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init; + +void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle) +{ + if (!dma_coherent_default_memory) + return NULL; + + return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size, + dma_handle); +} + +int dma_release_from_global_coherent(int order, void *vaddr) +{ + if (!dma_coherent_default_memory) + return 0; + + return __dma_release_from_coherent(dma_coherent_default_memory, order, + vaddr); +} + int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) { @@ -311,6 +312,7 @@ int dma_init_global_coherent(phys_addr_t phys_addr, size_t size) pr_info("DMA: default coherent area is set\n"); return 0; } +#endif /* CONFIG_DMA_GLOBAL_POOL */ /* * Support for reserved memory regions defined in device tree @@ -320,7 +322,9 @@ int dma_init_global_coherent(phys_addr_t phys_addr, size_t size) #include #include +#ifdef CONFIG_DMA_GLOBAL_POOL static struct reserved_mem *dma_reserved_default_memory __initdata; +#endif static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) { @@ -377,6 +381,7 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem) return 0; } +#ifdef CONFIG_DMA_GLOBAL_POOL static int __init dma_init_reserved_memory(void) { if (!dma_reserved_default_memory) @@ -384,8 +389,8 @@ static int __init dma_init_reserved_memory(void) return dma_init_global_coherent(dma_reserved_default_memory->base, dma_reserved_default_memory->size); } - core_initcall(dma_init_reserved_memory); +#endif /* CONFIG_DMA_GLOBAL_POOL */ RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup); #endif -- cgit v1.2.3 From 1daf08a066cfe500587affd3fa3be8c13b8ff007 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:09 +0200 Subject: livepatch: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Joe Lawrence Cc: live-patching@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/transition.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 3a4beb9395c4..291b857a6e20 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -411,7 +411,7 @@ void klp_try_complete_transition(void) /* * Ditto for the idle "swapper" tasks. */ - get_online_cpus(); + cpus_read_lock(); for_each_possible_cpu(cpu) { task = idle_task(cpu); if (cpu_online(cpu)) { @@ -423,7 +423,7 @@ void klp_try_complete_transition(void) task->patch_state = klp_target_state; } } - put_online_cpus(); + cpus_read_unlock(); if (!complete) { if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT)) -- cgit v1.2.3 From 007517a01995fb24f2f4effc9cf34814361a9d10 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Aug 2021 00:13:27 -0400 Subject: tracing/probe: Change traceprobe_set_print_fmt() to take a type Instead of a boolean "is_return" have traceprobe_set_print_fmt() take a type (currently just PROBE_PRINT_NORMAL and PROBE_PRINT_RETURN). This will simplify adding different types. For example, the development of the event_probe, will need its own type as it prints an event, and not an IP. Link: https://lkml.kernel.org/r/20210819041842.104626301@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +++++++-- kernel/trace/trace_probe.c | 18 ++++++++++++------ kernel/trace/trace_probe.h | 7 ++++++- kernel/trace/trace_uprobe.c | 8 ++++++-- 4 files changed, 31 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ca726c9d0859..c6fe7a6e3f35 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -742,6 +742,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) bool is_return = false; char *symbol = NULL, *tmp = NULL; const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + enum probe_print_type ptype; int maxactive = 0; long offset = 0; void *addr = NULL; @@ -875,7 +876,8 @@ static int __trace_kprobe_create(int argc, const char *argv[]) goto error; /* This can be -ENOMEM */ } - ret = traceprobe_set_print_fmt(&tk->tp, is_return); + ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + ret = traceprobe_set_print_fmt(&tk->tp, ptype); if (ret < 0) goto error; @@ -1799,6 +1801,7 @@ struct trace_event_call * create_local_trace_kprobe(char *func, void *addr, unsigned long offs, bool is_return) { + enum probe_print_type ptype; struct trace_kprobe *tk; int ret; char *event; @@ -1822,7 +1825,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, init_trace_event_call(tk); - if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + ptype = trace_kprobe_is_return(tk) ? + PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) { ret = -ENOMEM; goto error; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 0916a9964719..9c9c83a063b2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -851,19 +851,25 @@ int traceprobe_update_arg(struct probe_arg *arg) /* When len=0, we just calculate the needed length */ #define LEN_OR_ZERO (len ? len - pos : 0) static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, - bool is_return) + enum probe_print_type ptype) { struct probe_arg *parg; int i, j; int pos = 0; const char *fmt, *arg; - if (!is_return) { + switch (ptype) { + case PROBE_PRINT_NORMAL: fmt = "(%lx)"; arg = "REC->" FIELD_STRING_IP; - } else { + break; + case PROBE_PRINT_RETURN: fmt = "(%lx <- %lx)"; arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + break; + default: + WARN_ON_ONCE(1); + return 0; } pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); @@ -912,20 +918,20 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, } #undef LEN_OR_ZERO -int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) +int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype) { struct trace_event_call *call = trace_probe_event_call(tp); int len; char *print_fmt; /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tp, NULL, 0, is_return); + len = __set_print_fmt(tp, NULL, 0, ptype); print_fmt = kmalloc(len + 1, GFP_KERNEL); if (!print_fmt) return -ENOMEM; /* Second: actually write the @print_fmt */ - __set_print_fmt(tp, print_fmt, len + 1, is_return); + __set_print_fmt(tp, print_fmt, len + 1, ptype); call->print_fmt = print_fmt; return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 42aa084902fa..8adf5f3542a6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -363,7 +363,12 @@ extern int traceprobe_split_symbol_offset(char *symbol, long *offset); int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf, int offset); -extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); +enum probe_print_type { + PROBE_PRINT_NORMAL, + PROBE_PRINT_RETURN, +}; + +extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype); #ifdef CONFIG_PERF_EVENTS extern struct trace_event_call * diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 590bb9a02f8d..09f8ca7f7ba0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -536,6 +536,7 @@ static int __trace_uprobe_create(int argc, const char **argv) const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; char buf[MAX_EVENT_NAME_LEN]; + enum probe_print_type ptype; struct path path; unsigned long offset, ref_ctr_offset; bool is_return = false; @@ -687,7 +688,8 @@ static int __trace_uprobe_create(int argc, const char **argv) goto error; } - ret = traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)); + ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + ret = traceprobe_set_print_fmt(&tu->tp, ptype); if (ret < 0) goto error; @@ -1578,6 +1580,7 @@ struct trace_event_call * create_local_trace_uprobe(char *name, unsigned long offs, unsigned long ref_ctr_offset, bool is_return) { + enum probe_print_type ptype; struct trace_uprobe *tu; struct path path; int ret; @@ -1612,7 +1615,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu); - if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + if (traceprobe_set_print_fmt(&tu->tp, ptype) < 0) { ret = -ENOMEM; goto error; } -- cgit v1.2.3 From 8565a45d0858078b63c7d84074a21a42ba9ebf01 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Aug 2021 00:13:28 -0400 Subject: tracing/probes: Have process_fetch_insn() take a void * instead of pt_regs In preparation to allow event probes to use the process_fetch_insn() callback in trace_probe_tmpl.h, change the data passed to it from a pointer to pt_regs, as the event probe will not be using regs, and make it a void pointer instead. Update the process_fetch_insn() callers for kprobe and uprobe events to have the regs defined in the function and just typecast the void pointer parameter. Link: https://lkml.kernel.org/r/20210819041842.291622924@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 3 ++- kernel/trace/trace_probe_tmpl.h | 6 +++--- kernel/trace/trace_uprobe.c | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c6fe7a6e3f35..4b013d24f5a9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1325,9 +1325,10 @@ probe_mem_read(void *dest, void *src, size_t size) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base) { + struct pt_regs *regs = rec; unsigned long val; retry: diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index f003c5d02a3a..b3bdb8ddb862 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf) * If dest is NULL, don't store result and return required dynamic data size. */ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base); static nokprobe_inline int fetch_store_strlen(unsigned long addr); static nokprobe_inline int @@ -188,7 +188,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) /* Store the value of each argument */ static nokprobe_inline void -store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, +store_trace_args(void *data, struct trace_probe *tp, void *rec, int header_size, int maxlen) { struct probe_arg *arg; @@ -203,7 +203,7 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, /* Point the dynamic data area if needed */ if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); - ret = process_fetch_insn(arg->code, regs, dl, base); + ret = process_fetch_insn(arg->code, rec, dl, base); if (unlikely(ret < 0 && arg->dynamic)) { *dl = make_data_loc(0, dyndata - base); } else { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 09f8ca7f7ba0..d219ba50efbd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -213,9 +213,10 @@ static unsigned long translate_user_vaddr(unsigned long file_offset) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base) { + struct pt_regs *regs = rec; unsigned long val; /* 1st stage: get value from context */ -- cgit v1.2.3 From 8e242060c6a4947e8ae7d29794af6a581db08841 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 19 Aug 2021 19:26:02 +0900 Subject: tracing/probes: Reject events which have the same name of existing one Since kprobe_events and uprobe_events only check whether the other same-type probe event has the same name or not, if the user gives the same name of the existing tracepoint event (or the other type of probe events), it silently fails to create the tracefs entry (but registered.) as below. /sys/kernel/tracing # ls events/task/task_rename enable filter format hist id trigger /sys/kernel/tracing # echo p:task/task_rename vfs_read >> kprobe_events [ 113.048508] Could not create tracefs 'task_rename' directory /sys/kernel/tracing # cat kprobe_events p:task/task_rename vfs_read To fix this issue, check whether the existing events have the same name or not in trace_probe_register_event_call(). If exists, it rejects to register the new event. Link: https://lkml.kernel.org/r/162936876189.187130.17558311387542061930.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 +++++- kernel/trace/trace_probe.c | 25 +++++++++++++++++++++++++ kernel/trace/trace_probe.h | 1 + kernel/trace/trace_uprobe.c | 6 +++++- 4 files changed, 36 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 4b013d24f5a9..882c27044029 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -647,7 +647,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warn("Failed to register probe event(%d)\n", ret); + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9c9c83a063b2..782c00eb6859 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1047,11 +1047,36 @@ error: return ret; } +static struct trace_event_call * +find_trace_event_call(const char *system, const char *event_name) +{ + struct trace_event_call *tp_event; + const char *name; + + list_for_each_entry(tp_event, &ftrace_events, list) { + if (!tp_event->class->system || + strcmp(system, tp_event->class->system)) + continue; + name = trace_event_name(tp_event); + if (!name || strcmp(event_name, name)) + continue; + return tp_event; + } + + return NULL; +} + int trace_probe_register_event_call(struct trace_probe *tp) { struct trace_event_call *call = trace_probe_event_call(tp); int ret; + lockdep_assert_held(&event_mutex); + + if (find_trace_event_call(trace_probe_group_name(tp), + trace_probe_name(tp))) + return -EEXIST; + ret = register_trace_event(&call->event); if (!ret) return -ENODEV; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8adf5f3542a6..66701a92d186 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -404,6 +404,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_EVENT_NAME, "Event name is not specified"), \ C(EVENT_TOO_LONG, "Event name is too long"), \ C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ + C(EVENT_EXIST, "Given group/event name is already used by another event"), \ C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ C(BAD_STACK_NUM, "Invalid stack number"), \ C(BAD_ARG_NUM, "Invalid argument number"), \ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d219ba50efbd..225ce569bf8f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -515,7 +515,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warn("Failed to register probe event(%d)\n", ret); + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } -- cgit v1.2.3 From f9dabe016b63c9629e152bf876c126c29de223cb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 19 Aug 2021 15:59:33 +0200 Subject: bpf: Undo off-by-one in interpreter tail call count limit The BPF interpreter as well as x86-64 BPF JIT were both in line by allowing up to 33 tail calls (however odd that number may be!). Recently, this was changed for the interpreter to reduce it down to 32 with the assumption that this should have been the actual limit "which is in line with the behavior of the x86 JITs" according to b61a28cf11d61 ("bpf: Fix off-by-one in tail call count limiting"). Paul recently reported: I'm a bit surprised by this because I had previously tested the tail call limit of several JIT compilers and found it to be 33 (i.e., allowing chains of up to 34 programs). I've just extended a test program I had to validate this again on the x86-64 JIT, and found a limit of 33 tail calls again [1]. Also note we had previously changed the RISC-V and MIPS JITs to allow up to 33 tail calls [2, 3], for consistency with other JITs and with the interpreter. We had decided to increase these two to 33 rather than decrease the other JITs to 32 for backward compatibility, though that probably doesn't matter much as I'd expect few people to actually use 33 tail calls. [1] https://github.com/pchaigno/tail-call-bench/commit/ae7887482985b4b1745c9b2ef7ff9ae506c82886 [2] 96bc4432f5ad ("bpf, riscv: Limit to 33 tail calls") [3] e49e6f6db04e ("bpf, mips: Limit to 33 tail calls") Therefore, revert b61a28cf11d61 to re-align interpreter to limit a maximum of 33 tail calls. While it is unlikely to hit the limit for the vast majority, programs in the wild could one way or another depend on this, so lets rather be a bit more conservative, and lets align the small remainder of JITs to 33. If needed in future, this limit could be slightly increased, but not decreased. Fixes: b61a28cf11d61 ("bpf: Fix off-by-one in tail call count limiting") Reported-by: Paul Chaignon Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Johan Almbladh Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/CAO5pjwTWrC0_dzTbTHFPSqDwA56aVH+4KFGVqdq8=ASs0MqZGQ@mail.gmail.com --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 91f24c7b38a1..9f4636d021b1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1564,7 +1564,7 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) + if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; -- cgit v1.2.3 From 594286b7574c6e8217b1c233cc0d0650f2268a77 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 19 Aug 2021 08:52:09 -0700 Subject: bpf: Fix NULL event->prog pointer access in bpf_overflow_handler Andrii reported that libbpf CI hit the following oops when running selftest send_signal: [ 1243.160719] BUG: kernel NULL pointer dereference, address: 0000000000000030 [ 1243.161066] #PF: supervisor read access in kernel mode [ 1243.161066] #PF: error_code(0x0000) - not-present page [ 1243.161066] PGD 0 P4D 0 [ 1243.161066] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 1243.161066] CPU: 1 PID: 882 Comm: new_name Tainted: G O 5.14.0-rc5 #1 [ 1243.161066] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 1243.161066] RIP: 0010:bpf_overflow_handler+0x9a/0x1e0 [ 1243.161066] Code: 5a 84 c0 0f 84 06 01 00 00 be 66 02 00 00 48 c7 c7 6d 96 07 82 48 8b ab 18 05 00 00 e8 df 55 eb ff 66 90 48 8d 75 48 48 89 e7 55 30 41 89 c4 e8 fb c1 f0 ff 84 c0 0f 84 94 00 00 00 e8 6e 0f [ 1243.161066] RSP: 0018:ffffc900000c0d80 EFLAGS: 00000046 [ 1243.161066] RAX: 0000000000000002 RBX: ffff8881002e0dd0 RCX: 00000000b4b47cf8 [ 1243.161066] RDX: ffffffff811dcb06 RSI: 0000000000000048 RDI: ffffc900000c0d80 [ 1243.161066] RBP: 0000000000000000 R08: 0000000000000000 R09: 1a9d56bb00000000 [ 1243.161066] R10: 0000000000000001 R11: 0000000000080000 R12: 0000000000000000 [ 1243.161066] R13: ffffc900000c0e00 R14: ffffc900001c3c68 R15: 0000000000000082 [ 1243.161066] FS: 00007fc0be2d3380(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 [ 1243.161066] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1243.161066] CR2: 0000000000000030 CR3: 0000000104f8e000 CR4: 00000000000006e0 [ 1243.161066] Call Trace: [ 1243.161066] [ 1243.161066] __perf_event_overflow+0x4f/0xf0 [ 1243.161066] perf_swevent_hrtimer+0x116/0x130 [ 1243.161066] ? __lock_acquire+0x378/0x2730 [ 1243.161066] ? __lock_acquire+0x372/0x2730 [ 1243.161066] ? lock_is_held_type+0xd5/0x130 [ 1243.161066] ? find_held_lock+0x2b/0x80 [ 1243.161066] ? lock_is_held_type+0xd5/0x130 [ 1243.161066] ? perf_event_groups_first+0x80/0x80 [ 1243.161066] ? perf_event_groups_first+0x80/0x80 [ 1243.161066] __hrtimer_run_queues+0x1a3/0x460 [ 1243.161066] hrtimer_interrupt+0x110/0x220 [ 1243.161066] __sysvec_apic_timer_interrupt+0x8a/0x260 [ 1243.161066] sysvec_apic_timer_interrupt+0x89/0xc0 [ 1243.161066] [ 1243.161066] asm_sysvec_apic_timer_interrupt+0x12/0x20 [ 1243.161066] RIP: 0010:finish_task_switch+0xaf/0x250 [ 1243.161066] Code: 31 f6 68 90 2a 09 81 49 8d 7c 24 18 e8 aa d6 03 00 4c 89 e7 e8 12 ff ff ff 4c 89 e7 e8 ca 9c 80 00 e8 35 af 0d 00 fb 4d 85 f6 <58> 74 1d 65 48 8b 04 25 c0 6d 01 00 4c 3b b0 a0 04 00 00 74 37 f0 [ 1243.161066] RSP: 0018:ffffc900001c3d18 EFLAGS: 00000282 [ 1243.161066] RAX: 000000000000031f RBX: ffff888104cf4980 RCX: 0000000000000000 [ 1243.161066] RDX: 0000000000000000 RSI: ffffffff82095460 RDI: ffffffff820adc4e [ 1243.161066] RBP: ffffc900001c3d58 R08: 0000000000000001 R09: 0000000000000001 [ 1243.161066] R10: 0000000000000001 R11: 0000000000080000 R12: ffff88813bd2bc80 [ 1243.161066] R13: ffff8881002e8000 R14: ffff88810022ad80 R15: 0000000000000000 [ 1243.161066] ? finish_task_switch+0xab/0x250 [ 1243.161066] ? finish_task_switch+0x70/0x250 [ 1243.161066] __schedule+0x36b/0xbb0 [ 1243.161066] ? _raw_spin_unlock_irqrestore+0x2d/0x50 [ 1243.161066] ? lockdep_hardirqs_on+0x79/0x100 [ 1243.161066] schedule+0x43/0xe0 [ 1243.161066] pipe_read+0x30b/0x450 [ 1243.161066] ? wait_woken+0x80/0x80 [ 1243.161066] new_sync_read+0x164/0x170 [ 1243.161066] vfs_read+0x122/0x1b0 [ 1243.161066] ksys_read+0x93/0xd0 [ 1243.161066] do_syscall_64+0x35/0x80 [ 1243.161066] entry_SYSCALL_64_after_hwframe+0x44/0xae The oops can also be reproduced with the following steps: ./vmtest.sh -s # at qemu shell cd /root/bpf && while true; do ./test_progs -t send_signal Further analysis showed that the failure is introduced with commit b89fbfbb854c ("bpf: Implement minimal BPF perf link"). With the above commit, the following scenario becomes possible: cpu1 cpu2 hrtimer_interrupt -> bpf_overflow_handler (due to closing link_fd) bpf_perf_link_release -> perf_event_free_bpf_prog -> perf_event_free_bpf_handler -> WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler) event->prog = NULL bpf_prog_run(event->prog, &ctx) In the above case, the event->prog is NULL for bpf_prog_run, hence causing oops. To fix the issue, check whether event->prog is NULL or not. If it is, do not call bpf_prog_run. This seems working as the above reproducible step runs more than one hour and I didn't see any failures. Fixes: b89fbfbb854c ("bpf: Implement minimal BPF perf link") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210819155209.1927994-1-yhs@fb.com --- kernel/events/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d1e63dd97f2..011cc5069b7b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9920,13 +9920,16 @@ static void bpf_overflow_handler(struct perf_event *event, .data = data, .event = event, }; + struct bpf_prog *prog; int ret = 0; ctx.regs = perf_arch_bpf_user_pt_regs(regs); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); - ret = bpf_prog_run(event->prog, &ctx); + prog = READ_ONCE(event->prog); + if (prog) + ret = bpf_prog_run(prog, &ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); -- cgit v1.2.3 From f0dce1d9b7c81fc3dc9d0cc0bc7ef9b3eae22584 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 18 Aug 2021 16:52:15 -0700 Subject: bpf: Use kvmalloc for map values in syscall Use kvmalloc/kvfree for temporary value when manipulating a map via syscall. kmalloc might not be sufficient for percpu maps where the value is big (and further multiplied by hundreds of CPUs). Can be reproduced with netcnt test on qemu with "-smp 255". Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210818235216.1159202-1-sdf@google.com --- kernel/bpf/syscall.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7420e1334ab2..075f650d297a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1076,7 +1076,7 @@ static int map_lookup_elem(union bpf_attr *attr) value_size = bpf_map_value_size(map); err = -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -1091,7 +1091,7 @@ static int map_lookup_elem(union bpf_attr *attr) err = 0; free_value: - kfree(value); + kvfree(value); free_key: kfree(key); err_put: @@ -1137,16 +1137,10 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) - value_size = round_up(map->value_size, 8) * num_possible_cpus(); - else - value_size = map->value_size; + value_size = bpf_map_value_size(map); err = -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -1157,7 +1151,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) err = bpf_map_update_value(map, f, key, value, attr->flags); free_value: - kfree(value); + kvfree(value); free_key: kfree(key); err_put: @@ -1367,7 +1361,7 @@ int generic_map_update_batch(struct bpf_map *map, if (!key) return -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) { kfree(key); return -ENOMEM; @@ -1390,7 +1384,7 @@ int generic_map_update_batch(struct bpf_map *map, if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; - kfree(value); + kvfree(value); kfree(key); return err; } @@ -1429,7 +1423,7 @@ int generic_map_lookup_batch(struct bpf_map *map, if (!buf_prevkey) return -ENOMEM; - buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); + buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { kfree(buf_prevkey); return -ENOMEM; @@ -1492,7 +1486,7 @@ int generic_map_lookup_batch(struct bpf_map *map, free_buf: kfree(buf_prevkey); - kfree(buf); + kvfree(buf); return err; } @@ -1547,7 +1541,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) value_size = bpf_map_value_size(map); err = -ENOMEM; - value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -1579,7 +1573,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) err = 0; free_value: - kfree(value); + kvfree(value); free_key: kfree(key); err_put: -- cgit v1.2.3 From 44779a4b85abd1d1dab9e5b90bd5e6adcfc8143a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 18 Aug 2021 16:52:16 -0700 Subject: bpf: Use kvmalloc for map keys in syscalls Same as previous patch but for the keys. memdup_bpfptr is renamed to kvmemdup_bpfptr (and converted to kvmalloc). Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210818235216.1159202-2-sdf@google.com --- kernel/bpf/syscall.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 075f650d297a..4e50c0bfdb7d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1013,7 +1013,7 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) static void *__bpf_copy_key(void __user *ukey, u64 key_size) { if (key_size) - return memdup_user(ukey, key_size); + return vmemdup_user(ukey, key_size); if (ukey) return ERR_PTR(-EINVAL); @@ -1024,7 +1024,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size) static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) { if (key_size) - return memdup_bpfptr(ukey, key_size); + return kvmemdup_bpfptr(ukey, key_size); if (!bpfptr_is_null(ukey)) return ERR_PTR(-EINVAL); @@ -1093,7 +1093,7 @@ static int map_lookup_elem(union bpf_attr *attr) free_value: kvfree(value); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1153,7 +1153,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) free_value: kvfree(value); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1205,7 +1205,7 @@ static int map_delete_elem(union bpf_attr *attr) bpf_enable_instrumentation(); maybe_wait_bpf_programs(map); out: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1247,7 +1247,7 @@ static int map_get_next_key(union bpf_attr *attr) } err = -ENOMEM; - next_key = kmalloc(map->key_size, GFP_USER); + next_key = kvmalloc(map->key_size, GFP_USER); if (!next_key) goto free_key; @@ -1270,9 +1270,9 @@ out: err = 0; free_next_key: - kfree(next_key); + kvfree(next_key); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; @@ -1299,7 +1299,7 @@ int generic_map_delete_batch(struct bpf_map *map, if (!max_count) return 0; - key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; @@ -1326,7 +1326,7 @@ int generic_map_delete_batch(struct bpf_map *map, if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; - kfree(key); + kvfree(key); return err; } @@ -1357,13 +1357,13 @@ int generic_map_update_batch(struct bpf_map *map, if (!max_count) return 0; - key = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) { - kfree(key); + kvfree(key); return -ENOMEM; } @@ -1385,7 +1385,7 @@ int generic_map_update_batch(struct bpf_map *map, err = -EFAULT; kvfree(value); - kfree(key); + kvfree(key); return err; } @@ -1419,13 +1419,13 @@ int generic_map_lookup_batch(struct bpf_map *map, if (put_user(0, &uattr->batch.count)) return -EFAULT; - buf_prevkey = kmalloc(map->key_size, GFP_USER | __GFP_NOWARN); + buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!buf_prevkey) return -ENOMEM; buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { - kfree(buf_prevkey); + kvfree(buf_prevkey); return -ENOMEM; } @@ -1485,7 +1485,7 @@ int generic_map_lookup_batch(struct bpf_map *map, err = -EFAULT; free_buf: - kfree(buf_prevkey); + kvfree(buf_prevkey); kvfree(buf); return err; } @@ -1575,7 +1575,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) free_value: kvfree(value); free_key: - kfree(key); + kvfree(key); err_put: fdput(f); return err; -- cgit v1.2.3 From 2c531639deb5e3ddfd6e8123b82052b2d9fbc6e5 Mon Sep 17 00:00:00 2001 From: Prankur Gupta Date: Tue, 17 Aug 2021 15:42:20 -0700 Subject: bpf: Add support for {set|get} socket options from setsockopt BPF Add logic to call bpf_setsockopt() and bpf_getsockopt() from setsockopt BPF programs. An example use case is when the user sets the IPV6_TCLASS socket option, we would also like to change the tcp-cc for that socket. We don't have any use case for calling bpf_setsockopt() from supposedly read- only sys_getsockopt(), so it is made available to BPF_CGROUP_SETSOCKOPT only at this point. Signed-off-by: Prankur Gupta Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210817224221.3257826-2-prankgup@fb.com --- kernel/bpf/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9f35928bab0a..8e9d99e2ade4 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1873,6 +1873,14 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) + return &bpf_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) + return &bpf_sk_getsockopt_proto; + return NULL; #endif #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: -- cgit v1.2.3 From 7491e2c442781a1860181adb5ab472a52075f393 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Thu, 19 Aug 2021 11:26:06 -0400 Subject: tracing: Add a probe that attaches to trace events A new dynamic event is introduced: event probe. The event is attached to an existing tracepoint and uses its fields as arguments. The user can specify custom format string of the new event, select what tracepoint arguments will be printed and how to print them. An event probe is created by writing configuration string in 'dynamic_events' ftrace file: e[:[SNAME/]ENAME] SYSTEM/EVENT [FETCHARGS] - Set an event probe -:SNAME/ENAME - Delete an event probe Where: SNAME - System name, if omitted 'eprobes' is used. ENAME - Name of the new event in SNAME, if omitted the SYSTEM_EVENT is used. SYSTEM - Name of the system, where the tracepoint is defined, mandatory. EVENT - Name of the tracepoint event in SYSTEM, mandatory. FETCHARGS - Arguments: =$[:TYPE] - Fetch given filed of the tracepoint and print it as given TYPE with given name. Supported types are: (u8/u16/u32/u64/s8/s16/s32/s64), basic type (x8/x16/x32/x64), hexadecimal types "string", "ustring" and bitfield. Example, attach an event probe on openat system call and print name of the file that will be opened: echo "e:esys/eopen syscalls/sys_enter_openat file=\$filename:string" >> dynamic_events A new dynamic event is created in events/esys/eopen/ directory. It can be deleted with: echo "-:esys/eopen" >> dynamic_events Filters, triggers and histograms can be attached to the new event, it can be matched in synthetic events. There is one limitation - an event probe can not be attached to kprobe, uprobe or another event probe. Link: https://lkml.kernel.org/r/20210812145805.2292326-1-tz.stoyanov@gmail.com Link: https://lkml.kernel.org/r/20210819152825.142428383@goodmis.org Acked-by: Masami Hiramatsu Co-developed-by: Steven Rostedt (VMware) Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Makefile | 1 + kernel/trace/trace.c | 5 +- kernel/trace/trace.h | 18 + kernel/trace/trace_eprobe.c | 903 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_trigger.c | 14 +- kernel/trace/trace_kprobe.c | 8 - kernel/trace/trace_probe.c | 16 +- kernel/trace/trace_probe.h | 6 +- 8 files changed, 958 insertions(+), 13 deletions(-) create mode 100644 kernel/trace/trace_eprobe.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b1c47ccf4f73..6de5d4d63165 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -77,6 +77,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8425c3d70895..489924cde4f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5543,6 +5543,7 @@ static const char readme_msg[] = #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/] []\n" #endif + "\t e[:[/]] . []\n" "\t -:[/]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" @@ -5552,7 +5553,7 @@ static const char readme_msg[] = " place (uprobe): :[%return][(ref_ctr_offset)]\n" #endif "\t args: =fetcharg[:type]\n" - "\t fetcharg: %, @
, @[+|-],\n" + "\t fetcharg: (%|$), @
, @[+|-],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API "\t $stack, $stack, $retval, $comm, $arg,\n" #else @@ -5567,6 +5568,8 @@ static const char readme_msg[] = "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" "\t [unsigned] char/int/long\n" #endif + "\t efield: For event probes ('e' types), the field is on of the fields\n" + "\t of the /.\n" #endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4a0e693000c6..b7c0f8e160fb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -126,6 +126,11 @@ struct kprobe_trace_entry_head { unsigned long ip; }; +struct eprobe_trace_entry_head { + struct trace_entry ent; + unsigned int type; +}; + struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; @@ -1508,9 +1513,14 @@ static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } extern int register_trigger_cmds(void); extern void clear_event_triggers(struct trace_array *tr); +enum { + EVENT_TRIGGER_FL_PROBE = BIT(0), +}; + struct event_trigger_data { unsigned long count; int ref; + int flags; struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; @@ -1918,6 +1928,14 @@ static inline bool is_good_name(const char *name) return true; } +/* Convert certain expected symbols into '_' when generating event names */ +static inline void sanitize_event_name(char *name) +{ + while (*name++ != '\0') + if (*name == ':' || *name == '.') + *name = '_'; +} + /* * This is a generic way to read and write a u64 value from a file in tracefs. * diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c new file mode 100644 index 000000000000..56a96e9750cf --- /dev/null +++ b/kernel/trace/trace_eprobe.c @@ -0,0 +1,903 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * event probes + * + * Part of this code was copied from kernel/trace/trace_kprobe.c written by + * Masami Hiramatsu + * + * Copyright (C) 2021, VMware Inc, Steven Rostedt + * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com> + * + */ +#include +#include +#include + +#include "trace_dynevent.h" +#include "trace_probe.h" +#include "trace_probe_tmpl.h" + +#define EPROBE_EVENT_SYSTEM "eprobes" + +struct trace_eprobe { + /* tracepoint system */ + const char *event_system; + + /* tracepoint event */ + const char *event_name; + + struct trace_event_call *event; + + struct dyn_event devent; + struct trace_probe tp; +}; + +struct eprobe_data { + struct trace_event_file *file; + struct trace_eprobe *ep; +}; + +static int __trace_eprobe_create(int argc, const char *argv[]); + +static void trace_event_probe_cleanup(struct trace_eprobe *ep) +{ + if (!ep) + return; + trace_probe_cleanup(&ep->tp); + kfree(ep->event_name); + kfree(ep->event_system); + if (ep->event) + trace_event_put_ref(ep->event); + kfree(ep); +} + +static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_eprobe, devent); +} + +static int eprobe_dyn_event_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_eprobe_create); +} + +static int eprobe_dyn_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + int i; + + seq_printf(m, "e:%s/%s", trace_probe_group_name(&ep->tp), + trace_probe_name(&ep->tp)); + seq_printf(m, " %s.%s", ep->event_system, ep->event_name); + + for (i = 0; i < ep->tp.nr_args; i++) + seq_printf(m, " %s=%s", ep->tp.args[i].name, ep->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +static int unregister_trace_eprobe(struct trace_eprobe *ep) +{ + /* If other probes are on the event, just unregister eprobe */ + if (trace_probe_has_sibling(&ep->tp)) + goto unreg; + + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&ep->tp)) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (trace_probe_unregister_event_call(&ep->tp)) + return -EBUSY; + +unreg: + dyn_event_remove(&ep->devent); + trace_probe_unlink(&ep->tp); + + return 0; +} + +static int eprobe_dyn_event_release(struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + int ret = unregister_trace_eprobe(ep); + + if (!ret) + trace_event_probe_cleanup(ep); + return ret; +} + +static bool eprobe_dyn_event_is_busy(struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + + return trace_probe_is_enabled(&ep->tp); +} + +static bool eprobe_dyn_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + + return strcmp(trace_probe_name(&ep->tp), event) == 0 && + (!system || strcmp(trace_probe_group_name(&ep->tp), system) == 0) && + trace_probe_match_command_args(&ep->tp, argc, argv); +} + +static struct dyn_event_operations eprobe_dyn_event_ops = { + .create = eprobe_dyn_event_create, + .show = eprobe_dyn_event_show, + .is_busy = eprobe_dyn_event_is_busy, + .free = eprobe_dyn_event_release, + .match = eprobe_dyn_event_match, +}; + +static struct trace_eprobe *alloc_event_probe(const char *group, + const char *this_event, + struct trace_event_call *event, + int nargs) +{ + struct trace_eprobe *ep; + const char *event_name; + const char *sys_name; + int ret = -ENOMEM; + + if (!event) + return ERR_PTR(-ENODEV); + + sys_name = event->class->system; + event_name = trace_event_name(event); + + ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); + if (!ep) { + trace_event_put_ref(ep->event); + goto error; + } + ep->event = event; + ep->event_name = kstrdup(event_name, GFP_KERNEL); + if (!ep->event_name) + goto error; + ep->event_system = kstrdup(sys_name, GFP_KERNEL); + if (!ep->event_system) + goto error; + + ret = trace_probe_init(&ep->tp, this_event, group, false); + if (ret < 0) + goto error; + + dyn_event_init(&ep->devent, &eprobe_dyn_event_ops); + return ep; +error: + trace_event_probe_cleanup(ep); + return ERR_PTR(ret); +} + +static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) +{ + struct probe_arg *parg = &ep->tp.args[i]; + struct ftrace_event_field *field; + struct list_head *head; + + head = trace_get_fields(ep->event); + list_for_each_entry(field, head, link) { + if (!strcmp(parg->code->data, field->name)) { + kfree(parg->code->data); + parg->code->data = field; + return 0; + } + } + kfree(parg->code->data); + parg->code->data = NULL; + return -ENOENT; +} + +static int eprobe_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct eprobe_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static struct trace_event_fields eprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = eprobe_event_define_fields }, + {} +}; + +/* Event entry printers */ +static enum print_line_t +print_eprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct eprobe_trace_entry_head *field; + struct trace_event_call *pevent; + struct trace_event *probed_event; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct eprobe_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + probed_event = ftrace_find_event(field->type); + if (probed_event) { + pevent = container_of(probed_event, struct trace_event_call, event); + trace_seq_printf(s, "%s.%s", pevent->class->system, + trace_event_name(pevent)); + } else { + trace_seq_printf(s, "%u", field->type); + } + + trace_seq_putc(s, ')'); + + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); +} + +static unsigned long get_event_field(struct fetch_insn *code, void *rec) +{ + struct ftrace_event_field *field = code->data; + unsigned long val; + void *addr; + + addr = rec + field->offset; + + switch (field->size) { + case 1: + if (field->is_signed) + val = *(char *)addr; + else + val = *(unsigned char *)addr; + break; + case 2: + if (field->is_signed) + val = *(short *)addr; + else + val = *(unsigned short *)addr; + break; + case 4: + if (field->is_signed) + val = *(int *)addr; + else + val = *(unsigned int *)addr; + break; + default: + if (field->is_signed) + val = *(long *)addr; + else + val = *(unsigned long *)addr; + break; + } + return val; +} + +static int get_eprobe_size(struct trace_probe *tp, void *rec) +{ + struct probe_arg *arg; + int i, len, ret = 0; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + if (unlikely(arg->dynamic)) { + unsigned long val; + + val = get_event_field(arg->code, rec); + len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL); + if (len > 0) + ret += len; + } + } + + return ret; +} + +/* Kprobe specific fetch functions */ + +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + unsigned long val; + + val = get_event_field(code, rec); + return process_fetch_insn_bottom(code + 1, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) +{ + int ret, len = 0; + u8 c; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return fetch_store_strlen_user(addr); +#endif + + do { + ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + return (ret < 0) ? ret : len; +} + +/* + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + const void __user *uaddr = (__force const void __user *)addr; + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. + */ +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return fetch_store_string_user(addr, dest, base); +#endif + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} + +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ + const void __user *uaddr = (__force const void __user *)src; + + return copy_from_user_nofault(dest, uaddr, size); +} + +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)src < TASK_SIZE) + return probe_mem_read_user(dest, src, size); +#endif + return copy_from_kernel_nofault(dest, src, size); +} + +/* eprobe handler */ +static inline void +__eprobe_trace_func(struct eprobe_data *edata, void *rec) +{ + struct eprobe_trace_entry_head *entry; + struct trace_event_call *call = trace_probe_event_call(&edata->ep->tp); + struct trace_event_buffer fbuffer; + int dsize; + + if (WARN_ON_ONCE(call != edata->file->event_call)) + return; + + if (trace_trigger_soft_disabled(edata->file)) + return; + + fbuffer.trace_ctx = tracing_gen_ctx(); + fbuffer.trace_file = edata->file; + + dsize = get_eprobe_size(&edata->ep->tp, rec); + fbuffer.regs = NULL; + + fbuffer.event = + trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file, + call->event.type, + sizeof(*entry) + edata->ep->tp.size + dsize, + fbuffer.trace_ctx); + if (!fbuffer.event) + return; + + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + if (edata->ep->event) + entry->type = edata->ep->event->event.type; + else + entry->type = 0; + store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +/* + * The event probe implementation uses event triggers to get access to + * the event it is attached to, but is not an actual trigger. The below + * functions are just stubs to fulfill what is needed to use the trigger + * infrastructure. + */ +static int eprobe_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return 0; +} + +static void eprobe_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + +} + +static int eprobe_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + /* Do not print eprobe event triggers */ + return 0; +} + +static void eprobe_trigger_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe) +{ + struct eprobe_data *edata = data->private_data; + + __eprobe_trace_func(edata, rec); +} + +static struct event_trigger_ops eprobe_trigger_ops = { + .func = eprobe_trigger_func, + .print = eprobe_trigger_print, + .init = eprobe_trigger_init, + .free = eprobe_trigger_free, +}; + +static int eprobe_trigger_cmd_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) +{ + return -1; +} + +static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + return -1; +} + +static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + +} + +static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, + char *param) +{ + return &eprobe_trigger_ops; +} + +static struct event_command event_trigger_cmd = { + .name = "eprobe", + .trigger_type = ETT_EVENT_EPROBE, + .flags = EVENT_CMD_FL_NEEDS_REC, + .func = eprobe_trigger_cmd_func, + .reg = eprobe_trigger_reg_func, + .unreg = eprobe_trigger_unreg_func, + .unreg_all = NULL, + .get_trigger_ops = eprobe_trigger_get_ops, + .set_filter = NULL, +}; + +static struct event_trigger_data * +new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) +{ + struct event_trigger_data *trigger; + struct eprobe_data *edata; + + edata = kzalloc(sizeof(*edata), GFP_KERNEL); + trigger = kzalloc(sizeof(*trigger), GFP_KERNEL); + if (!trigger || !edata) { + kfree(edata); + kfree(trigger); + return ERR_PTR(-ENOMEM); + } + + trigger->flags = EVENT_TRIGGER_FL_PROBE; + trigger->count = -1; + trigger->ops = &eprobe_trigger_ops; + + /* + * EVENT PROBE triggers are not registered as commands with + * register_event_command(), as they are not controlled by the user + * from the trigger file + */ + trigger->cmd_ops = &event_trigger_cmd; + + INIT_LIST_HEAD(&trigger->list); + RCU_INIT_POINTER(trigger->filter, NULL); + + edata->file = file; + edata->ep = ep; + trigger->private_data = edata; + + return trigger; +} + +static int enable_eprobe(struct trace_eprobe *ep, + struct trace_event_file *eprobe_file) +{ + struct event_trigger_data *trigger; + struct trace_event_file *file; + struct trace_array *tr = eprobe_file->tr; + + file = find_event_file(tr, ep->event_system, ep->event_name); + if (!file) + return -ENOENT; + trigger = new_eprobe_trigger(ep, eprobe_file); + if (IS_ERR(trigger)) + return PTR_ERR(trigger); + + list_add_tail_rcu(&trigger->list, &file->triggers); + + trace_event_trigger_enable_disable(file, 1); + update_cond_flag(file); + + return 0; +} + +static struct trace_event_functions eprobe_funcs = { + .trace = print_eprobe_event +}; + +static int disable_eprobe(struct trace_eprobe *ep, + struct trace_array *tr) +{ + struct event_trigger_data *trigger; + struct trace_event_file *file; + struct eprobe_data *edata; + + file = find_event_file(tr, ep->event_system, ep->event_name); + if (!file) + return -ENOENT; + + list_for_each_entry(trigger, &file->triggers, list) { + if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE)) + continue; + edata = trigger->private_data; + if (edata->ep == ep) + break; + } + if (list_entry_is_head(trigger, &file->triggers, list)) + return -ENODEV; + + list_del_rcu(&trigger->list); + + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + return 0; +} + +static int enable_trace_eprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *pos, *tp; + struct trace_eprobe *ep; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (enabled) + return 0; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + ep = container_of(pos, struct trace_eprobe, tp); + ret = enable_eprobe(ep, file); + if (ret) + break; + enabled = true; + } + + if (ret) { + /* Failed to enable one of them. Roll back all */ + if (enabled) + disable_eprobe(ep, file->tr); + if (file) + trace_probe_remove_file(tp, file); + else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + } + + return ret; +} + +static int disable_trace_eprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *pos, *tp; + struct trace_eprobe *ep; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) { + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + ep = container_of(pos, struct trace_eprobe, tp); + disable_eprobe(ep, file->tr); + } + } + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. + */ + trace_probe_remove_file(tp, file); + + return 0; +} + +static int eprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_eprobe(event, file); + case TRACE_REG_UNREGISTER: + return disable_trace_eprobe(event, file); +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +static inline void init_trace_eprobe_call(struct trace_eprobe *ep) +{ + struct trace_event_call *call = trace_probe_event_call(&ep->tp); + + call->flags = TRACE_EVENT_FL_EPROBE; + call->event.funcs = &eprobe_funcs; + call->class->fields_array = eprobe_fields_array; + call->class->reg = eprobe_register; +} + +static struct trace_event_call * +find_and_get_event(const char *system, const char *event_name) +{ + struct trace_event_call *tp_event; + const char *name; + + list_for_each_entry(tp_event, &ftrace_events, list) { + /* Skip other probes and ftrace events */ + if (tp_event->flags & + (TRACE_EVENT_FL_IGNORE_ENABLE | + TRACE_EVENT_FL_KPROBE | + TRACE_EVENT_FL_UPROBE | + TRACE_EVENT_FL_EPROBE)) + continue; + if (!tp_event->class->system || + strcmp(system, tp_event->class->system)) + continue; + name = trace_event_name(tp_event); + if (!name || strcmp(event_name, name)) + continue; + if (!trace_event_try_get_ref(tp_event)) { + return NULL; + break; + } + return tp_event; + break; + } + return NULL; +} + +static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) +{ + unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT; + int ret; + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags); + if (ret) + return ret; + + if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) + ret = trace_eprobe_tp_arg_update(ep, i); + + return ret; +} + +static int __trace_eprobe_create(int argc, const char *argv[]) +{ + /* + * Argument syntax: + * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS] + * Fetch args: + * =$[:TYPE] + */ + const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; + const char *sys_event = NULL, *sys_name = NULL; + struct trace_event_call *event_call; + struct trace_eprobe *ep = NULL; + char buf1[MAX_EVENT_NAME_LEN]; + char buf2[MAX_EVENT_NAME_LEN]; + int ret = 0; + int i; + + if (argc < 2 || argv[0][0] != 'e') + return -ECANCELED; + + trace_probe_log_init("event_probe", argc, argv); + + event = strchr(&argv[0][1], ':'); + if (event) { + event++; + ret = traceprobe_parse_event_name(&event, &group, buf1, + event - argv[0]); + if (ret) + goto parse_error; + } else { + strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); + sanitize_event_name(buf1); + event = buf1; + } + if (!is_good_name(event) || !is_good_name(group)) + goto parse_error; + + sys_event = argv[1]; + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, + sys_event - argv[1]); + if (ret || !sys_name) + goto parse_error; + if (!is_good_name(sys_event) || !is_good_name(sys_name)) + goto parse_error; + + mutex_lock(&event_mutex); + event_call = find_and_get_event(sys_name, sys_event); + ep = alloc_event_probe(group, event, event_call, argc - 2); + mutex_unlock(&event_mutex); + + if (IS_ERR(ep)) { + ret = PTR_ERR(ep); + /* This must return -ENOMEM, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto error; /* We know ep is not allocated */ + } + + argc -= 2; argv += 2; + /* parse arguments */ + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + trace_probe_log_set_index(i + 2); + ret = trace_eprobe_tp_update_arg(ep, argv, i); + if (ret) + goto error; + } + ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT); + if (ret < 0) + goto error; + init_trace_eprobe_call(ep); + mutex_lock(&event_mutex); + ret = trace_probe_register_event_call(&ep->tp); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } + mutex_unlock(&event_mutex); + goto error; + } + ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + mutex_unlock(&event_mutex); + return ret; +parse_error: + ret = -EINVAL; +error: + trace_event_probe_cleanup(ep); + return ret; +} + +/* + * Register dynevent at core_initcall. This allows kernel to setup eprobe + * events in postcore_initcall without tracefs. + */ +static __init int trace_events_eprobe_init_early(void) +{ + int err = 0; + + err = dyn_event_register(&eprobe_dyn_event_ops); + if (err) + pr_warn("Could not register eprobe_dyn_event_ops\n"); + + return err; +} +core_initcall(trace_events_eprobe_init_early); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 6b11e335a62e..3d5c07239a2a 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -124,6 +124,18 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) return seq_list_next(t, &event_file->triggers, pos); } +static bool check_user_trigger(struct trace_event_file *file) +{ + struct event_trigger_data *data; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->flags & EVENT_TRIGGER_FL_PROBE) + continue; + return true; + } + return false; +} + static void *trigger_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *event_file; @@ -134,7 +146,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos) if (unlikely(!event_file)) return ERR_PTR(-ENODEV); - if (list_empty(&event_file->triggers)) + if (list_empty(&event_file->triggers) || !check_user_trigger(event_file)) return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL; return seq_list_start(&event_file->triggers, *pos); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 882c27044029..3a64ba4bbad6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -707,14 +707,6 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; -/* Convert certain expected symbols into '_' when generating event names */ -static inline void sanitize_event_name(char *name) -{ - while (*name++ != '\0') - if (*name == ':' || *name == '.') - *name = '_'; -} - static int __trace_kprobe_create(int argc, const char *argv[]) { /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 782c00eb6859..3ed2a3f37297 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -319,6 +319,13 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif + } else if (flags & TPARG_FL_TPOINT) { + if (code->data) + return -EFAULT; + code->data = kstrdup(arg, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + code->op = FETCH_OP_TP_ARG; } else goto inval_var; @@ -646,13 +653,14 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, !strcmp(parg->type->name, "ustring")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && - code->op != FETCH_OP_DATA) { + code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); goto fail; } if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || - code->op == FETCH_OP_DATA) || parg->count) { + code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG || + parg->count) { /* * IMM, DATA and COMM is pointing actual address, those * must be kept, and if parg->count != 0, this is an @@ -867,6 +875,10 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, fmt = "(%lx <- %lx)"; arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; break; + case PROBE_PRINT_EVENT: + fmt = "(%u)"; + arg = "REC->" FIELD_STRING_TYPE; + break; default: WARN_ON_ONCE(1); return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 66701a92d186..99e7a5df025e 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,6 +38,7 @@ #define FIELD_STRING_IP "__probe_ip" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" +#define FIELD_STRING_TYPE "__probe_type" #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ @@ -102,6 +103,7 @@ enum fetch_op { FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ // Stage 5 (loop) op FETCH_OP_LP_ARRAY, /* Array: .param = loop count */ + FETCH_OP_TP_ARG, /* Trace Point argument */ FETCH_OP_END, FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */ }; @@ -351,7 +353,8 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) -#define TPARG_FL_MASK GENMASK(2, 0) +#define TPARG_FL_TPOINT BIT(3) +#define TPARG_FL_MASK GENMASK(3, 0) extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *argv, unsigned int flags); @@ -366,6 +369,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, enum probe_print_type { PROBE_PRINT_NORMAL, PROBE_PRINT_RETURN, + PROBE_PRINT_EVENT, }; extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype); -- cgit v1.2.3 From f552a27afe67f05c47bb0c33b92af2a23b684c31 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 12 Aug 2021 12:14:35 +0800 Subject: io_uring: remove files pointer in cancellation functions When doing cancellation, we use a parameter to indicate where it's from do_exit or exec. So a boolean value is good enough for this, remove the struct files* as it is not necessary. Signed-off-by: Hao Xu [axboe: fixup io_uring_files_cancel for !CONFIG_IO_URING] Signed-off-by: Jens Axboe --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9a89e7f36acb..91a43e57a32e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -777,7 +777,7 @@ void __noreturn do_exit(long code) schedule(); } - io_uring_files_cancel(tsk->files); + io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ /* sync mm's RSS info before statistics gathering */ -- cgit v1.2.3 From 6fc88c354f3af83ffa2c285b86e76c759755693f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 19 Aug 2021 02:24:20 -0700 Subject: bpf: Migrate cgroup_bpf to internal cgroup_bpf_attach_type enum Add an enum (cgroup_bpf_attach_type) containing only valid cgroup_bpf attach types and a function to map bpf_attach_type values to the new enum. Inspired by netns_bpf_attach_type. Then, migrate cgroup_bpf to use cgroup_bpf_attach_type wherever possible. Functionality is unchanged as attach_type_to_prog_type switches in bpf/syscall.c were preventing non-cgroup programs from making use of the invalid cgroup_bpf array slots. As a result struct cgroup_bpf uses 504 fewer bytes relative to when its arrays were sized using MAX_BPF_ATTACH_TYPE. bpf_cgroup_storage is notably not migrated as struct bpf_cgroup_storage_key is part of uapi and contains a bpf_attach_type member which is not meant to be opaque. Similarly, bpf_cgroup_link continues to report its bpf_attach_type member to userspace via fdinfo and bpf_link_info. To ease disambiguation, bpf_attach_type variables are renamed from 'type' to 'atype' when changed to cgroup_bpf_attach_type. Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210819092420.1984861-2-davemarchevsky@fb.com --- kernel/bpf/cgroup.c | 156 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 93 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8e9d99e2ade4..03145d45e3d5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -113,12 +113,12 @@ static void cgroup_bpf_release(struct work_struct *work) struct list_head *storages = &cgrp->bpf.storages; struct bpf_cgroup_storage *storage, *stmp; - unsigned int type; + unsigned int atype; mutex_lock(&cgroup_mutex); - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { - struct list_head *progs = &cgrp->bpf.progs[type]; + for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { + struct list_head *progs = &cgrp->bpf.progs[atype]; struct bpf_prog_list *pl, *pltmp; list_for_each_entry_safe(pl, pltmp, progs, node) { @@ -128,10 +128,10 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); } old_array = rcu_dereference_protected( - cgrp->bpf.effective[type], + cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); bpf_prog_array_free(old_array); } @@ -196,7 +196,7 @@ static u32 prog_list_length(struct list_head *head) * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *p; @@ -204,12 +204,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (!p) return true; do { - u32 flags = p->bpf.flags[type]; + u32 flags = p->bpf.flags[atype]; u32 cnt; if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[type]); + cnt = prog_list_length(&p->bpf.progs[atype]); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -225,7 +225,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, * to programs in this cgroup */ static int compute_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array **array) { struct bpf_prog_array_item *item; @@ -236,8 +236,8 @@ static int compute_effective_progs(struct cgroup *cgrp, /* count number of effective programs by walking parents */ do { - if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[type]); + if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[atype]); p = cgroup_parent(p); } while (p); @@ -249,10 +249,10 @@ static int compute_effective_progs(struct cgroup *cgrp, cnt = 0; p = cgrp; do { - if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - list_for_each_entry(pl, &p->bpf.progs[type], node) { + list_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; @@ -269,10 +269,10 @@ static int compute_effective_progs(struct cgroup *cgrp, } static void activate_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_prog_array *old_array) { - old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array, + old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array, lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array @@ -328,7 +328,7 @@ cleanup: } static int update_effective_progs(struct cgroup *cgrp, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup_subsys_state *css; int err; @@ -340,7 +340,7 @@ static int update_effective_progs(struct cgroup *cgrp, if (percpu_ref_is_zero(&desc->bpf.refcnt)) continue; - err = compute_effective_progs(desc, type, &desc->bpf.inactive); + err = compute_effective_progs(desc, atype, &desc->bpf.inactive); if (err) goto cleanup; } @@ -357,7 +357,7 @@ static int update_effective_progs(struct cgroup *cgrp, continue; } - activate_effective_progs(desc, type, desc->bpf.inactive); + activate_effective_progs(desc, atype, desc->bpf.inactive); desc->bpf.inactive = NULL; } @@ -436,11 +436,12 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); - struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; + struct list_head *progs; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || @@ -454,10 +455,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL; - if (!hierarchy_allows_attach(cgrp, type)) + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + + if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags) + if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -490,16 +497,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; bpf_cgroup_storages_assign(pl->storage, storage); - cgrp->bpf.flags[type] = saved_flags; + cgrp->bpf.flags[atype] = saved_flags; - err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup; if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key[type]); + static_branch_inc(&cgroup_bpf_enabled_key[atype]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -520,7 +527,7 @@ cleanup: * all descendant cgroups. This function is guaranteed to succeed. */ static void replace_effective_prog(struct cgroup *cgrp, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, struct bpf_cgroup_link *link) { struct bpf_prog_array_item *item; @@ -539,10 +546,10 @@ static void replace_effective_prog(struct cgroup *cgrp, /* find position of link in effective progs array */ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { - if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; - head = &cg->bpf.progs[type]; + head = &cg->bpf.progs[atype]; list_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; @@ -554,7 +561,7 @@ static void replace_effective_prog(struct cgroup *cgrp, found: BUG_ON(!cg); progs = rcu_dereference_protected( - desc->bpf.effective[type], + desc->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); item = &progs->items[pos]; WRITE_ONCE(item->prog, link->link.prog); @@ -574,11 +581,18 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, struct bpf_prog *new_prog) { - struct list_head *progs = &cgrp->bpf.progs[link->type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; + struct list_head *progs; bool found = false; + atype = to_cgroup_bpf_attach_type(link->type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + if (link->link.prog->type != new_prog->type) return -EINVAL; @@ -592,7 +606,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, return -ENOENT; old_prog = xchg(&link->link.prog, new_prog); - replace_effective_prog(cgrp, link->type, link); + replace_effective_prog(cgrp, atype, link); bpf_prog_put(old_prog); return 0; } @@ -667,12 +681,20 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum bpf_attach_type type) { - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; - struct bpf_prog_list *pl; + enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; + struct bpf_prog_list *pl; + struct list_head *progs; + u32 flags; int err; + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype]; + if (prog && link) /* only one of prog or link can be specified */ return -EINVAL; @@ -686,7 +708,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; pl->link = NULL; - err = update_effective_progs(cgrp, type); + err = update_effective_progs(cgrp, atype); if (err) goto cleanup; @@ -695,10 +717,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ - cgrp->bpf.flags[type] = 0; + cgrp->bpf.flags[atype] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key[type]); + static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; cleanup: @@ -714,13 +736,21 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); enum bpf_attach_type type = attr->query.attach_type; - struct list_head *progs = &cgrp->bpf.progs[type]; - u32 flags = cgrp->bpf.flags[type]; + enum cgroup_bpf_attach_type atype; struct bpf_prog_array *effective; + struct list_head *progs; struct bpf_prog *prog; int cnt, ret = 0, i; + u32 flags; + + atype = to_cgroup_bpf_attach_type(type); + if (atype < 0) + return -EINVAL; + + progs = &cgrp->bpf.progs[atype]; + flags = cgrp->bpf.flags[atype]; - effective = rcu_dereference_protected(cgrp->bpf.effective[type], + effective = rcu_dereference_protected(cgrp->bpf.effective[atype], lockdep_is_held(&cgroup_mutex)); if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) @@ -925,14 +955,14 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) link->cgroup = cgrp; link->type = attr->link_create.attach_type; - err = bpf_link_prime(&link->link, &link_primer); + err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_cgroup; } - err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, - BPF_F_ALLOW_MULTI); + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, + link->type, BPF_F_ALLOW_MULTI); if (err) { bpf_link_cleanup(&link_primer); goto out_put_cgroup; @@ -986,7 +1016,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { unsigned int offset = skb->data - skb_network_header(skb); struct sock *save_sk; @@ -1008,11 +1038,11 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - if (type == BPF_CGROUP_INET_EGRESS) { + if (atype == CGROUP_INET_EGRESS) { ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( - cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); } else { - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], skb, + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); ret = (ret == 1 ? 0 : -EPERM); } @@ -1038,12 +1068,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); * and if it returned != 1 during execution. In all other cases, 0 is returned. */ int __cgroup_bpf_run_filter_sk(struct sock *sk, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sk, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1065,7 +1095,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type, + enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags) { @@ -1090,7 +1120,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[type], &ctx, + ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, bpf_prog_run, flags); return ret == 1 ? 0 : -EPERM; @@ -1115,19 +1145,19 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); */ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], sock_ops, + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, bpf_prog_run); return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, - short access, enum bpf_attach_type type) + short access, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp; struct bpf_cgroup_dev_ctx ctx = { @@ -1139,7 +1169,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, + allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); rcu_read_unlock(); @@ -1231,7 +1261,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, - enum bpf_attach_type type) + enum cgroup_bpf_attach_type atype) { struct bpf_sysctl_kern ctx = { .head = head, @@ -1271,7 +1301,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[type], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1289,7 +1319,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, #ifdef CONFIG_NET static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum bpf_attach_type attach_type) + enum cgroup_bpf_attach_type attach_type) { struct bpf_prog_array *prog_array; bool empty; @@ -1364,7 +1394,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1385,7 +1415,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], &ctx, bpf_prog_run); release_sock(sk); @@ -1460,7 +1490,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; @@ -1495,7 +1525,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } lock_sock(sk); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); release_sock(sk); @@ -1556,7 +1586,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, * be called if that data shouldn't be "exported". */ - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); if (!ret) return -EPERM; -- cgit v1.2.3 From d7af7e497f0308bc97809cc48b58e8e0f13887e1 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 20 Aug 2021 09:39:35 -0700 Subject: bpf: Fix possible out of bound write in narrow load handling Fix a verifier bug found by smatch static checker in [0]. This problem has never been seen in prod to my best knowledge. Fixing it still seems to be a good idea since it's hard to say for sure whether it's possible or not to have a scenario where a combination of convert_ctx_access() and a narrow load would lead to an out of bound write. When narrow load is handled, one or two new instructions are added to insn_buf array, but before it was only checked that cnt >= ARRAY_SIZE(insn_buf) And it's safe to add a new instruction to insn_buf[cnt++] only once. The second try will lead to out of bound write. And this is what can happen if `shift` is set. Fix it by making sure that if the BPF_RSH instruction has to be added in addition to BPF_AND then there is enough space for two more instructions in insn_buf. The full report [0] is below: kernel/bpf/verifier.c:12304 convert_ctx_accesses() warn: offset 'cnt' incremented past end of array kernel/bpf/verifier.c:12311 convert_ctx_accesses() warn: offset 'cnt' incremented past end of array kernel/bpf/verifier.c 12282 12283 insn->off = off & ~(size_default - 1); 12284 insn->code = BPF_LDX | BPF_MEM | size_code; 12285 } 12286 12287 target_size = 0; 12288 cnt = convert_ctx_access(type, insn, insn_buf, env->prog, 12289 &target_size); 12290 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Bounds check. 12291 (ctx_field_size && !target_size)) { 12292 verbose(env, "bpf verifier is misconfigured\n"); 12293 return -EINVAL; 12294 } 12295 12296 if (is_narrower_load && size < target_size) { 12297 u8 shift = bpf_ctx_narrow_access_offset( 12298 off, size, size_default) * 8; 12299 if (ctx_field_size <= 4) { 12300 if (shift) 12301 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, ^^^^^ increment beyond end of array 12302 insn->dst_reg, 12303 shift); --> 12304 insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, ^^^^^ out of bounds write 12305 (1 << size * 8) - 1); 12306 } else { 12307 if (shift) 12308 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, 12309 insn->dst_reg, 12310 shift); 12311 insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, ^^^^^^^^^^^^^^^ Same. 12312 (1ULL << size * 8) - 1); 12313 } 12314 } 12315 12316 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); 12317 if (!new_prog) 12318 return -ENOMEM; 12319 12320 delta += cnt - 1; 12321 12322 /* keep walking new program and skip insns we just inserted */ 12323 env->prog = new_prog; 12324 insn = new_prog->insnsi + i + delta; 12325 } 12326 12327 return 0; 12328 } [0] https://lore.kernel.org/bpf/20210817050843.GA21456@kili/ v1->v2: - clarify that problem was only seen by static checker but not in prod; Fixes: 46f53a65d2de ("bpf: Allow narrow loads with offset > 0") Reported-by: Dan Carpenter Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210820163935.1902398-1-rdna@fb.com --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f5a0077c9981..206c221453cf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12295,6 +12295,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (is_narrower_load && size < target_size) { u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; + if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier narrow ctx load misconfigured\n"); + return -EINVAL; + } if (ctx_field_size <= 4) { if (shift) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, -- cgit v1.2.3 From 67d69e9d1a6c889d98951c1d74b19332ce0565af Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 23 Aug 2021 22:04:09 -0400 Subject: audit: move put_tree() to avoid trim_trees refcount underflow and UAF AUDIT_TRIM is expected to be idempotent, but multiple executions resulted in a refcount underflow and use-after-free. git bisect fingered commit fb041bb7c0a9 ("locking/refcount: Consolidate implementations of refcount_t") but this patch with its more thorough checking that wasn't in the x86 assembly code merely exposed a previously existing tree refcount imbalance in the case of tree trimming code that was refactored with prune_one() to remove a tree introduced in commit 8432c7006297 ("audit: Simplify locking around untag_chunk()") Move the put_tree() to cover only the prune_one() case. Passes audit-testsuite and 3 passes of "auditctl -t" with at least one directory watch. Cc: Jan Kara Cc: Will Deacon Cc: Alexander Viro Cc: Seiji Nishikawa Cc: stable@vger.kernel.org Fixes: 8432c7006297 ("audit: Simplify locking around untag_chunk()") Signed-off-by: Richard Guy Briggs Reviewed-by: Jan Kara [PM: reformatted/cleaned-up the commit description] Signed-off-by: Paul Moore --- kernel/audit_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b2be4e978ba3..2cd7b5694422 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -593,7 +593,6 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged) spin_lock(&hash_lock); } spin_unlock(&hash_lock); - put_tree(victim); } /* @@ -602,6 +601,7 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged) static void prune_one(struct audit_tree *victim) { prune_tree_chunks(victim, false); + put_tree(victim); } /* trim the uncommitted chunks from tree */ -- cgit v1.2.3 From 9f72daf7edfa8f7e86ce8940d52266b5e931dcb0 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Wed, 25 Aug 2021 12:54:15 +0200 Subject: cgroup/cpuset: Avoid memory migration when nodemasks match With the introduction of ee9707e8593d ("cgroup/cpuset: Enable memory migration for cpuset v2") attaching a process to a different cgroup will trigger a memory migration regardless of whether it's really needed. Memory migration is an expensive operation, so bypass it if the nodemasks passed to cpuset_migrate_mm() are equal. Note that we're not only avoiding the migration work itself, but also a call to lru_cache_disable(), which triggers and flushes an LRU drain work on every online CPU. Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 44d234b0df5e..d497a65c4f04 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1634,6 +1634,11 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, { struct cpuset_migrate_mm_work *mwork; + if (nodes_equal(*from, *to)) { + mmput(mm); + return; + } + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); if (mwork) { mwork->mm = mm; -- cgit v1.2.3 From 33c5cb36015ac1034b50b823fae367e908d05147 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:47 -0700 Subject: bpf: Consolidate task_struct BTF_ID declarations No need to have it defined 5 times. Once is enough. Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/6dcefa5bed26fe1226f26683f36819bb53ec19a2.1629772842.git.dxu@dxuuu.xyz --- kernel/bpf/bpf_task_storage.c | 6 ++---- kernel/bpf/stackmap.c | 4 +--- kernel/bpf/task_iter.c | 11 +++++------ kernel/trace/bpf_trace.c | 4 ++-- 4 files changed, 10 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 3ce75758d394..ebfa8bc90892 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -317,15 +317,13 @@ const struct bpf_map_ops task_storage_map_ops = { .map_owner_storage_ptr = task_storage_ptr, }; -BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct) - const struct bpf_func_proto bpf_task_storage_get_proto = { .func = bpf_task_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg2_btf_id = &btf_task_struct_ids[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; @@ -336,5 +334,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg2_btf_id = &btf_task_struct_ids[0], }; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6fbc2abe9c91..e8eefdf8cf3e 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -530,14 +530,12 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, return res; } -BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct) - const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, - .arg1_btf_id = &bpf_get_task_stack_btf_ids[0], + .arg1_btf_id = &btf_task_struct_ids[0], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index b68cb5d6d6eb..b48750bfba5a 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -525,7 +525,6 @@ static const struct seq_operations task_vma_seq_ops = { }; BTF_ID_LIST(btf_task_file_ids) -BTF_ID(struct, task_struct) BTF_ID(struct, file) BTF_ID(struct, vm_area_struct) @@ -591,19 +590,19 @@ static int __init task_iter_init(void) { int ret; - task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; ret = bpf_iter_reg_target(&task_reg_info); if (ret) return ret; - task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; - task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; + task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; + task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0]; ret = bpf_iter_reg_target(&task_file_reg_info); if (ret) return ret; - task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; - task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2]; + task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; + task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; return bpf_iter_reg_target(&task_vma_reg_info); } late_initcall(task_iter_init); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cbc73c08c4a4..50d055fc2327 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -714,13 +714,13 @@ BPF_CALL_0(bpf_get_current_task_btf) return (unsigned long) current; } -BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct) +BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) static const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, .ret_type = RET_PTR_TO_BTF_ID, - .ret_btf_id = &bpf_get_current_btf_ids[0], + .ret_btf_id = &btf_task_struct_ids[0], }; BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) -- cgit v1.2.3 From a396eda5517ac958fb4eb7358f4708eb829058c4 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:48 -0700 Subject: bpf: Extend bpf_base_func_proto helpers with bpf_get_current_task_btf() bpf_get_current_task() is already supported so it's natural to also include the _btf() variant for btf-powered helpers. This is required for non-tracing progs to use bpf_task_pt_regs() in the next commit. Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/f99870ed5f834c9803d73b3476f8272b1bb987c0.1629772842.git.dxu@dxuuu.xyz --- kernel/bpf/helpers.c | 3 +++ kernel/trace/bpf_trace.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4e8540716187..609674f409ed 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1322,6 +1322,7 @@ out: } const struct bpf_func_proto bpf_get_current_task_proto __weak; +const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; @@ -1407,6 +1408,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; + case BPF_FUNC_get_current_task_btf: + return &bpf_get_current_task_btf_proto; case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 50d055fc2327..4e54f3dc209f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -716,7 +716,7 @@ BPF_CALL_0(bpf_get_current_task_btf) BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) -static const struct bpf_func_proto bpf_get_current_task_btf_proto = { +const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, .ret_type = RET_PTR_TO_BTF_ID, -- cgit v1.2.3 From dd6e10fbd9fb86a571d925602c8a24bb4d09a2a7 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:49 -0700 Subject: bpf: Add bpf_task_pt_regs() helper The motivation behind this helper is to access userspace pt_regs in a kprobe handler. uprobe's ctx is the userspace pt_regs. kprobe's ctx is the kernelspace pt_regs. bpf_task_pt_regs() allows accessing userspace pt_regs in a kprobe handler. The final case (kernelspace pt_regs in uprobe) is pretty rare (usermode helper) so I think that can be solved later if necessary. More concretely, this helper is useful in doing BPF-based DWARF stack unwinding. Currently the kernel can only do framepointer based stack unwinds for userspace code. This is because the DWARF state machines are too fragile to be computed in kernelspace [0]. The idea behind DWARF-based stack unwinds w/ BPF is to copy a chunk of the userspace stack (while in prog context) and send it up to userspace for unwinding (probably with libunwind) [1]. This would effectively enable profiling applications with -fomit-frame-pointer using kprobes and uprobes. [0]: https://lkml.org/lkml/2012/2/10/356 [1]: https://github.com/danobi/bpf-dwarf-walk Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/e2718ced2d51ef4268590ab8562962438ab82815.1629772842.git.dxu@dxuuu.xyz --- kernel/bpf/helpers.c | 3 +++ kernel/trace/bpf_trace.c | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 609674f409ed..c227b7d4f56c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1327,6 +1327,7 @@ const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; +const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) @@ -1424,6 +1425,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; + case BPF_FUNC_task_pt_regs: + return &bpf_task_pt_regs_proto; default: return NULL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4e54f3dc209f..580e14ee7ff9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -723,6 +723,23 @@ const struct bpf_func_proto bpf_get_current_task_btf_proto = { .ret_btf_id = &btf_task_struct_ids[0], }; +BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) +{ + return (unsigned long) task_pt_regs(task); +} + +BTF_ID_LIST(bpf_task_pt_regs_ids) +BTF_ID(struct, pt_regs) + +const struct bpf_func_proto bpf_task_pt_regs_proto = { + .func = bpf_task_pt_regs, + .gpl_only = true, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_task_struct_ids[0], + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_task_pt_regs_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1032,6 +1049,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; + case BPF_FUNC_task_pt_regs: + return &bpf_task_pt_regs_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: -- cgit v1.2.3 From eb18b49ea758ec052ac2a12c6bb204e1e877ec31 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 24 Aug 2021 10:30:07 -0700 Subject: bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt This patch allows the bpf-tcp-cc to call bpf_setsockopt. One use case is to allow a bpf-tcp-cc switching to another cc during init(). For example, when the tcp flow is not ecn ready, the bpf_dctcp can switch to another cc by calling setsockopt(TCP_CONGESTION). During setsockopt(TCP_CONGESTION), the new tcp-cc's init() will be called and this could cause a recursion but it is stopped by the current trampoline's logic (in the prog->active counter). While retiring a bpf-tcp-cc (e.g. in tcp_v[46]_destroy_sock()), the tcp stack calls bpf-tcp-cc's release(). To avoid the retiring bpf-tcp-cc making further changes to the sk, bpf_setsockopt is not available to the bpf-tcp-cc's release(). This will avoid release() making setsockopt() call that will potentially allocate new resources. Although the bpf-tcp-cc already has a more powerful way to read tcp_sock from the PTR_TO_BTF_ID, it is usually expected that bpf_getsockopt and bpf_setsockopt are available together. Thus, bpf_getsockopt() is also added to all tcp_congestion_ops except release(). When the old bpf-tcp-cc is calling setsockopt(TCP_CONGESTION) to switch to a new cc, the old bpf-tcp-cc will be released by bpf_struct_ops_put(). Thus, this patch also puts the bpf_struct_ops_map after a rcu grace period because the trampoline's image cannot be freed while the old bpf-tcp-cc is still running. bpf-tcp-cc can only access icsk_ca_priv as SCALAR. All kernel's tcp-cc is also accessing the icsk_ca_priv as SCALAR. The size of icsk_ca_priv has already been raised a few times to avoid extra kmalloc and memory referencing. The only exception is the kernel's tcp_cdg.c that stores a kmalloc()-ed pointer in icsk_ca_priv. To avoid the old bpf-tcp-cc accidentally overriding this tcp_cdg's pointer value stored in icsk_ca_priv after switching and without over-complicating the bpf's verifier for this one exception in tcp_cdg, this patch does not allow switching to tcp_cdg. If there is a need, bpf_tcp_cdg can be implemented and then use the bpf_sk_storage as the extended storage. bpf_sk_setsockopt proto has only been recently added and used in bpf-sockopt and bpf-iter-tcp, so impose the tcp_cdg limitation in the same proto instead of adding a new proto specifically for bpf-tcp-cc. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210824173007.3976921-1-kafai@fb.com --- kernel/bpf/bpf_struct_ops.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 70f6fd4fa305..d6731c32864e 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -28,6 +28,7 @@ struct bpf_struct_ops_value { struct bpf_struct_ops_map { struct bpf_map map; + struct rcu_head rcu; const struct bpf_struct_ops *st_ops; /* protect map_update */ struct mutex lock; @@ -622,6 +623,14 @@ bool bpf_struct_ops_get(const void *kdata) return refcount_inc_not_zero(&kvalue->refcnt); } +static void bpf_struct_ops_put_rcu(struct rcu_head *head) +{ + struct bpf_struct_ops_map *st_map; + + st_map = container_of(head, struct bpf_struct_ops_map, rcu); + bpf_map_put(&st_map->map); +} + void bpf_struct_ops_put(const void *kdata) { struct bpf_struct_ops_value *kvalue; @@ -632,6 +641,17 @@ void bpf_struct_ops_put(const void *kdata) st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); - bpf_map_put(&st_map->map); + /* The struct_ops's function may switch to another struct_ops. + * + * For example, bpf_tcp_cc_x->init() may switch to + * another tcp_cc_y by calling + * setsockopt(TCP_CONGESTION, "tcp_cc_y"). + * During the switch, bpf_struct_ops_put(tcp_cc_x) is called + * and its map->refcnt may reach 0 which then free its + * trampoline image while tcp_cc_x is still running. + * + * Thus, a rcu grace period is needed here. + */ + call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); } } -- cgit v1.2.3 From eb529c5b10b9401a0f2d1f469e82c6a0ba98082c Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 25 Aug 2021 18:48:31 -0700 Subject: bpf: Fix bpf-next builds without CONFIG_BPF_EVENTS This commit fixes linker errors along the lines of: s390-linux-ld: task_iter.c:(.init.text+0xa4): undefined reference to `btf_task_struct_ids'` Fix by defining btf_task_struct_ids unconditionally in kernel/bpf/btf.c since there exists code that unconditionally uses btf_task_struct_ids. Reported-by: kernel test robot Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/05d94748d9f4b3eecedc4fddd6875418a396e23c.1629942444.git.dxu@dxuuu.xyz --- kernel/bpf/btf.c | 2 ++ kernel/trace/bpf_trace.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c395024610ed..dfe61df4f974 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6213,3 +6213,5 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; + +BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 580e14ee7ff9..8e2eb950aa82 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -714,8 +714,6 @@ BPF_CALL_0(bpf_get_current_task_btf) return (unsigned long) current; } -BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) - const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, -- cgit v1.2.3 From 307d522f5eb86cd6ac8c905f5b0577dedac54ec5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Jun 2021 16:44:32 -0500 Subject: signal/seccomp: Refactor seccomp signal and coredump generation Factor out force_sig_seccomp from the seccomp signal generation and place it in kernel/signal.c. The function force_sig_seccomp takes a parameter force_coredump to indicate that the sigaction field should be reset to SIGDFL so that a coredump will be generated when the signal is delivered. force_sig_seccomp is then used to replace both seccomp_send_sigsys and seccomp_init_siginfo. force_sig_info_to_task gains an extra parameter to force using the default signal action. With this change seccomp is no longer a special case and there becomes exactly one place do_coredump is called from. Further it no longer becomes necessary for __seccomp_filter to call do_group_exit. Acked-by: Kees Cook Link: https://lkml.kernel.org/r/87r1gr6qc4.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- kernel/seccomp.c | 40 ++++++---------------------------------- kernel/signal.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 057e17f3215d..abcbd3d2ba54 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -922,30 +922,6 @@ void get_seccomp_filter(struct task_struct *tsk) refcount_inc(&orig->users); } -static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) -{ - clear_siginfo(info); - info->si_signo = SIGSYS; - info->si_code = SYS_SECCOMP; - info->si_call_addr = (void __user *)KSTK_EIP(current); - info->si_errno = reason; - info->si_arch = syscall_get_arch(current); - info->si_syscall = syscall; -} - -/** - * seccomp_send_sigsys - signals the task to allow in-process syscall emulation - * @syscall: syscall number to send to userland - * @reason: filter-supplied reason code to send to userland (via si_errno) - * - * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. - */ -static void seccomp_send_sigsys(int syscall, int reason) -{ - struct kernel_siginfo info; - seccomp_init_siginfo(&info, syscall, reason); - force_sig_info(&info); -} #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ @@ -1218,7 +1194,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* Show the handler the original registers. */ syscall_rollback(current, current_pt_regs()); /* Let the filter pass back 16 bits of data. */ - seccomp_send_sigsys(this_syscall, data); + force_sig_seccomp(this_syscall, data, false); goto skip; case SECCOMP_RET_TRACE: @@ -1289,18 +1265,14 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* Dump core only if this is the last remaining thread. */ if (action != SECCOMP_RET_KILL_THREAD || get_nr_threads(current) == 1) { - kernel_siginfo_t info; - /* Show the original registers in the dump. */ syscall_rollback(current, current_pt_regs()); - /* Trigger a manual coredump since do_exit skips it. */ - seccomp_init_siginfo(&info, this_syscall, data); - do_coredump(&info); - } - if (action == SECCOMP_RET_KILL_THREAD) + /* Trigger a coredump with SIGSYS */ + force_sig_seccomp(this_syscall, data, true); + } else { do_exit(SIGSYS); - else - do_group_exit(SIGSYS); + } + return -1; /* skip the syscall go directly to signal handling */ } unreachable(); diff --git a/kernel/signal.c b/kernel/signal.c index a3229add4455..fbf941b80dd4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -54,6 +54,7 @@ #include #include #include +#include /* for syscall_get_* */ /* * SLAB caches for signal bits. @@ -1322,7 +1323,7 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p * that is why we also clear SIGNAL_UNKILLABLE. */ static int -force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) +force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool sigdfl) { unsigned long int flags; int ret, blocked, ignored; @@ -1333,7 +1334,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; blocked = sigismember(&t->blocked, sig); - if (blocked || ignored) { + if (blocked || ignored || sigdfl) { action->sa.sa_handler = SIG_DFL; if (blocked) { sigdelset(&t->blocked, sig); @@ -1354,7 +1355,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) int force_sig_info(struct kernel_siginfo *info) { - return force_sig_info_to_task(info, current); + return force_sig_info_to_task(info, current, false); } /* @@ -1685,7 +1686,7 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr info.si_flags = flags; info.si_isr = isr; #endif - return force_sig_info_to_task(&info, t); + return force_sig_info_to_task(&info, t, false); } int force_sig_fault(int sig, int code, void __user *addr @@ -1793,6 +1794,27 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) return force_sig_info(&info); } +/** + * force_sig_seccomp - signals the task to allow in-process syscall emulation + * @syscall: syscall number to send to userland + * @reason: filter-supplied reason code to send to userland (via si_errno) + * + * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. + */ +int force_sig_seccomp(int syscall, int reason, bool force_coredump) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSYS; + info.si_code = SYS_SECCOMP; + info.si_call_addr = (void __user *)KSTK_EIP(current); + info.si_errno = reason; + info.si_arch = syscall_get_arch(current); + info.si_syscall = syscall; + return force_sig_info_to_task(&info, current, force_coredump); +} + /* For the crazy architectures that include trap information in * the errno field, instead of an actual errno value. */ -- cgit v1.2.3 From d21918e5a94a862ccb297b9f2be38574c865fda0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Jun 2021 16:51:49 -0500 Subject: signal/seccomp: Dump core when there is only one live thread Replace get_nr_threads with atomic_read(¤t->signal->live) as that is a more accurate number that is decremented sooner. Acked-by: Kees Cook Link: https://lkml.kernel.org/r/87lf6z6qbd.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index abcbd3d2ba54..afa4db097068 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1264,7 +1264,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (action != SECCOMP_RET_KILL_THREAD || - get_nr_threads(current) == 1) { + (atomic_read(¤t->signal->live) == 1)) { /* Show the original registers in the dump. */ syscall_rollback(current, current_pt_regs()); /* Trigger a coredump with SIGSYS */ -- cgit v1.2.3 From bc17bed5fd73ef1a9aed39f3b0ea26936dad60b8 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 4 Aug 2021 21:01:05 +0800 Subject: printk/index: Fix -Wunused-function warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_MODULES is n, we got this: kernel/printk/index.c:146:13: warning: ‘pi_remove_file’ defined but not used [-Wunused-function] Move it inside #ifdef block to fix this warning. Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210804130105.18732-1-yuehaibing@huawei.com --- kernel/printk/index.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/index.c b/kernel/printk/index.c index 58d27272f992..d3709408debe 100644 --- a/kernel/printk/index.c +++ b/kernel/printk/index.c @@ -143,12 +143,12 @@ static void pi_create_file(struct module *mod) mod, &dfs_index_fops); } +#ifdef CONFIG_MODULES static void pi_remove_file(struct module *mod) { debugfs_remove(debugfs_lookup(pi_get_module_name(mod), dfs_index)); } -#ifdef CONFIG_MODULES static int pi_module_notify(struct notifier_block *nb, unsigned long op, void *data) { -- cgit v1.2.3 From d20d30ebb199354729c64c86945ed25c66ff4991 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 27 Aug 2021 17:02:55 -0700 Subject: cgroup: Avoid compiler warnings with no subsystems As done before in commit cb4a31675270 ("cgroup: use bitmask to filter for_each_subsys"), avoid compiler warnings for the pathological case of having no subsystems (i.e. CGROUP_SUBSYS_COUNT == 0). This condition is hit for the arm multi_v7_defconfig config under -Wzero-length-bounds: In file included from ./arch/arm/include/generated/asm/rwonce.h:1, from include/linux/compiler.h:264, from include/uapi/linux/swab.h:6, from include/linux/swab.h:5, from arch/arm/include/asm/opcodes.h:86, from arch/arm/include/asm/bug.h:7, from include/linux/bug.h:5, from include/linux/thread_info.h:13, from include/asm-generic/current.h:5, from ./arch/arm/include/generated/asm/current.h:1, from include/linux/sched.h:12, from include/linux/cgroup.h:12, from kernel/cgroup/cgroup-internal.h:5, from kernel/cgroup/cgroup.c:31: kernel/cgroup/cgroup.c: In function 'of_css': kernel/cgroup/cgroup.c:651:42: warning: array subscript '' is outside the bounds of an interior zero-length array 'struct cgroup_subsys_state *[0]' [-Wzero-length-bounds] 651 | return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); Reported-by: Stephen Rothwell Cc: Tejun Heo Cc: Zefan Li Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d0725c1a8db5..881ce1470beb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -67,6 +67,14 @@ /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) +/* + * To avoid confusing the compiler (and generating warnings) with code + * that attempts to access what would be a 0-element array (i.e. sized + * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this + * constant expression can be added. + */ +#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) + /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -248,7 +256,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ bool cgroup_ssid_enabled(int ssid) { - if (CGROUP_SUBSYS_COUNT == 0) + if (!CGROUP_HAS_SUBSYS_CONFIG) return false; return static_key_enabled(cgroup_subsys_enabled_key[ssid]); @@ -472,7 +480,7 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp) static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { - if (ss) + if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else @@ -550,6 +558,9 @@ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, { struct cgroup_subsys_state *css; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return NULL; + do { css = cgroup_css(cgrp, ss); @@ -577,6 +588,9 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, { struct cgroup_subsys_state *css; + if (!CGROUP_HAS_SUBSYS_CONFIG) + return NULL; + rcu_read_lock(); do { @@ -647,7 +661,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete. */ - if (cft->ss) + if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self; @@ -695,7 +709,7 @@ EXPORT_SYMBOL_GPL(of_css); */ #define do_each_subsys_mask(ss, ssid, ss_mask) do { \ unsigned long __ss_mask = (ss_mask); \ - if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ + if (!CGROUP_HAS_SUBSYS_CONFIG) { \ (ssid) = 0; \ break; \ } \ @@ -2372,7 +2386,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; - while (&cset->mg_node != tset->csets) { + while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) { if (!task) task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); @@ -4643,7 +4657,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, it->ss = css->ss; it->flags = flags; - if (it->ss) + if (CGROUP_HAS_SUBSYS_CONFIG && it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; else it->cset_pos = &css->cgroup->cset_links; -- cgit v1.2.3 From f3c4b1341e8320e63f197a554fc5a25686a11d22 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Fri, 27 Aug 2021 11:48:02 +0800 Subject: swiotlb: use depends on for DMA_RESTRICTED_POOL Use depends on instead of select for DMA_RESTRICTED_POOL; otherwise it will make SWIOTLB user configurable and cause compile errors for some arch (e.g. mips). Fixes: 0b84e4f8b793 ("swiotlb: Add restricted DMA pool initialization") Acked-by: Will Deacon Reported-by: Guenter Roeck Signed-off-by: Claire Chang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 3e961dc39634..d19f9fef4ab9 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -82,8 +82,7 @@ config SWIOTLB config DMA_RESTRICTED_POOL bool "DMA Restricted Pool" - depends on OF && OF_RESERVED_MEM - select SWIOTLB + depends on OF && OF_RESERVED_MEM && SWIOTLB help This enables support for restricted DMA pools which provide a level of DMA memory protection on systems with limited hardware protection -- cgit v1.2.3 From 35d7bdc86031a2c1ae05ac27dfa93b2acdcbaecc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:20:25 +0200 Subject: kernel/fork: factor out replacing the current MM exe_file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's factor the main logic out into replace_mm_exe_file(), such that all mm->exe_file logic is contained in kernel/fork.c. While at it, perform some simple cleanups that are possible now that we're simplifying the individual functions. Acked-by: Christian Brauner Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- kernel/fork.c | 44 +++++++++++++++++++++++++++++++++++++++++--- kernel/sys.c | 33 +-------------------------------- 2 files changed, 42 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 44f4c2d83763..f4ac883c4a1e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1148,9 +1148,7 @@ void mmput_async(struct mm_struct *mm) * * Main users are mmput() and sys_execve(). Callers prevent concurrent * invocations: in mmput() nobody alive left, in execve task is single - * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the - * mm->exe_file, but does so without using set_mm_exe_file() in order - * to avoid the need for any locks. + * threaded. */ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { @@ -1170,6 +1168,46 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) fput(old_exe_file); } +/** + * replace_mm_exe_file - replace a reference to the mm's executable file + * + * This changes mm's executable file (shown as symlink /proc/[pid]/exe), + * dealing with concurrent invocation and without grabbing the mmap lock in + * write mode. + * + * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE). + */ +int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +{ + struct vm_area_struct *vma; + struct file *old_exe_file; + int ret = 0; + + /* Forbid mm->exe_file change if old file still mapped. */ + old_exe_file = get_mm_exe_file(mm); + if (old_exe_file) { + mmap_read_lock(mm); + for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (path_equal(&vma->vm_file->f_path, + &old_exe_file->f_path)) + ret = -EBUSY; + } + mmap_read_unlock(mm); + fput(old_exe_file); + if (ret) + return ret; + } + + /* set the new file, lockless */ + get_file(new_exe_file); + old_exe_file = xchg(&mm->exe_file, new_exe_file); + if (old_exe_file) + fput(old_exe_file); + return 0; +} + /** * get_mm_exe_file - acquire a reference to the mm's executable file * diff --git a/kernel/sys.c b/kernel/sys.c index ef1a78f5d71c..30c12e54585a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1846,7 +1846,6 @@ SYSCALL_DEFINE1(umask, int, mask) static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; - struct file *old_exe, *exe_file; struct inode *inode; int err; @@ -1869,40 +1868,10 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; - /* - * Forbid mm->exe_file change if old file still mapped. - */ - exe_file = get_mm_exe_file(mm); - err = -EBUSY; - if (exe_file) { - struct vm_area_struct *vma; - - mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (path_equal(&vma->vm_file->f_path, - &exe_file->f_path)) - goto exit_err; - } - - mmap_read_unlock(mm); - fput(exe_file); - } - - err = 0; - /* set the new file, lockless */ - get_file(exe.file); - old_exe = xchg(&mm->exe_file, exe.file); - if (old_exe) - fput(old_exe); + err = replace_mm_exe_file(mm, exe.file); exit: fdput(exe); return err; -exit_err: - mmap_read_unlock(mm); - fput(exe_file); - goto exit; } /* -- cgit v1.2.3 From fe69d560b5bd9ec77b5d5749bd7027344daef47e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:29:59 +0200 Subject: kernel/fork: always deny write access to current MM exe_file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to remove VM_DENYWRITE only currently only used when mapping the executable during exec. During exec, we already deny_write_access() the executable, however, after exec completes the VMAs mapped with VM_DENYWRITE effectively keeps write access denied via deny_write_access(). Let's deny write access when setting or replacing the MM exe_file. With this change, we can remove VM_DENYWRITE for mapping executables. Make set_mm_exe_file() return an error in case deny_write_access() fails; note that this should never happen, because exec code does a deny_write_access() early and keeps write access denied when calling set_mm_exe_file. However, it makes the code easier to read and makes set_mm_exe_file() and replace_mm_exe_file() look more similar. This represents a minor user space visible change: sys_prctl(PR_SET_MM_MAP/EXE_FILE) can now fail if the file is already opened writable. Also, after sys_prctl(PR_SET_MM_MAP/EXE_FILE) the file cannot be opened writable. Note that we can already fail with -EACCES if the file doesn't have execute permissions. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- kernel/fork.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f4ac883c4a1e..7677f897ecb6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -470,6 +470,20 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) +{ + struct file *exe_file; + + exe_file = get_mm_exe_file(oldmm); + RCU_INIT_POINTER(mm->exe_file, exe_file); + /* + * We depend on the oldmm having properly denied write access to the + * exe_file already. + */ + if (exe_file && deny_write_access(exe_file)) + pr_warn_once("deny_write_access() failed in %s\n", __func__); +} + #ifdef CONFIG_MMU static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) @@ -493,7 +507,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + dup_mm_exe_file(mm, oldmm); mm->total_vm = oldmm->total_vm; mm->data_vm = oldmm->data_vm; @@ -639,7 +653,7 @@ static inline void mm_free_pgd(struct mm_struct *mm) static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { mmap_write_lock(oldmm); - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + dup_mm_exe_file(mm, oldmm); mmap_write_unlock(oldmm); return 0; } @@ -1149,8 +1163,10 @@ void mmput_async(struct mm_struct *mm) * Main users are mmput() and sys_execve(). Callers prevent concurrent * invocations: in mmput() nobody alive left, in execve task is single * threaded. + * + * Can only fail if new_exe_file != NULL. */ -void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct file *old_exe_file; @@ -1161,11 +1177,21 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) */ old_exe_file = rcu_dereference_raw(mm->exe_file); - if (new_exe_file) + if (new_exe_file) { + /* + * We expect the caller (i.e., sys_execve) to already denied + * write access, so this is unlikely to fail. + */ + if (unlikely(deny_write_access(new_exe_file))) + return -EACCES; get_file(new_exe_file); + } rcu_assign_pointer(mm->exe_file, new_exe_file); - if (old_exe_file) + if (old_exe_file) { + allow_write_access(old_exe_file); fput(old_exe_file); + } + return 0; } /** @@ -1201,10 +1227,22 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) } /* set the new file, lockless */ + ret = deny_write_access(new_exe_file); + if (ret) + return -EACCES; get_file(new_exe_file); + old_exe_file = xchg(&mm->exe_file, new_exe_file); - if (old_exe_file) + if (old_exe_file) { + /* + * Don't race with dup_mmap() getting the file and disallowing + * write access while someone might open the file writable. + */ + mmap_read_lock(mm); + allow_write_access(old_exe_file); fput(old_exe_file); + mmap_read_unlock(mm); + } return 0; } -- cgit v1.2.3 From 8d0920bde5eb8ec7e567939b85e65a0596c8580d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 22 Apr 2021 12:08:20 +0200 Subject: mm: remove VM_DENYWRITE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All in-tree users of MAP_DENYWRITE are gone. MAP_DENYWRITE cannot be set from user space, so all users are gone; let's remove it. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- kernel/events/core.c | 2 -- kernel/fork.c | 3 --- 2 files changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cb1f9b8392e..19767bb9933c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8307,8 +8307,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) else flags = MAP_PRIVATE; - if (vma->vm_flags & VM_DENYWRITE) - flags |= MAP_DENYWRITE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) diff --git a/kernel/fork.c b/kernel/fork.c index 7677f897ecb6..feef1057081d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -570,12 +570,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); file = tmp->vm_file; if (file) { - struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - put_write_access(inode); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) mapping_allow_writable(mapping); -- cgit v1.2.3 From fab827dbee8c2e06ca4ba000fa6c48bcf9054aba Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:54:57 -0700 Subject: memcg: enable accounting for pids in nested pid namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 5d097056c9a0 ("kmemcg: account certain kmem allocations to memcg") enabled memcg accounting for pids allocated from init_pid_ns.pid_cachep, but forgot to adjust the setting for nested pid namespaces. As a result, pid memory is not accounted exactly where it is really needed, inside memcg-limited containers with their own pid namespaces. Pid was one the first kernel objects enabled for memcg accounting. init_pid_ns.pid_cachep marked by SLAB_ACCOUNT and we can expect that any new pids in the system are memcg-accounted. Though recently I've noticed that it is wrong. nested pid namespaces creates own slab caches for pid objects, nested pids have increased size because contain id both for all parent and for own pid namespaces. The problem is that these slab caches are _NOT_ marked by SLAB_ACCOUNT, as a result any pids allocated in nested pid namespaces are not memcg-accounted. Pid struct in nested pid namespace consumes up to 500 bytes memory, 100000 such objects gives us up to ~50Mb unaccounted memory, this allow container to exceed assigned memcg limits. Link: https://lkml.kernel.org/r/8b6de616-fd1a-02c6-cbdb-976ecdcfa604@virtuozzo.com Fixes: 5d097056c9a0 ("kmemcg: account certain kmem allocations to memcg") Cc: stable@vger.kernel.org Signed-off-by: Vasily Averin Reviewed-by: Michal Koutný Reviewed-by: Shakeel Butt Acked-by: Christian Brauner Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ca43239a255a..cb5a25a8a0cc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -51,7 +51,8 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) - *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0); + *pkc = kmem_cache_create(name, len, 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); -- cgit v1.2.3 From 30acd0bdfb86548172168a0cc71d455944de0683 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:27 -0700 Subject: memcg: enable accounting for new namesapces and struct nsproxy Container admin can create new namespaces and force kernel to allocate up to several pages of memory for the namespaces and its associated structures. Net and uts namespaces have enabled accounting for such allocations. It makes sense to account for rest ones to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/5525bcbf-533e-da27-79b7-158686c64e13@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Serge Hallyn Acked-by: Christian Brauner Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/namespace.c | 2 +- kernel/nsproxy.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 4 ++-- kernel/user_namespace.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index f5e8828c109c..0d5c29879a50 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) struct cgroup_namespace *new_ns; int ret; - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index abc01fcad8c7..eec72ca962e2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -568,6 +568,6 @@ out: int __init nsproxy_cache_init(void) { - nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); + nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT); return 0; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index cb5a25a8a0cc..a46a3723bc66 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -450,7 +450,7 @@ const struct proc_ns_operations pidns_for_children_operations = { static __init int pid_namespaces_init(void) { - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 12eab0d2ae28..aec832801c26 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -88,13 +88,13 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL); + ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ef82d401dde8..6b2e3ca7ee99 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1385,7 +1385,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { - user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); -- cgit v1.2.3 From 5f58c39819ff78ca5ddbba2b3cd8ff4779b19bb5 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:35 -0700 Subject: memcg: enable accounting for signals When a user send a signal to any another processes it forces the kernel to allocate memory for 'struct sigqueue' objects. The number of signals is limited by RLIMIT_SIGPENDING resource limit, but even the default settings allow each user to consume up to several megabytes of memory. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/e34e958c-e785-712e-a62a-2c7b66c646c7@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a3229add4455..8921c4a8419f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4663,7 +4663,7 @@ void __init signals_init(void) { siginfo_buildtime_checks(); - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB -- cgit v1.2.3 From c509723ec27e925bb91a20682c448e95d4bc8c9f Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:39 -0700 Subject: memcg: enable accounting for posix_timers_cache slab A program may create multiple interval timers using timer_create(). For each timer the kernel preallocates a "queued real-time signal", Consequently, the number of timers is limited by the RLIMIT_SIGPENDING resource limit. The allocated object is quite small, ~250 bytes, but even the default signal limits allow to consume up to 100 megabytes per user. It makes sense to account for them to limit the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/57795560-025c-267c-6b1a-dea852d95530@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Thomas Gleixner Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/posix-timers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd5697d7347b..7363f81dc31a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -273,8 +273,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof(struct k_itimer), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); -- cgit v1.2.3 From 65d759c8f9f57b96c199f3fe5cfb93ac7da095e9 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Thu, 2 Sep 2021 14:59:59 -0700 Subject: mm: compaction: support triggering of proactive compaction by user The proactive compaction[1] gets triggered for every 500msec and run compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9) pages based on the value set to sysctl.compaction_proactiveness. Triggering the compaction for every 500msec in search of COMPACTION_HPAGE_ORDER pages is not needed for all applications, especially on the embedded system usecases which may have few MB's of RAM. Enabling the proactive compaction in its state will endup in running almost always on such systems. Other side, proactive compaction can still be very much useful for getting a set of higher order pages in some controllable manner(controlled by using the sysctl.compaction_proactiveness). So, on systems where enabling the proactive compaction always may proove not required, can trigger the same from user space on write to its sysctl interface. As an example, say app launcher decide to launch the memory heavy application which can be launched fast if it gets more higher order pages thus launcher can prepare the system in advance by triggering the proactive compaction from userspace. This triggering of proactive compaction is done on a write to sysctl.compaction_proactiveness by user. [1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a [akpm@linux-foundation.org: tweak vm.rst, per Mike] Link: https://lkml.kernel.org/r/1627653207-12317-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Acked-by: Vlastimil Babka Acked-by: Rafael Aquini Cc: Mike Rapoport Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Dave Hansen Cc: Mel Gorman Cc: Nitin Gupta Cc: Jonathan Corbet Cc: Khalid Aziz Cc: David Rientjes Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 272f4a272f8c..297f0b3966bd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2871,7 +2871,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_compaction_proactiveness, .maxlen = sizeof(sysctl_compaction_proactiveness), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = compaction_proactiveness_sysctl_handler, .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, -- cgit v1.2.3 From dce49103962840dd61423d7627748d6c558d58c5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 15:00:33 -0700 Subject: mm: wire up syscall process_mrelease Split off from prev patch in the series that implements the syscall. Link: https://lkml.kernel.org/r/20210809185259.405936-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Geert Uytterhoeven Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 30971b1dd4a9..18a9c2cde767 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -289,6 +289,7 @@ COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); COND_SYSCALL(process_madvise); +COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); -- cgit v1.2.3 From 54357f0c9149c871e5e4b83ad385a6f2ad3a749f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Aug 2021 15:26:25 +0200 Subject: tracing: Add migrate-disabled counter to tracing output. migrate_disable() forbids task migration to another CPU. It is available since v5.11 and has already users such as highmem or BPF. It is useful to observe this task state in tracing which already has other states like the preemption counter. Instead of adding the migrate disable counter as a new entry to struct trace_entry, which would extend the whole struct by four bytes, it is squashed into the preempt-disable counter. The lower four bits represent the preemption counter, the upper four bits represent the migrate disable counter. Both counter shouldn't exceed 15 but if they do, there is a safety net which caps the value at 15. Add the migrate-disable counter to the trace entry so it shows up in the trace. Due to the users mentioned above, it is already possible to observe it: | bash-1108 [000] ...21 73.950578: rss_stat: mm_id=2213312838 curr=0 type=MM_ANONPAGES size=8192B | bash-1108 [000] d..31 73.951222: irq_disable: caller=flush_tlb_mm_range+0x115/0x130 parent=ptep_clear_flush+0x42/0x50 | bash-1108 [000] d..31 73.951222: tlb_flush: pages:1 reason:local mm shootdown (3) The last value is the migrate-disable counter. Things that popped up: - trace_print_lat_context() does not print the migrate counter. Not sure if it should. It is used in "verbose" mode and uses 8 digits and I'm not sure ther is something processing the value. - trace_define_common_fields() now defines a different variable. This probably breaks things. No ide what to do in order to preserve the old behaviour. Since this is used as a filter it should be split somehow to be able to match both nibbles here. Link: https://lkml.kernel.org/r/20210810132625.ylssabmsrkygokuv@linutronix.de Signed-off-by: Thomas Gleixner [bigeasy: patch description.] Signed-off-by: Sebastian Andrzej Siewior [ SDR: Removed change to common_preempt_count field name ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 26 +++++++++++++++++++------- kernel/trace/trace_events.c | 1 + kernel/trace/trace_output.c | 11 ++++++++--- 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 489924cde4f8..b554e18924f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2603,6 +2603,15 @@ enum print_line_t trace_handle_return(struct trace_seq *s) } EXPORT_SYMBOL_GPL(trace_handle_return); +static unsigned short migration_disable_value(void) +{ +#if defined(CONFIG_SMP) + return current->migration_disabled; +#else + return 0; +#endif +} + unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) { unsigned int trace_flags = irqs_status; @@ -2621,7 +2630,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; - return (trace_flags << 16) | (pc & 0xff); + return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | + (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } struct ring_buffer_event * @@ -4189,9 +4199,10 @@ static void print_lat_help_header(struct seq_file *m) "# | / _----=> need-resched \n" "# || / _---=> hardirq/softirq \n" "# ||| / _--=> preempt-depth \n" - "# |||| / delay \n" - "# cmd pid ||||| time | caller \n" - "# \\ / ||||| \\ | / \n"); + "# |||| / _-=> migrate-disable \n" + "# ||||| / delay \n" + "# cmd pid |||||| time | caller \n" + "# \\ / |||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) @@ -4229,9 +4240,10 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); + seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); + seq_printf(m, "# %.*s|||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); } void diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1349b6de5eeb..830b3b9940f4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -181,6 +181,7 @@ static int trace_define_common_fields(void) __common_field(unsigned short, type); __common_field(unsigned char, flags); + /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a0bf446bb034..c2ca40e8595b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -492,8 +492,13 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) trace_seq_printf(s, "%c%c%c", irqs_off, need_resched, hardsoft_irq); - if (entry->preempt_count) - trace_seq_printf(s, "%x", entry->preempt_count); + if (entry->preempt_count & 0xf) + trace_seq_printf(s, "%x", entry->preempt_count & 0xf); + else + trace_seq_putc(s, '.'); + + if (entry->preempt_count & 0xf0) + trace_seq_printf(s, "%x", entry->preempt_count >> 4); else trace_seq_putc(s, '.'); @@ -656,7 +661,7 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_seq_printf( s, "%16s %7d %3d %d %08x %08lx ", comm, entry->pid, iter->cpu, entry->flags, - entry->preempt_count, iter->idx); + entry->preempt_count & 0xf, iter->idx); } else { lat_print_generic(s, entry, iter->cpu); } -- cgit v1.2.3 From f8416aa29185468e0d914ba4b2a330fd53ee263f Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Mon, 6 Sep 2021 19:23:02 +0800 Subject: kernel: debug: Convert to SPDX identifier use SPDX-License-Identifier instead of a verbose license text Signed-off-by: Cai Huoqing Link: https://lore.kernel.org/r/20210906112302.937-1-caihuoqing@baidu.com Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 5 +---- kernel/debug/gdbstub.c | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b4aa6bb6b2bd..da06a5553835 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel Debug Core * @@ -22,10 +23,6 @@ * * Original KGDB stub: David Grothe , * Tigran Aivazian - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. */ #define pr_fmt(fmt) "KGDB: " fmt diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index b6f28fad4307..9d34d2364b5a 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel Debug Core * @@ -22,10 +23,6 @@ * * Original KGDB stub: David Grothe , * Tigran Aivazian - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. */ #include -- cgit v1.2.3 From 5615e088b43d564aec08ccfaecb21ba484a1b4d6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Aug 2021 14:51:50 +0300 Subject: tracing: Fix some alloc_event_probe() error handling bugs There are two bugs in this code. First, if the kzalloc() fails it leads to a NULL dereference of "ep" on the next line. Second, if the alloc_event_probe() function returns an error then it leads to an error pointer dereference in the caller. Link: https://lkml.kernel.org/r/20210824115150.GI31143@kili Fixes: 7491e2c44278 ("tracing: Add a probe that attaches to trace events") Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_eprobe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 56a96e9750cf..3044b762cbd7 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -151,7 +151,7 @@ static struct trace_eprobe *alloc_event_probe(const char *group, ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); if (!ep) { - trace_event_put_ref(ep->event); + trace_event_put_ref(event); goto error; } ep->event = event; @@ -851,7 +851,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) ret = PTR_ERR(ep); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto error; /* We know ep is not allocated */ + ep = NULL; + goto error; } argc -= 2; argv += 2; -- cgit v1.2.3 From 3c91dda97eea704ac257ddb138d1154adab8db62 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 7 Sep 2021 19:58:18 -0700 Subject: kernel/acct.c: use dedicated helper to access rlimit values Use rlimit() helper instead of manually writing whole chain from task to rlimit value. See patch "posix-cpu-timers: Use dedicated helper to access rlimit values". Link: https://lkml.kernel.org/r/20210728030822.524789-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reported-by: Zeal Robot Cc: Randy Dunlap Cc: sh_def@163.com Cc: Yang Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index a64102be2bb0..23a7ab8e6cbc 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -478,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) /* * Accounting records are not subject to resource limits. */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + flim = rlimit(RLIMIT_FSIZE); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* Perform file operations on behalf of whoever enabled accounting */ orig_cred = override_creds(file->f_cred); -- cgit v1.2.3 From 2d186afd04d669fe9c48b994c41a7405a3c9f16d Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 7 Sep 2021 19:58:21 -0700 Subject: profiling: fix shift-out-of-bounds bugs Syzbot reported shift-out-of-bounds bug in profile_init(). The problem was in incorrect prof_shift. Since prof_shift value comes from userspace we need to clamp this value into [0, BITS_PER_LONG -1] boundaries. Second possible shiht-out-of-bounds was found by Tetsuo: sample_step local variable in read_profile() had "unsigned int" type, but prof_shift allows to make a BITS_PER_LONG shift. So, to prevent possible shiht-out-of-bounds sample_step type was changed to "unsigned long". Also, "unsigned short int" will be sufficient for storing [0, BITS_PER_LONG] value, that's why there is no need for "unsigned long" prof_shift. Link: https://lkml.kernel.org/r/20210813140022.5011-1-paskripkin@gmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-and-tested-by: syzbot+e68c89a9510c159d9684@syzkaller.appspotmail.com Suggested-by: Tetsuo Handa Signed-off-by: Pavel Skripkin Cc: Thomas Gleixner Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index c2ebddb5e974..eb9c7f0f5ac5 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -41,7 +41,8 @@ struct profile_hit { #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) static atomic_t *prof_buffer; -static unsigned long prof_len, prof_shift; +static unsigned long prof_len; +static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); @@ -67,8 +68,8 @@ int profile_setup(char *str) if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel sleep profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel sleep profiling enabled (shift: %u)\n", prof_shift); #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); @@ -78,21 +79,21 @@ int profile_setup(char *str) if (str[strlen(schedstr)] == ',') str += strlen(schedstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel schedule profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel schedule profiling enabled (shift: %u)\n", prof_shift); } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; if (str[strlen(kvmstr)] == ',') str += strlen(kvmstr) + 1; if (get_option(&str, &par)) - prof_shift = par; - pr_info("kernel KVM profiling enabled (shift: %ld)\n", + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel KVM profiling enabled (shift: %u)\n", prof_shift); } else if (get_option(&str, &par)) { - prof_shift = par; + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; - pr_info("kernel profiling enabled (shift: %ld)\n", + pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } return 1; @@ -468,7 +469,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned long p = *ppos; ssize_t read; char *pnt; - unsigned int sample_step = 1 << prof_shift; + unsigned long sample_step = 1UL << prof_shift; profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int)) -- cgit v1.2.3 From 1e1c15839df084f4011825fee922aa976c9159dc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 7 Sep 2021 20:00:00 -0700 Subject: fs/epoll: use a per-cpu counter for user's watches count This counter tracks the number of watches a user has, to compare against the 'max_user_watches' limit. This causes a scalability bottleneck on SPECjbb2015 on large systems as there is only one user. Changing to a per-cpu counter increases throughput of the benchmark by about 30% on a 16-socket, > 1000 thread system. [rdunlap@infradead.org: fix build errors in kernel/user.c when CONFIG_EPOLL=n] [npiggin@gmail.com: move ifdefs into wrapper functions, slightly improve panic message] Link: https://lkml.kernel.org/r/1628051945.fens3r99ox.astroid@bobo.none [akpm@linux-foundation.org: tweak user_epoll_alloc(), per Guenter] Link: https://lkml.kernel.org/r/20210804191421.GA1900577@roeck-us.net Link: https://lkml.kernel.org/r/20210802032013.2751916-1-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reported-by: Anton Blanchard Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index c82399c1618a..e2cf8c22b539 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -129,6 +129,22 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) return NULL; } +static int user_epoll_alloc(struct user_struct *up) +{ +#ifdef CONFIG_EPOLL + return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL); +#else + return 0; +#endif +} + +static void user_epoll_free(struct user_struct *up) +{ +#ifdef CONFIG_EPOLL + percpu_counter_destroy(&up->epoll_watches); +#endif +} + /* IRQs are disabled and uidhash_lock is held upon function entry. * IRQ state (as stored in flags) is restored and uidhash_lock released * upon function exit. @@ -138,6 +154,7 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); + user_epoll_free(up); kmem_cache_free(uid_cachep, up); } @@ -185,6 +202,10 @@ struct user_struct *alloc_uid(kuid_t uid) new->uid = uid; refcount_set(&new->__count, 1); + if (user_epoll_alloc(new)) { + kmem_cache_free(uid_cachep, new); + return NULL; + } ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); @@ -195,6 +216,7 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + user_epoll_free(new); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -216,6 +238,9 @@ static int __init uid_cache_init(void) for(n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(uidhash_table + n); + if (user_epoll_alloc(&root_user)) + panic("root_user epoll percpu counter alloc failed"); + /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); -- cgit v1.2.3 From 05da8113c9ba63a8913e6c73dc06ed01cae55f68 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Sep 2021 20:00:35 -0700 Subject: kernel/fork.c: unexport get_{mm,task}_exe_file Only used by core code and the tomoyo which can't be a module either. Link: https://lkml.kernel.org/r/20210820095430.445242-1-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 44f4c2d83763..df7296474ecd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1187,7 +1187,6 @@ struct file *get_mm_exe_file(struct mm_struct *mm) rcu_read_unlock(); return exe_file; } -EXPORT_SYMBOL(get_mm_exe_file); /** * get_task_exe_file - acquire a reference to the task's executable file @@ -1210,7 +1209,6 @@ struct file *get_task_exe_file(struct task_struct *task) task_unlock(task); return exe_file; } -EXPORT_SYMBOL(get_task_exe_file); /** * get_task_mm - acquire a reference to the task's mm -- cgit v1.2.3 From e1fbbd073137a9d63279f6bf363151a938347640 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 7 Sep 2021 20:00:41 -0700 Subject: prctl: allow to setup brk for et_dyn executables Keno Fischer reported that when a binray loaded via ld-linux-x the prctl(PR_SET_MM_MAP) doesn't allow to setup brk value because it lays before mm:end_data. For example a test program shows | # ~/t | | start_code 401000 | end_code 401a15 | start_stack 7ffce4577dd0 | start_data 403e10 | end_data 40408c | start_brk b5b000 | sbrk(0) b5b000 and when executed via ld-linux | # /lib64/ld-linux-x86-64.so.2 ~/t | | start_code 7fc25b0a4000 | end_code 7fc25b0c4524 | start_stack 7fffcc6b2400 | start_data 7fc25b0ce4c0 | end_data 7fc25b0cff98 | start_brk 55555710c000 | sbrk(0) 55555710c000 This of course prevent criu from restoring such programs. Looking into how kernel operates with brk/start_brk inside brk() syscall I don't see any problem if we allow to setup brk/start_brk without checking for end_data. Even if someone pass some weird address here on a purpose then the worst possible result will be an unexpected unmapping of existing vma (own vma, since prctl works with the callers memory) but test for RLIMIT_DATA is still valid and a user won't be able to gain more memory in case of expanding VMAs via new values shipped with prctl call. Link: https://lkml.kernel.org/r/20210121221207.GB2174@grain Fixes: bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec") Signed-off-by: Cyrill Gorcunov Reported-by: Keno Fischer Acked-by: Andrey Vagin Tested-by: Andrey Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Kirill Tkhai Cc: Eric W. Biederman Cc: Pavel Tikhomirov Cc: Alexander Mikhalitsyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ef1a78f5d71c..6ec50924b517 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1959,13 +1959,6 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) error = -EINVAL; - /* - * @brk should be after @end_data in traditional maps. - */ - if (prctl_map->start_brk <= prctl_map->end_data || - prctl_map->brk <= prctl_map->end_data) - goto out; - /* * Neither we should allow to override limits if they set. */ -- cgit v1.2.3 From 4b6b08f2e45edda4c067ac40833e3c1f84383c0b Mon Sep 17 00:00:00 2001 From: "Qiang.Zhang" Date: Tue, 31 Aug 2021 10:29:19 +0800 Subject: tracing/osnoise: Fix missed cpus_read_unlock() in start_per_cpu_kthreads() When start_kthread() return error, the cpus_read_unlock() need to be called. Link: https://lkml.kernel.org/r/20210831022919.27630-1-qiang.zhang@windriver.com Cc: Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations") Acked-by: Daniel Bristot de Oliveira Signed-off-by: Qiang.Zhang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 65b08b8e5bf8..ce053619f289 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1548,7 +1548,7 @@ static int start_kthread(unsigned int cpu) static int start_per_cpu_kthreads(struct trace_array *tr) { struct cpumask *current_mask = &save_cpumask; - int retval; + int retval = 0; int cpu; cpus_read_lock(); @@ -1568,13 +1568,13 @@ static int start_per_cpu_kthreads(struct trace_array *tr) retval = start_kthread(cpu); if (retval) { stop_per_cpu_kthreads(); - return retval; + break; } } cpus_read_unlock(); - return 0; + return retval; } #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From 0be083cee42ec78ba6f9148f5b12a00aa2fb8bd3 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 1 Sep 2021 16:55:13 +0300 Subject: tracing: synth events: increase max fields count Sometimes it is useful to construct larger synthetic trace events. Increase 'SYNTH_FIELDS_MAX' (maximum number of fields in a synthetic event) from 32 to 64. Link: https://lkml.kernel.org/r/20210901135513.3087062-1-dedekind1@gmail.com Signed-off-by: Artem Bityutskiy Acked-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_synth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index 4007fe95cf42..b29595fe3ac5 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -5,7 +5,7 @@ #include "trace_dynevent.h" #define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 32 +#define SYNTH_FIELDS_MAX 64 #define STR_VAR_LEN_MAX MAX_FILTER_STR_VAL /* must be multiple of sizeof(u64) */ -- cgit v1.2.3 From c910db943d35d4ac4b77570ece76e0799af24233 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 2 Sep 2021 15:57:12 -0500 Subject: tracing: Dynamically allocate the per-elt hist_elt_data array Setting the hist_elt_data.field_var_str[] array unconditionally to a size of SYNTH_FIELD_MAX elements wastes space unnecessarily. The actual number of elements needed can be calculated at run-time instead. In most cases, this will save a lot of space since it's a per-elt array which isn't normally close to being full. It also allows us to increase SYNTH_FIELD_MAX without worrying about even more wastage when we do that. Link: https://lkml.kernel.org/r/d52ae0ad5e1b59af7c4f54faf3fc098461fd82b3.camel@kernel.org Signed-off-by: Tom Zanussi Tested-by: Artem Bityutskiy Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9d91b1c06957..a6061a69aa84 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -508,7 +508,8 @@ struct track_data { struct hist_elt_data { char *comm; u64 *var_ref_vals; - char *field_var_str[SYNTH_FIELDS_MAX]; + char **field_var_str; + int n_field_var_str; }; struct snapshot_context { @@ -1401,9 +1402,11 @@ static void hist_elt_data_free(struct hist_elt_data *elt_data) { unsigned int i; - for (i = 0; i < SYNTH_FIELDS_MAX; i++) + for (i = 0; i < elt_data->n_field_var_str; i++) kfree(elt_data->field_var_str[i]); + kfree(elt_data->field_var_str); + kfree(elt_data->comm); kfree(elt_data); } @@ -1451,6 +1454,13 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) size = STR_VAR_LEN_MAX; + elt_data->field_var_str = kcalloc(n_str, sizeof(char *), GFP_KERNEL); + if (!elt_data->field_var_str) { + hist_elt_data_free(elt_data); + return -EINVAL; + } + elt_data->n_field_var_str = n_str; + for (i = 0; i < n_str; i++) { elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL); if (!elt_data->field_var_str[i]) { -- cgit v1.2.3 From cfd799837dbc48499abb05d1891b3d9992354d3a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 9 Sep 2021 04:38:03 +0900 Subject: tracing/boot: Fix to loop on only subkeys Since the commit e5efaeb8a8f5 ("bootconfig: Support mixing a value and subkeys under a key") allows to co-exist a value node and key nodes under a node, xbc_node_for_each_child() is not only returning key node but also a value node. In the boot-time tracing using xbc_node_for_each_child() to iterate the events, groups and instances, but those must be key nodes. Thus it must use xbc_node_for_each_subkey(). Link: https://lkml.kernel.org/r/163112988361.74896.2267026262061819145.stgit@devnote2 Fixes: e5efaeb8a8f5 ("bootconfig: Support mixing a value and subkeys under a key") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 1060b0446032..388e65d05978 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -522,14 +522,14 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) if (!node) return; /* per-event key starts with "event.GROUP.EVENT" */ - xbc_node_for_each_child(node, gnode) { + xbc_node_for_each_subkey(node, gnode) { data = xbc_node_get_data(gnode); if (!strcmp(data, "enable")) { enable_all = true; continue; } enable = false; - xbc_node_for_each_child(gnode, enode) { + xbc_node_for_each_subkey(gnode, enode) { data = xbc_node_get_data(enode); if (!strcmp(data, "enable")) { enable = true; @@ -625,7 +625,7 @@ trace_boot_init_instances(struct xbc_node *node) if (!node) return; - xbc_node_for_each_child(node, inode) { + xbc_node_for_each_subkey(node, inode) { p = xbc_node_get_data(inode); if (!p || *p == '\0') continue; -- cgit v1.2.3 From 4b692e861619353ce069e547a67c8d0e32d9ef3d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:10 -0700 Subject: kexec: move locking into do_kexec_load Patch series "compat: remove compat_alloc_user_space", v5. Going through compat_alloc_user_space() to convert indirect system call arguments tends to add complexity compared to handling the native and compat logic in the same code. This patch (of 6): The locking is the same between the native and compat version of sys_kexec_load(), so it can be done in the common implementation to reduce duplication. Link: https://lkml.kernel.org/r/20210727144859.4150043-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20210727144859.4150043-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Co-developed-by: Eric Biederman Co-developed-by: Christoph Hellwig Acked-by: "Eric W. Biederman" Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Al Viro Cc: Feng Tang Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index c82c6c06f051..9c7aef8f4bb6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -110,6 +110,17 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, unsigned long i; int ret; + /* + * Because we write directly to the reserved memory region when loading + * crash kernels we need a mutex here to prevent multiple crash kernels + * from attempting to load simultaneously, and to prevent a crash kernel + * from loading over the top of a in use crash kernel. + * + * KISS: always take the mutex. + */ + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) @@ -121,7 +132,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (nr_segments == 0) { /* Uninstall image */ kimage_free(xchg(dest_image, NULL)); - return 0; + ret = 0; + goto out_unlock; } if (flags & KEXEC_ON_CRASH) { /* @@ -134,7 +146,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags); if (ret) - return ret; + goto out_unlock; if (flags & KEXEC_PRESERVE_CONTEXT) image->preserve_context = 1; @@ -171,6 +183,8 @@ out: arch_kexec_protect_crashkres(); kimage_free(image); +out_unlock: + mutex_unlock(&kexec_mutex); return ret; } @@ -247,21 +261,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) return -EINVAL; - /* Because we write directly to the reserved memory - * region when loading crash kernels we need a mutex here to - * prevent multiple crash kernels from attempting to load - * simultaneously, and to prevent a crash kernel from loading - * over the top of a in use crash kernel. - * - * KISS: always take the mutex. - */ - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - result = do_kexec_load(entry, nr_segments, segments, flags); - mutex_unlock(&kexec_mutex); - return result; } @@ -301,21 +302,8 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return -EFAULT; } - /* Because we write directly to the reserved memory - * region when loading crash kernels we need a mutex here to - * prevent multiple crash kernels from attempting to load - * simultaneously, and to prevent a crash kernel from loading - * over the top of a in use crash kernel. - * - * KISS: always take the mutex. - */ - if (!mutex_trylock(&kexec_mutex)) - return -EBUSY; - result = do_kexec_load(entry, nr_segments, ksegments, flags); - mutex_unlock(&kexec_mutex); - return result; } #endif -- cgit v1.2.3 From 5d700a0fd71ded7096b97f01d276efc1a6579613 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:13 -0700 Subject: kexec: avoid compat_alloc_user_space kimage_alloc_init() expects a __user pointer, so compat_sys_kexec_load() uses compat_alloc_user_space() to convert the layout and put it back onto the user space caller stack. Moving the user space access into the syscall handler directly actually makes the code simpler, as the conversion for compat mode can now be done on kernel memory. Link: https://lkml.kernel.org/r/20210727144859.4150043-3-arnd@kernel.org Link: https://lore.kernel.org/lkml/YPbtsU4GX6PL7%2F42@infradead.org/ Link: https://lore.kernel.org/lkml/m1y2cbzmnw.fsf@fess.ebiederm.org/ Signed-off-by: Arnd Bergmann Co-developed-by: Eric Biederman Co-developed-by: Christoph Hellwig Acked-by: "Eric W. Biederman" Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 61 ++++++++++++++++++++++++---------------------------------- 1 file changed, 25 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 9c7aef8f4bb6..b5e40f069768 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -19,26 +19,9 @@ #include "kexec_internal.h" -static int copy_user_segment_list(struct kimage *image, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - int ret; - size_t segment_bytes; - - /* Read in the segments */ - image->nr_segments = nr_segments; - segment_bytes = nr_segments * sizeof(*segments); - ret = copy_from_user(image->segment, segments, segment_bytes); - if (ret) - ret = -EFAULT; - - return ret; -} - static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, - struct kexec_segment __user *segments, + struct kexec_segment *segments, unsigned long flags) { int ret; @@ -58,10 +41,8 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, return -ENOMEM; image->start = entry; - - ret = copy_user_segment_list(image, nr_segments, segments); - if (ret) - goto out_free_image; + image->nr_segments = nr_segments; + memcpy(image->segment, segments, nr_segments * sizeof(*segments)); if (kexec_on_panic) { /* Enable special crash kernel control page alloc policy. */ @@ -104,7 +85,7 @@ out_free_image: } static int do_kexec_load(unsigned long entry, unsigned long nr_segments, - struct kexec_segment __user *segments, unsigned long flags) + struct kexec_segment *segments, unsigned long flags) { struct kimage **dest_image, *image; unsigned long i; @@ -250,7 +231,8 @@ static inline int kexec_load_check(unsigned long nr_segments, SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) { - int result; + struct kexec_segment *ksegments; + unsigned long result; result = kexec_load_check(nr_segments, flags); if (result) @@ -261,7 +243,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) return -EINVAL; - result = do_kexec_load(entry, nr_segments, segments, flags); + ksegments = memdup_user(segments, nr_segments * sizeof(ksegments[0])); + if (IS_ERR(ksegments)) + return PTR_ERR(ksegments); + + result = do_kexec_load(entry, nr_segments, ksegments, flags); + kfree(ksegments); return result; } @@ -273,7 +260,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, compat_ulong_t, flags) { struct compat_kexec_segment in; - struct kexec_segment out, __user *ksegments; + struct kexec_segment *ksegments; unsigned long i, result; result = kexec_load_check(nr_segments, flags); @@ -286,24 +273,26 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) return -EINVAL; - ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); + ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]), + GFP_KERNEL); + if (!ksegments) + return -ENOMEM; + for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); if (result) - return -EFAULT; + goto fail; - out.buf = compat_ptr(in.buf); - out.bufsz = in.bufsz; - out.mem = in.mem; - out.memsz = in.memsz; - - result = copy_to_user(&ksegments[i], &out, sizeof(out)); - if (result) - return -EFAULT; + ksegments[i].buf = compat_ptr(in.buf); + ksegments[i].bufsz = in.bufsz; + ksegments[i].mem = in.mem; + ksegments[i].memsz = in.memsz; } result = do_kexec_load(entry, nr_segments, ksegments, flags); +fail: + kfree(ksegments); return result; } #endif -- cgit v1.2.3 From 59ab844eed9c6b01d32dcb27b57accc23771b324 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:25 -0700 Subject: compat: remove some compat entry points These are all handled correctly when calling the native system call entry point, so remove the special cases. Link: https://lkml.kernel.org/r/20210727144859.4150043-6-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 64578adfe115..f43d89d92860 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -292,15 +292,10 @@ COND_SYSCALL(process_madvise); COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); -COND_SYSCALL_COMPAT(mbind); COND_SYSCALL(get_mempolicy); -COND_SYSCALL_COMPAT(get_mempolicy); COND_SYSCALL(set_mempolicy); -COND_SYSCALL_COMPAT(set_mempolicy); COND_SYSCALL(migrate_pages); -COND_SYSCALL_COMPAT(migrate_pages); COND_SYSCALL(move_pages); -COND_SYSCALL_COMPAT(move_pages); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); -- cgit v1.2.3 From a7a08b275a8bbade798c4bdaad07ade68fe7003c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Sep 2021 15:18:29 -0700 Subject: arch: remove compat_alloc_user_space All users of compat_alloc_user_space() and copy_in_user() have been removed from the kernel, only a few functions in sparc remain that can be changed to calling arch_copy_in_user() instead. Link: https://lkml.kernel.org/r/20210727144859.4150043-7-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Eric Biederman Cc: Feng Tang Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 05adfd6fa8bf..55551989d9da 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -269,24 +269,3 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return 0; } EXPORT_SYMBOL_GPL(get_compat_sigset); - -/* - * Allocate user-space memory for the duration of a single system call, - * in order to marshall parameters inside a compat thunk. - */ -void __user *compat_alloc_user_space(unsigned long len) -{ - void __user *ptr; - - /* If len would occupy more than half of the entire compat space... */ - if (unlikely(len > (((compat_uptr_t)~0) >> 1))) - return NULL; - - ptr = arch_compat_alloc_user_space(len); - - if (unlikely(!access_ok(ptr, len))) - return NULL; - - return ptr; -} -EXPORT_SYMBOL_GPL(compat_alloc_user_space); -- cgit v1.2.3 From 13db8c50477d83ad3e3b9b0ae247e5cd833a7ae4 Mon Sep 17 00:00:00 2001 From: Liu Zixian Date: Wed, 8 Sep 2021 18:10:05 -0700 Subject: mm/hugetlb: initialize hugetlb_usage in mm_init After fork, the child process will get incorrect (2x) hugetlb_usage. If a process uses 5 2MB hugetlb pages in an anonymous mapping, HugetlbPages: 10240 kB and then forks, the child will show, HugetlbPages: 20480 kB The reason for double the amount is because hugetlb_usage will be copied from the parent and then increased when we copy page tables from parent to child. Child will have 2x actual usage. Fix this by adding hugetlb_count_init in mm_init. Link: https://lkml.kernel.org/r/20210826071742.877-1-liuzixian4@huawei.com Fixes: 5d317b2b6536 ("mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status") Signed-off-by: Liu Zixian Reviewed-by: Naoya Horiguchi Reviewed-by: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ff5be23800af..38681ad44c76 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1063,6 +1063,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); + hugetlb_count_init(mm); if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; -- cgit v1.2.3