From b33164f2bd1cedb094c32cb466287116164457ae Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 12 Aug 2020 14:31:02 +0200 Subject: bpf: Iterate through all PT_NOTE sections when looking for build id Currently, when we look for a build id within a bpf_get_stackid helper call, we check the first NOTE section and fail if the build id is not there. However, on some systems (e.g. Fedora) there can be multiple NOTE sections in binaries and the build id data is not always the first one, like: $ readelf -a /usr/bin/ls ... [ 2] .note.gnu.propert NOTE 0000000000000338 00000338 0000000000000020 0000000000000000 A 0 0 8358 [ 3] .note.gnu.build-i NOTE 0000000000000358 00000358 0000000000000024 0000000000000000 A 0 0 437c [ 4] .note.ABI-tag NOTE 000000000000037c 0000037c ... So the stack_map_get_build_id function will fail on build id retrieval and fall back to BPF_STACK_BUILD_ID_IP. This patch changes the stack_map_get_build_id code to iterate through all the NOTE sections and try to get build id data from each of them. When tracing a kernel build on the sched_switch tracepoint with a program that does a bpf_get_stackid helper call, I can see about a 60% increase in successful build id retrievals. The rest seem to fail on -EFAULT. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200812123102.20032-1-jolsa@kernel.org --- kernel/bpf/stackmap.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4fd830a62be2..cfed0ac44d38 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -213,11 +213,13 @@ static int stack_map_get_build_id_32(void *page_addr, phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) - if (phdr[i].p_type == PT_NOTE) - return stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz); + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } return -EINVAL; } @@ -236,11 +238,13 @@ static int stack_map_get_build_id_64(void *page_addr, phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) - if (phdr[i].p_type == PT_NOTE) - return stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz); + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } return -EINVAL; } -- cgit v1.2.3 From 9420139f516d7fbc248ce17f35275cb005ed98ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Aug 2020 12:26:24 +0200 Subject: dma-pool: fix coherent pool allocations for IOMMU mappings When allocating coherent pool memory for an IOMMU mapping we don't care about the DMA mask. Move the guess for the initial GFP mask into dma_direct_alloc_pages and pass dma_coherent_ok as a function pointer argument so that it doesn't get applied to the IOMMU case.
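For illustration, a minimal sketch of how a mask-agnostic caller such as the IOMMU path could use the reworked interface; the call site shown here is hypothetical, and the point is only that passing a NULL phys_addr_ok callback skips the dma_coherent_ok() filtering entirely:

	/* Hedged sketch (not part of this patch): IOMMU-path allocation.
	 * NULL means "any physical address is fine", so no mask check
	 * is applied inside the pool allocator. */
	void *cpu_addr;
	struct page *page;

	page = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &cpu_addr, gfp, NULL);
	if (!page)
		return NULL;
	/* The direct-mapping path would instead pass dma_coherent_ok here. */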
Signed-off-by: Christoph Hellwig Tested-by: Amit Pundir --- kernel/dma/direct.c | 13 ++++-- kernel/dma/pool.c | 114 ++++++++++++++++++++++------------------------------ 2 files changed, 57 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index bb0041e99659..db6ef07aec3b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,7 +43,7 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } -gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, +static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, u64 *phys_limit) { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); @@ -68,7 +68,7 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, return 0; } -bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); @@ -161,8 +161,13 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, size = PAGE_ALIGN(size); if (dma_should_alloc_from_pool(dev, gfp, attrs)) { - ret = dma_alloc_from_pool(dev, size, &page, gfp); - if (!ret) + u64 phys_mask; + + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + page = dma_alloc_from_pool(dev, size, &ret, gfp, + dma_coherent_ok); + if (!page) return NULL; goto done; } diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 6bc74a2d5127..5d071d4a3cba 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -196,93 +196,75 @@ static int __init dma_atomic_pool_init(void) } postcore_initcall(dma_atomic_pool_init); -static inline struct gen_pool *dma_guess_pool_from_device(struct device *dev) +static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) { - u64 phys_mask; - gfp_t gfp; - - gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); - if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA) + if (prev == NULL) { + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + return atomic_pool_dma32; + if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + return atomic_pool_dma; + return atomic_pool_kernel; + } + if (prev == atomic_pool_kernel) + return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma; + if (prev == atomic_pool_dma32) return atomic_pool_dma; - if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32) - return atomic_pool_dma32; - return atomic_pool_kernel; + return NULL; } -static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool) +static struct page *__dma_alloc_from_pool(struct device *dev, size_t size, + struct gen_pool *pool, void **cpu_addr, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)) { - if (bad_pool == atomic_pool_kernel) - return atomic_pool_dma32 ? 
: atomic_pool_dma; + unsigned long addr; + phys_addr_t phys; - if (bad_pool == atomic_pool_dma32) - return atomic_pool_dma; + addr = gen_pool_alloc(pool, size); + if (!addr) + return NULL; - return NULL; -} + phys = gen_pool_virt_to_phys(pool, addr); + if (phys_addr_ok && !phys_addr_ok(dev, phys, size)) { + gen_pool_free(pool, addr, size); + return NULL; + } -static inline struct gen_pool *dma_guess_pool(struct device *dev, - struct gen_pool *bad_pool) -{ - if (bad_pool) - return dma_get_safer_pool(bad_pool); + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); - return dma_guess_pool_from_device(dev); + *cpu_addr = (void *)addr; + memset(*cpu_addr, 0, size); + return pfn_to_page(__phys_to_pfn(phys)); } -void *dma_alloc_from_pool(struct device *dev, size_t size, - struct page **ret_page, gfp_t flags) +struct page *dma_alloc_from_pool(struct device *dev, size_t size, + void **cpu_addr, gfp_t gfp, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)) { struct gen_pool *pool = NULL; - unsigned long val = 0; - void *ptr = NULL; - phys_addr_t phys; - - while (1) { - pool = dma_guess_pool(dev, pool); - if (!pool) { - WARN(1, "Failed to get suitable pool for %s\n", - dev_name(dev)); - break; - } - - val = gen_pool_alloc(pool, size); - if (!val) - continue; - - phys = gen_pool_virt_to_phys(pool, val); - if (dma_coherent_ok(dev, phys, size)) - break; - - gen_pool_free(pool, val, size); - val = 0; - } - - - if (val) { - *ret_page = pfn_to_page(__phys_to_pfn(phys)); - ptr = (void *)val; - memset(ptr, 0, size); + struct page *page; - if (gen_pool_avail(pool) < atomic_pool_size) - schedule_work(&atomic_pool_work); + while ((pool = dma_guess_pool(pool, gfp))) { + page = __dma_alloc_from_pool(dev, size, pool, cpu_addr, + phys_addr_ok); + if (page) + return page; } - return ptr; + WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev)); + return NULL; } bool dma_free_from_pool(struct device *dev, void *start, size_t size) { struct gen_pool *pool = NULL; - while (1) { - pool = dma_guess_pool(dev, pool); - if (!pool) - return false; - - if (gen_pool_has_addr(pool, (unsigned long)start, size)) { - gen_pool_free(pool, (unsigned long)start, size); - return true; - } + while ((pool = dma_guess_pool(pool, 0))) { + if (!gen_pool_has_addr(pool, (unsigned long)start, size)) + continue; + gen_pool_free(pool, (unsigned long)start, size); + return true; } + + return false; } -- cgit v1.2.3 From d7e673ec2c8e0ea39c4c70fc490d67d7fbda869d Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Fri, 14 Aug 2020 12:26:23 +0200 Subject: dma-pool: Only allocate from CMA when in same memory zone There is no guarantee about CMA's placement, so allocating a zone-specific atomic pool from CMA might return memory from a completely different memory zone. To get around this, double-check CMA's placement before allocating from it. Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 5d071d4a3cba..06582b488e31 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -3,7 +3,9 @@ * Copyright (C) 2012 ARM Ltd.
* Copyright (C) 2020 Google LLC */ +#include <linux/cma.h> #include <linux/debugfs.h> +#include <linux/dma-contiguous.h> #include <linux/dma-direct.h> #include <linux/dma-noncoherent.h> #include <linux/init.h> @@ -55,6 +57,29 @@ static void dma_atomic_pool_size_add(gfp_t gfp, size_t size) pool_size_kernel += size; } +static bool cma_in_zone(gfp_t gfp) +{ + unsigned long size; + phys_addr_t end; + struct cma *cma; + + cma = dev_get_cma_area(NULL); + if (!cma) + return false; + + size = cma_get_size(cma); + if (!size) + return false; + + /* CMA can't cross zone boundaries, see cma_activate_area() */ + end = cma_get_base(cma) + size - 1; + if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + return end <= DMA_BIT_MASK(zone_dma_bits); + if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) + return end <= DMA_BIT_MASK(32); + return true; +} + static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, gfp_t gfp) { @@ -68,7 +93,11 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, do { pool_size = 1 << (PAGE_SHIFT + order); - page = alloc_pages(gfp, order); + if (cma_in_zone(gfp)) + page = dma_alloc_from_contiguous(NULL, 1 << order, + order, false); + if (!page) + page = alloc_pages(gfp, order); } while (!page && order-- > 0); if (!page) goto out; -- cgit v1.2.3 From 29e44f4535faa71a70827af3639b5e6762d8f02a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 17 Aug 2020 11:07:28 +0100 Subject: watch_queue: Limit the number of watches a user can hold Impose a limit on the number of watches that a user can hold so that they can't use this mechanism to fill up all the available memory. This is done by putting a counter in user_struct that's incremented when a watch is allocated and decremented when it is released. If the number exceeds the RLIMIT_NOFILE limit, the watch is rejected with EAGAIN. This can be tested by the following means: (1) Create a watch queue and attach it to fd 5 in the program given - in this case, bash: keyctl watch_session /tmp/nlog /tmp/gclog 5 bash (2) In the shell, set the maximum number of files to, say, 99: ulimit -n 99 (3) Add 200 keyrings: for ((i=0; i<200; i++)); do keyctl newring a$i @s || break; done (4) Try to watch all of the keyrings: for ((i=0; i<200; i++)); do echo $i; keyctl watch_add 5 %:a$i || break; done This should fail when the number of watches belonging to the user hits 99. (5) Remove all the keyrings and all of those watches should go away: for ((i=0; i<200; i++)); do keyctl unlink %:a$i; done (6) Kill off the watch queue by exiting the shell spawned by watch_session.
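Worth noting about the hunk further below: the counter is incremented first and compared afterwards, then rolled back on failure. A read-then-increment variant (a sketch for contrast, not from the patch) would be racy, since two tasks could both observe a value below the limit before either increments:

	/* Racy alternative -- do NOT use. Both tasks can pass the check
	 * and then both increment, overshooting the rlimit: */
	if (atomic_read(&user->nr_watches) < task_rlimit(current, RLIMIT_NOFILE))
		atomic_inc(&user->nr_watches);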
Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Linus Torvalds Signed-off-by: David Howells Reviewed-by: Jarkko Sakkinen Signed-off-by: Linus Torvalds --- kernel/watch_queue.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index f74020f6bd9d..0ef8f65bd2d7 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -393,6 +393,7 @@ static void free_watch(struct rcu_head *rcu) struct watch *watch = container_of(rcu, struct watch, rcu); put_watch_queue(rcu_access_pointer(watch->queue)); + atomic_dec(&watch->cred->user->nr_watches); put_cred(watch->cred); } @@ -452,6 +453,13 @@ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) watch->cred = get_current_cred(); rcu_assign_pointer(watch->watch_list, wlist); + if (atomic_inc_return(&watch->cred->user->nr_watches) > + task_rlimit(current, RLIMIT_NOFILE)) { + atomic_dec(&watch->cred->user->nr_watches); + put_cred(watch->cred); + return -EAGAIN; + } + spin_lock_bh(&wqueue->lock); kref_get(&wqueue->usage); kref_get(&watch->usage); -- cgit v1.2.3 From cf28f3bbfca097d956f9021cb710dfad56adcc62 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 17 Aug 2020 10:42:14 -0700 Subject: bpf: Use get_file_rcu() instead of get_file() for task_file iterator With the latest `bpftool prog` command, we observed the following kernel panic. BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor instruction fetch in kernel mode #PF: error_code(0x0010) - not-present page PGD dfe894067 P4D dfe894067 PUD deb663067 PMD 0 Oops: 0010 [#1] SMP CPU: 9 PID: 6023 ... RIP: 0010:0x0 Code: Bad RIP value. RSP: 0000:ffffc900002b8f18 EFLAGS: 00010286 RAX: ffff8883a405f400 RBX: ffff888e46a6bf00 RCX: 000000008020000c RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8883a405f400 RBP: ffff888e46a6bf50 R08: 0000000000000000 R09: ffffffff81129600 R10: ffff8883a405f300 R11: 0000160000000000 R12: 0000000000002710 R13: 000000e9494b690c R14: 0000000000000202 R15: 0000000000000009 FS: 00007fd9187fe700(0000) GS:ffff888e46a40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 0000000de5d33002 CR4: 0000000000360ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rcu_core+0x1a4/0x440 __do_softirq+0xd3/0x2c8 irq_exit+0x9d/0xa0 smp_apic_timer_interrupt+0x68/0x120 apic_timer_interrupt+0xf/0x20 RIP: 0033:0x47ce80 Code: Bad RIP value. RSP: 002b:00007fd9187fba40 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff13 RAX: 0000000000000002 RBX: 00007fd931789160 RCX: 000000000000010c RDX: 00007fd9308cdfb4 RSI: 00007fd9308cdfb4 RDI: 00007ffedd1ea0a8 RBP: 00007fd9187fbab0 R08: 000000000000000e R09: 000000000000002a R10: 0000000000480210 R11: 00007fd9187fc570 R12: 00007fd9316cc400 R13: 0000000000000118 R14: 00007fd9308cdfb4 R15: 00007fd9317a9380 After further analysis, the bug is triggered by commit eaaacd23910f ("bpf: Add task and task/file iterator targets") which introduced the task_file bpf iterator, which traverses all open file descriptors for all tasks in the current namespace. The latest `bpftool prog` calls a task_file bpf program to traverse all files in the system in order to associate processes with progs/maps, etc. When traversing files for a given task, rcu read_lock is taken to access all files in a files_struct. But it used get_file() to grab a file, which is not right.
It is possible that file->f_count is 0 and get_file() will unconditionally increase it. Later put_file() may cause all kinds of issues, with the above as one of the symptoms. The failure can be reproduced with the following steps in a few seconds: $ cat t.c #include <stdio.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #define N 10000 int fd[N]; int main() { int i; for (i = 0; i < N; i++) { fd[i] = open("./note.txt", 'r'); if (fd[i] < 0) { fprintf(stderr, "failed\n"); return -1; } } for (i = 0; i < N; i++) close(fd[i]); return 0; } $ gcc -O2 t.c $ cat run.sh #!/bin/bash for i in {1..100} do while true; do ./a.out; done & done $ ./run.sh $ while true; do bpftool prog >& /dev/null; done This patch used get_file_rcu() which only grabs a file if the file->f_count is not zero. This is to ensure the file pointer is always valid. The above reproducer did not fail for more than 30 minutes. Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets") Suggested-by: Josef Bacik Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Reviewed-by: Josef Bacik Link: https://lore.kernel.org/bpf/20200817174214.252601-1-yhs@fb.com --- kernel/bpf/task_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 232df29793e9..f21b5e1e4540 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -178,10 +178,11 @@ again: f = fcheck_files(curr_files, curr_fd); if (!f) continue; + if (!get_file_rcu(f)) + continue; /* set info->fd */ info->fd = curr_fd; - get_file(f); rcu_read_unlock(); return f; } -- cgit v1.2.3 From e679654a704e5bd676ea6446fa7b764cbabf168a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 18 Aug 2020 15:23:09 -0700 Subject: bpf: Fix a rcu_sched stall issue with bpf task/task_file iterator In our production system, we observed rcu stalls when `bpftool prog` is running. rcu: INFO: rcu_sched self-detected stall on CPU rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913 \x09(t=21031 jiffies g=2534773 q=179750) NMI backtrace for cpu 7 CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019 Call Trace: dump_stack+0x57/0x70 nmi_cpu_backtrace.cold+0x14/0x53 ? lapic_can_unplug_cpu.cold+0x39/0x39 nmi_trigger_cpumask_backtrace+0xb7/0xc7 rcu_dump_cpu_stacks+0xa2/0xd0 rcu_sched_clock_irq.cold+0x1ff/0x3d9 ?
tick_nohz_handler+0x100/0x100 update_process_times+0x5b/0x90 tick_sched_timer+0x5e/0xf0 __hrtimer_run_queues+0x12a/0x2a0 hrtimer_interrupt+0x10e/0x280 __sysvec_apic_timer_interrupt+0x51/0xe0 asm_call_on_stack+0xf/0x20 sysvec_apic_timer_interrupt+0x6f/0x80 asm_sysvec_apic_timer_interrupt+0x12/0x20 RIP: 0010:task_file_seq_get_next+0x71/0x220 Code: 00 00 8b 53 1c 49 8b 7d 00 89 d6 48 8b 47 20 44 8b 18 41 39 d3 76 75 48 8b 4f 20 8b 01 39 d0 76 61 41 89 d1 49 39 c1 48 19 c0 <48> 8b 49 08 21 d0 48 8d 04 c1 4c 8b 08 4d 85 c9 74 46 49 8b 41 38 RSP: 0018:ffffc90006223e10 EFLAGS: 00000297 RAX: ffffffffffffffff RBX: ffff888f0d172388 RCX: ffff888c8c07c1c0 RDX: 00000000000f017b RSI: 00000000000f017b RDI: ffff888c254702c0 RBP: ffffc90006223e68 R08: ffff888be2a1c140 R09: 00000000000f017b R10: 0000000000000002 R11: 0000000000100000 R12: ffff888f23c24118 R13: ffffc90006223e60 R14: ffffffff828509a0 R15: 00000000ffffffff task_file_seq_next+0x52/0xa0 bpf_seq_read+0xb9/0x320 vfs_read+0x9d/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x38/0x60 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f8815f4f76e Code: c0 e9 f6 fe ff ff 55 48 8d 3d 76 70 0a 00 48 89 e5 e8 36 06 02 00 66 0f 1f 44 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 52 c3 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 RSP: 002b:00007fff8f9df578 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 000000000170b9c0 RCX: 00007f8815f4f76e RDX: 0000000000001000 RSI: 00007fff8f9df5b0 RDI: 0000000000000007 RBP: 00007fff8f9e05f0 R08: 0000000000000049 R09: 0000000000000010 R10: 00007f881601fa40 R11: 0000000000000246 R12: 00007fff8f9e05a8 R13: 00007fff8f9e05a8 R14: 0000000001917f90 R15: 000000000000e22e Note that `bpftool prog` actually calls a task_file bpf iterator program to establish an association between prog/map/link/btf anon files and processes. In the case where the above rcu stall occurred, we had a process having 1587 tasks and each task having roughly 81305 files. This implied 129 million bpf prog invocations. Unfortunately none of these files are prog/map/link/btf files, so the bpf iterator/prog needs to traverse all these files and is not able to return to user space since there is no seq_file buffer overflow. This patch fixed the issue in bpf_seq_read() to limit the number of visited objects. If the maximum number of visited objects is reached, no more objects will be visited in the current syscall. If there is nothing written in the seq_file buffer, -EAGAIN is returned to the user so the user can try again. The maximum number of visited objects is set at 1 million. In our Intel Xeon D-2191 2.3GHz 18-core server, bpf_seq_read() visiting 1 million files takes around 0.18 seconds. We did not use cond_resched() since for some iterators, e.g., the netlink iterator, the rcu read_lock critical section spans consecutive seq_ops->next() calls, which makes it impossible to do cond_resched() in the key while loop of bpf_seq_read(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Cc: Paul E.
McKenney Link: https://lore.kernel.org/bpf/20200818222309.2181348-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b6715964b685..8faa2ce89396 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -67,6 +67,9 @@ static void bpf_iter_done_stop(struct seq_file *seq) iter_priv->done_stop = true; } +/* maximum visited objects before bailing out */ +#define MAX_ITER_OBJECTS 1000000 + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -79,7 +82,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, { struct seq_file *seq = file->private_data; size_t n, offs, copied = 0; - int err = 0; + int err = 0, num_objs = 0; void *p; mutex_lock(&seq->lock); @@ -135,6 +138,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, while (1) { loff_t pos = seq->index; + num_objs++; offs = seq->count; p = seq->op->next(seq, p, &seq->index); if (pos == seq->index) { @@ -153,6 +157,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, if (seq->count >= size) break; + if (num_objs >= MAX_ITER_OBJECTS) { + if (offs == 0) { + err = -EAGAIN; + seq->op->stop(seq, p); + goto done; + } + break; + } + err = seq->op->show(seq, p); if (err > 0) { bpf_iter_dec_seq_num(seq); -- cgit v1.2.3 From e60572b8d4c39572be6857d1ec91fdf979f8775f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 18 Aug 2020 15:23:10 -0700 Subject: bpf: Avoid visit same object multiple times Currently when traversing all tasks, the next tid is always increased by one. This may result in visiting the same task multiple times in a pid namespace. This patch fixed the issue by setting the next tid as pid_nr_ns(pid, ns) + 1, similar to function next_tgid(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Cc: Rik van Riel Link: https://lore.kernel.org/bpf/20200818222310.2181500-1-yhs@fb.com --- kernel/bpf/task_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index f21b5e1e4540..99af4cea1102 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -29,8 +29,9 @@ static struct task_struct *task_seq_get_next(struct pid_namespace *ns, rcu_read_lock(); retry: - pid = idr_get_next(&ns->idr, tid); + pid = find_ge_pid(*tid, ns); if (pid) { + *tid = pid_nr_ns(pid, ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; -- cgit v1.2.3 From d88d59b64ca35abae208e2781fdb45e69cbed56c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2020 21:44:39 +0200 Subject: core/entry: Respect syscall number rewrites The transcription of the x86 entry code to the generic version failed to reload the syscall number from ptregs after ptrace and seccomp have run, which both can modify the syscall number in ptregs. It returns the original syscall number instead, which is obviously not the right thing to do. Reload the syscall number to fix that.
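To see why the reload matters, here is a hedged userspace sketch (a hypothetical tracer, not from the patch) of how a syscall number gets rewritten in the first place; on x86-64, ptrace exposes the pending syscall number as orig_rax:

	/* Sketch: rewrite the tracee's pending syscall to -1 (skip it).
	 * For brevity this assumes the first syscall stop after SIGSTOP
	 * is the entry of the tracee's next syscall. */
	#include <stddef.h>
	#include <signal.h>
	#include <sys/ptrace.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <sys/user.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		pid_t pid = fork();

		if (pid == 0) {			/* tracee */
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			raise(SIGSTOP);
			syscall(SYS_getpid);	/* the call to be rewritten */
			return 0;
		}
		waitpid(pid, NULL, 0);			 /* SIGSTOP */
		ptrace(PTRACE_SYSCALL, pid, NULL, NULL); /* run to a syscall stop */
		waitpid(pid, NULL, 0);
		ptrace(PTRACE_POKEUSER, pid,
		       offsetof(struct user_regs_struct, orig_rax), (void *)-1L);
		ptrace(PTRACE_CONT, pid, NULL, NULL);
		return 0;
	}

Without the reload below, the kernel would keep dispatching on the pre-rewrite number even though the tracer changed it.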
Fixes: 142781e108b1 ("entry: Provide generic syscall entry functionality") Reported-by: Kyle Huey Signed-off-by: Thomas Gleixner Tested-by: Kyle Huey Tested-by: Kees Cook Acked-by: Kees Cook Link: https://lore.kernel.org/r/87blj6ifo8.fsf@nanos.tec.linutronix.de --- kernel/entry/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9852e0d62d95..fcae019158ca 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -65,7 +65,8 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, syscall_enter_audit(regs, syscall); - return ret ? : syscall; + /* The above might have changed the syscall number */ + return ret ? : syscall_get_nr(current, regs); } noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) -- cgit v1.2.3 From 71e843295c680898959b22dc877ae3839cc22470 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 20 Aug 2020 17:42:14 -0700 Subject: kernel/relay.c: fix memleak on destroy relay channel kmemleak reports a memory leak as follows: unreferenced object 0x607ee4e5f948 (size 8): comm "syz-executor.1", pid 2098, jiffies 4295031601 (age 288.468s) hex dump (first 8 bytes): 00 00 00 00 00 00 00 00 ........ backtrace: relay_open kernel/relay.c:583 [inline] relay_open+0xb6/0x970 kernel/relay.c:563 do_blk_trace_setup+0x4a8/0xb20 kernel/trace/blktrace.c:557 __blk_trace_setup+0xb6/0x150 kernel/trace/blktrace.c:597 blk_trace_ioctl+0x146/0x280 kernel/trace/blktrace.c:738 blkdev_ioctl+0xb2/0x6a0 block/ioctl.c:613 block_ioctl+0xe5/0x120 fs/block_dev.c:1871 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl fs/ioctl.c:739 [inline] __x64_sys_ioctl+0x170/0x1ce fs/ioctl.c:739 do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 'chan->buf' is allocated in relay_open() by alloc_percpu() but not freed when the relay channel is destroyed. Fix it by adding free_percpu() before returning from relay_destroy_channel(). Fixes: 017c59c042d0 ("relay: Use per CPU constructs for the relay channel buffer pointers") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Reviewed-by: Chris Wilson Cc: Al Viro Cc: Michael Ellerman Cc: David Rientjes Cc: Michel Lespinasse Cc: Daniel Axtens Cc: Thomas Gleixner Cc: Akash Goel Cc: Link: http://lkml.kernel.org/r/20200817122826.48518-1-weiyongjun1@huawei.com Signed-off-by: Linus Torvalds --- kernel/relay.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 72fe443ea78f..fb4e0c530c08 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -197,6 +197,7 @@ free_buf: static void relay_destroy_channel(struct kref *kref) { struct rchan *chan = container_of(kref, struct rchan, kref); + free_percpu(chan->buf); kfree(chan); } -- cgit v1.2.3 From c17c3dc9d08b9aad9a55a1e53f205187972f448e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 20 Aug 2020 17:42:17 -0700 Subject: uprobes: __replace_page() avoid BUG in munlock_vma_page() syzbot crashed on the VM_BUG_ON_PAGE(PageTail) in munlock_vma_page(), when called from uprobes __replace_page(). Which of many ways to fix it? Settled on not calling when PageCompound (since Head and Tail are equivalent in this context, PageCompound is the usual check in uprobes.c, and the prior use of FOLL_SPLIT_PMD will have cleared PageMlocked already).
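Context for the one-line guard below: PageCompound() is true for head and tail pages alike, so a single check suffices no matter which subpage old_page happens to be. A sketch of the relationship, assuming the usual page-flag helpers:

	/* Sketch: PageCompound(page) behaves like the disjunction below,
	 * so one test covers both head and tail subpages. */
	static inline bool page_is_compound_sketch(struct page *page)
	{
		return PageHead(page) || PageTail(page); /* == PageCompound(page) */
	}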
Fixes: 5a52c9df62b4 ("uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT") Reported-by: syzbot Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Reviewed-by: Srikar Dronamraju Acked-by: Song Liu Acked-by: Oleg Nesterov Cc: "Kirill A. Shutemov" Cc: stable@vger.kernel.org [5.4+] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008161338360.20413@eggly.anvils Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 649fd53dc9ad..0e18aaf23a7b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -205,7 +205,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) munlock_vma_page(old_page); put_page(old_page); -- cgit v1.2.3 From df561f6688fef775baa341a0f5d960becd248b11 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 23 Aug 2020 17:36:59 -0500 Subject: treewide: Use fallthrough pseudo-keyword Replace the existing /* fall through */ comments and their variants with the new pseudo-keyword macro fallthrough[1]. Also, remove fall-through markings where they are unnecessary. [1] https://www.kernel.org/doc/html/v5.7/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through Signed-off-by: Gustavo A. R. Silva --- kernel/auditfilter.c | 2 +- kernel/bpf/cgroup.c | 2 +- kernel/bpf/cpumap.c | 2 +- kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 4 ++-- kernel/capability.c | 2 +- kernel/compat.c | 6 +++--- kernel/debug/gdbstub.c | 6 +++--- kernel/debug/kdb/kdb_keyboard.c | 4 ++-- kernel/debug/kdb/kdb_support.c | 6 +++--- kernel/events/core.c | 2 +- kernel/irq/handle.c | 2 +- kernel/irq/manage.c | 4 ++-- kernel/kallsyms.c | 4 ++-- kernel/power/hibernate.c | 2 +- kernel/power/qos.c | 4 ++-- kernel/sched/core.c | 2 +- kernel/sched/topology.c | 6 +++--- kernel/signal.c | 2 +- kernel/sys.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/posix-timers.c | 4 ++-- kernel/time/tick-broadcast.c | 2 +- kernel/time/timer.c | 2 +- kernel/trace/blktrace.c | 2 +- kernel/trace/trace_events_filter.c | 4 ++-- 26 files changed, 41 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a10e2997aa6c..333b3bcfc545 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -681,7 +681,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->values[i] = AUDIT_UID_UNSET; break; } - /* fall through - if set */ + fallthrough; /* if set */ default: data->values[i] = f->val; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 83ff127ef7ae..e21de4f1754c 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1794,7 +1794,7 @@ static bool cg_sockopt_is_valid_access(int off, int size, return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; case offsetof(struct bpf_sockopt, optname): - /* fallthrough */ + fallthrough; case offsetof(struct bpf_sockopt, level): if (size != size_default) return false; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index f1c46529929b..6386b7bb98f2 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -279,7 +279,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, break; default: bpf_warn_invalid_xdp_action(act); - /* fallthrough */ + fallthrough; case XDP_DROP: xdp_return_frame(xdpf);
stats->drop++; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 86299a292214..1bf960aa615c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2029,7 +2029,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; - /* fallthrough */ + fallthrough; default: return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ef938f17b944..47e74f09fa37 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5236,7 +5236,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, off_reg == dst_reg ? dst : src); return -EACCES; } - /* fall-through */ + fallthrough; default: break; } @@ -10988,7 +10988,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) default: if (!prog_extension) return -EINVAL; - /* fallthrough */ + fallthrough; case BPF_MODIFY_RETURN: case BPF_LSM_MAC: case BPF_TRACE_FENTRY: diff --git a/kernel/capability.c b/kernel/capability.c index 1444f3954d75..7c59b096c98a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -93,7 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); - /* fall through - v3 is otherwise equivalent to v2. */ + fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; diff --git a/kernel/compat.c b/kernel/compat.c index b8d2800bb4b7..05adfd6fa8bf 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -255,11 +255,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return -EFAULT; switch (_NSIG_WORDS) { case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); - /* fall through */ + fallthrough; case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); - /* fall through */ + fallthrough; case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); - /* fall through */ + fallthrough; case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } #else diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index a790026e42d0..cc3c43dfec44 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1046,14 +1046,14 @@ int gdb_serial_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } #endif - /* Fall through */ + fallthrough; case 'C': /* Exception passing */ tmp = gdb_cmd_exception_pass(ks); if (tmp > 0) goto default_handle; if (tmp == 0) break; - /* Fall through - on tmp < 0 */ + fallthrough; /* on tmp < 0 */ case 'c': /* Continue packet */ case 's': /* Single step packet */ if (kgdb_contthread && kgdb_contthread != current) { @@ -1062,7 +1062,7 @@ int gdb_serial_stub(struct kgdb_state *ks) break; } dbg_activate_sw_breakpoints(); - /* Fall through - to default processing */ + fallthrough; /* to default processing */ default: default_handle: error = kgdb_arch_handle_exception(ks->ex_vector, diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 750497b0003a..f877a0a0d7cf 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -173,11 +173,11 @@ int kdb_get_kbd_char(void) case KT_LATIN: if (isprint(keychar)) break; /* printable characters */ - /* fall through */ + fallthrough; case KT_SPEC: if (keychar == K_ENTER) break; - /* fall through */ + fallthrough; default: return -1; /* ignore unprintables */ } diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 004c5b6c87f8..6226502ce049 100644 --- 
a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -432,7 +432,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* fall through */ + fallthrough; default: diag = KDB_BADWIDTH; kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); @@ -481,7 +481,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size) *word = w8; break; } - /* fall through */ + fallthrough; default: diag = KDB_BADWIDTH; kdb_printf("kdb_getword: bad width %ld\n", (long) size); @@ -525,7 +525,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) diag = kdb_putarea(addr, w8); break; } - /* fall through */ + fallthrough; default: diag = KDB_BADWIDTH; kdb_printf("kdb_putword: bad width %ld\n", (long) size); diff --git a/kernel/events/core.c b/kernel/events/core.c index 5bfe8e3c6e44..7ed5248f0445 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10034,7 +10034,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; - /* fall through */ + fallthrough; case IF_SRC_FILEADDR: case IF_SRC_FILE: diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a8e14c80b405..762a928e18f9 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -173,7 +173,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags __irq_wake_thread(desc, action); - /* Fall through - to add to randomness */ + fallthrough; /* to add to randomness */ case IRQ_HANDLED: *flags |= action->flags; break; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 52ac5391dcc6..5df903fccb60 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -271,7 +271,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); - /* fall through */ + fallthrough; case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); @@ -868,7 +868,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); - /* fall through */ + fallthrough; case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 95cb74f73292..4fb15fa96734 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -684,12 +684,12 @@ bool kallsyms_show_value(const struct cred *cred) case 0: if (kallsyms_for_perf()) return true; - /* fallthrough */ + fallthrough; case 1: if (security_capable(cred, &init_user_ns, CAP_SYSLOG, CAP_OPT_NOAUDIT) == 0) return true; - /* fallthrough */ + fallthrough; default: return false; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f33769f97aca..e7aa57fb2fdc 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -659,7 +659,7 @@ static void power_down(void) break; case HIBERNATION_PLATFORM: hibernation_platform_enter(); - /* Fall through */ + fallthrough; case HIBERNATION_SHUTDOWN: if (pm_power_off) kernel_power_off(); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index db0bed2cae26..ec7e1e85923e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -119,7 +119,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, * and add, then see if the aggregate has changed. 
*/ plist_del(node, &c->list); - /* fall through */ + fallthrough; case PM_QOS_ADD_REQ: plist_node_init(node, new_value); plist_add(node, &c->list); @@ -188,7 +188,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, break; case PM_QOS_UPDATE_REQ: pm_qos_flags_remove_req(pqf, req); - /* fall through */ + fallthrough; case PM_QOS_ADD_REQ: req->flags = val; INIT_LIST_HEAD(&req->node); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8471a0f7eb32..2d95dc3f4644 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2320,7 +2320,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) state = possible; break; } - /* Fall-through */ + fallthrough; case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 007b0a6b0152..1bd7e3af904f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1219,13 +1219,13 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, case sa_rootdomain: if (!atomic_read(&d->rd->refcount)) free_rootdomain(&d->rd->rcu); - /* Fall through */ + fallthrough; case sa_sd: free_percpu(d->sd); - /* Fall through */ + fallthrough; case sa_sd_storage: __sdt_free(cpu_map); - /* Fall through */ + fallthrough; case sa_none: break; } diff --git a/kernel/signal.c b/kernel/signal.c index 42b67d2cea37..a38b3edc6851 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -851,7 +851,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, */ if (!sid || sid == task_session(current)) break; - /* fall through */ + fallthrough; default: return -EPERM; } diff --git a/kernel/sys.c b/kernel/sys.c index ca11af9d815d..ab6c409b1159 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1753,7 +1753,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) if (who == RUSAGE_CHILDREN) break; - /* fall through */ + fallthrough; case RUSAGE_SELF: thread_group_cputime_adjusted(p, &tgutime, &tgstime); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c4038511d5c9..95b6a708b040 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -377,7 +377,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); - /* fall through */ + fallthrough; default: return false; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 07709ac30439..bf540f5a4115 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -439,12 +439,12 @@ static struct pid *good_sigevent(sigevent_t * event) rtn = pid_task(pid, PIDTYPE_PID); if (!rtn || !same_thread_group(rtn, current)) return NULL; - /* FALLTHRU */ + fallthrough; case SIGEV_SIGNAL: case SIGEV_THREAD: if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) return NULL; - /* FALLTHRU */ + fallthrough; case SIGEV_NONE: return pid; default: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e51778c312f1..36d7464c8962 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -381,7 +381,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) switch (mode) { case TICK_BROADCAST_FORCE: tick_broadcast_forced = 1; - /* fall through */ + fallthrough; case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a16764b0116e..a50364df1054 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ 
-666,7 +666,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: WARN_ON(1); - /* fall through */ + fallthrough; default: return false; } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7ba62d68885a..4b3a42fc3b24 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -745,7 +745,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) #endif case BLKTRACESTART: start = 1; - /* fall through */ + fallthrough; case BLKTRACESTOP: ret = __blk_trace_startstop(q, start); break; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index bf44f6bbd0c3..78a678eeb140 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -499,7 +499,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, ptr++; break; } - /* fall through */ + fallthrough; default: parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); @@ -1273,7 +1273,7 @@ static int parse_pred(const char *str, void *data, switch (op) { case OP_NE: pred->not = 1; - /* Fall through */ + fallthrough; case OP_GLOB: case OP_EQ: break; -- cgit v1.2.3 From b474959d5afda6e341a02c85f9595d85d39189ae Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 12:10:54 -0700 Subject: bpf: Fix a buffer out-of-bound access when filling raw_tp link_info Commit f2e10bff16a0 ("bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link") added link query for raw_tp. One of the fields in link_info is to fill a user buffer with tp_name. The current checking only declares "ulen && !ubuf" as invalid. So "!ulen && ubuf" will be valid. Later on, we do "copy_to_user(ubuf, tp_name, ulen - 1)" which may overwrite user memory incorrectly. This patch fixed the problem by disallowing the "!ulen && ubuf" case as well. Fixes: f2e10bff16a0 ("bpf: Add support for BPF_OBJ_GET_INFO_BY_FD for bpf_link") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200821191054.714731-1-yhs@fb.com --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 86299a292214..ac6c784c0576 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2634,7 +2634,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, u32 ulen = info->raw_tracepoint.tp_name_len; size_t tp_len = strlen(tp_name); - if (ulen && !ubuf) + if (!ulen ^ !ubuf) return -EINVAL; info->raw_tracepoint.tp_name_len = tp_len + 1; -- cgit v1.2.3 From 7787b6fc938e16aa418613c4a765c1dbb268ed9f Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 24 Aug 2020 16:20:47 +0200 Subject: bpf, sysctl: Let bpf_stats_handler take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the signature of bpf_stats_handler to match ctl_table.proc_handler, which fixes the following sparse warning: kernel/sysctl.c:226:49: warning: incorrect type in argument 3 (different address spaces) kernel/sysctl.c:226:49: expected void * kernel/sysctl.c:226:49: got void [noderef] __user *buffer kernel/sysctl.c:2640:35: warning: incorrect type in initializer (incompatible argument 3 (different address spaces)) kernel/sysctl.c:2640:35: expected int ( [usertype] *proc_handler )( ... ) kernel/sysctl.c:2640:35: got int ( * )( ...
) Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Signed-off-by: Tobias Klauser Signed-off-by: Alexei Starovoitov Cc: Christoph Hellwig Link: https://lore.kernel.org/bpf/20200824142047.22043-1-tklauser@distanz.ch --- kernel/sysctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 287862f91717..09e70ee2332e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -204,8 +204,7 @@ static int max_extfrag_threshold = 1000; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int bpf_stats_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static int saved_val; -- cgit v1.2.3 From fddf9055a60dfcc97bda5ef03c8fa4108ed555c5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 20 Aug 2020 09:13:30 +0200 Subject: lockdep: Use raw_cpu_*() for per-cpu variables Sven reported that commit a21ee6055c30 ("lockdep: Change hardirq{s_enabled,_context} to per-cpu variables") caused trouble on s390 because their this_cpu_*() primitives disable preemption, which then lands back in tracing. On the one hand, per-cpu ops should use preempt_*able_notrace() and raw_local_irq_*(); on the other hand, we can trivially use raw_cpu_*() ops for this. Fixes: a21ee6055c30 ("lockdep: Change hardirq{s_enabled,_context} to per-cpu variables") Reported-by: Sven Schnelle Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Tested-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200821085348.192346882@infradead.org --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2fad21d345b0..c872e95e6e4d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3756,7 +3756,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) skip_checks: /* we'll do an OFF -> ON transition: */ - this_cpu_write(hardirqs_enabled, 1); + __this_cpu_write(hardirqs_enabled, 1); trace->hardirq_enable_ip = ip; trace->hardirq_enable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_on_events); @@ -3795,7 +3795,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip) /* * We have done an ON -> OFF transition: */ - this_cpu_write(hardirqs_enabled, 0); + __this_cpu_write(hardirqs_enabled, 0); trace->hardirq_disable_ip = ip; trace->hardirq_disable_event = ++trace->irq_events; debug_atomic_inc(hardirqs_off_events); -- cgit v1.2.3 From 1098582a0f6c4e8fd28da0a6305f9233d02c9c1d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Aug 2020 20:50:19 +0200 Subject: sched,idle,rcu: Push rcu_idle deeper into the idle path Lots of things take locks; due to a wee bug, rcu_lockdep didn't notice that the locking tracepoints were using RCU. Push rcu_idle_{enter,exit}() as deep as possible into the idle paths; this also resolves a lot of _rcuidle()/RCU_NONIDLE() usage. Specifically, sched_clock_idle_wakeup_event() will use ktime which will use seqlocks which will tickle lockdep, and stop_critical_timings() uses locks. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Thomas Gleixner Acked-by: Rafael J.
Wysocki Tested-by: Marco Elver Link: https://lkml.kernel.org/r/20200821085348.310943801@infradead.org --- kernel/sched/idle.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6bf34986f45c..ea3a0989dc4c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -54,17 +54,18 @@ __setup("hlt", cpu_idle_nopoll_setup); static noinline int __cpuidle cpu_idle_poll(void) { + trace_cpu_idle(0, smp_processor_id()); + stop_critical_timings(); rcu_idle_enter(); - trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - stop_critical_timings(); while (!tif_need_resched() && - (cpu_idle_force_poll || tick_check_broadcast_expired())) + (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); - start_critical_timings(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); + start_critical_timings(); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); return 1; } @@ -91,7 +92,9 @@ void __cpuidle default_idle_call(void) local_irq_enable(); } else { stop_critical_timings(); + rcu_idle_enter(); arch_cpu_idle(); + rcu_idle_exit(); start_critical_timings(); } } @@ -158,7 +161,6 @@ static void cpuidle_idle_call(void) if (cpuidle_not_available(drv, dev)) { tick_nohz_idle_stop_tick(); - rcu_idle_enter(); default_idle_call(); goto exit_idle; @@ -178,21 +180,17 @@ static void cpuidle_idle_call(void) u64 max_latency_ns; if (idle_should_enter_s2idle()) { - rcu_idle_enter(); entered_state = call_cpuidle_s2idle(drv, dev); if (entered_state > 0) goto exit_idle; - rcu_idle_exit(); - max_latency_ns = U64_MAX; } else { max_latency_ns = dev->forced_idle_latency_limit_ns; } tick_nohz_idle_stop_tick(); - rcu_idle_enter(); next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); @@ -209,8 +207,6 @@ static void cpuidle_idle_call(void) else tick_nohz_idle_retain_tick(); - rcu_idle_enter(); - entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome @@ -226,8 +222,6 @@ exit_idle: */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); - - rcu_idle_exit(); } /* -- cgit v1.2.3 From 9864f5b5943ab0f1f835f21dc3f9f068d06f5b52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Aug 2020 12:27:10 +0200 Subject: cpuidle: Move trace_cpu_idle() into generic code Remove trace_cpu_idle() from the arch_cpu_idle() implementations and put it in the generic code, right before disabling RCU. Gets rid of more trace_*_rcuidle() users. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Thomas Gleixner Acked-by: Rafael J. 
Wysocki Tested-by: Marco Elver Link: https://lkml.kernel.org/r/20200821085348.428433395@infradead.org --- kernel/sched/idle.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ea3a0989dc4c..f324dc36fc43 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -91,11 +91,14 @@ void __cpuidle default_idle_call(void) if (current_clr_polling_and_test()) { local_irq_enable(); } else { + + trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); rcu_idle_enter(); arch_cpu_idle(); rcu_idle_exit(); start_critical_timings(); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } } -- cgit v1.2.3 From eb1f00237aca2e368b93db79303f8433d1976d10 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Aug 2020 20:53:16 +0200 Subject: lockdep,trace: Expose tracepoints The lockdep tracepoints are under the lockdep recursion counter; this has a bunch of nasty side effects: - TRACE_IRQFLAGS doesn't work across the entire tracepoint - RCU-lockdep doesn't see the tracepoints either, hiding numerous "suspicious RCU usage" warnings. Pull the trace_lock_*() tracepoints completely out from under the lockdep recursion handling and completely rely on the trace level recursion handling -- also, tracing *SHOULD* not be taking locks in any case. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Tested-by: Marco Elver Link: https://lkml.kernel.org/r/20200821085348.782688941@infradead.org --- kernel/locking/lockdep.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c872e95e6e4d..54b74fabf40c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4977,6 +4977,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + if (unlikely(current->lockdep_recursion)) { /* XXX allow trylock from NMI ?!?
*/ if (lockdep_nmi() && !trylock) { @@ -5001,7 +5003,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, check_flags(flags); current->lockdep_recursion++; - trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0, 0); lockdep_recursion_finish(); @@ -5013,13 +5014,15 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_release(lock, ip); + if (unlikely(current->lockdep_recursion)) return; raw_local_irq_save(flags); check_flags(flags); + current->lockdep_recursion++; - trace_lock_release(lock, ip); if (__lock_release(lock, ip)) check_chain_key(current); lockdep_recursion_finish(); @@ -5205,8 +5208,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip); - stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) @@ -5225,6 +5226,8 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_acquired(lock, ip); + if (unlikely(!lock_stat || !debug_locks)) return; @@ -5234,7 +5237,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion++; - trace_lock_contended(lock, ip); __lock_contended(lock, ip); lockdep_recursion_finish(); raw_local_irq_restore(flags); @@ -5245,6 +5247,8 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_contended(lock, ip); + if (unlikely(!lock_stat || !debug_locks)) return; -- cgit v1.2.3 From 892fc9f6835ecf075efac20789b012c5c9997fcc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 26 Aug 2020 14:33:30 +0300 Subject: dma-pool: Fix an uninitialized variable bug in atomic_pool_expand() The "page" pointer can be used without being initialized. Fixes: d7e673ec2c8e ("dma-pool: Only allocate from CMA when in same memory zone") Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 06582b488e31..1281c0f0442b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -84,7 +84,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, gfp_t gfp) { unsigned int order; - struct page *page; + struct page *page = NULL; void *addr; int ret = -ENOMEM; -- cgit v1.2.3 From 784a0830377d0761834e385975bc46861fea9fa0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 30 Aug 2020 19:07:53 +0200 Subject: genirq/matrix: Deal with the sillyness of for_each_cpu() on UP Most of the CPU mask operations behave the same way, but for_each_cpu() and its variants ignore the cpumask argument and claim that CPU0 is always in the mask. This is historical, inconsistent and annoying behaviour. The matrix allocator uses for_each_cpu() and can be called on UP with an empty cpumask. The calling code does not expect that this succeeds, but until commit e027fffff799 ("x86/irq: Unbreak interrupt affinity setting") this went unnoticed. That commit added a WARN_ON() to catch cases which move an interrupt from one vector to another on the same CPU. The warning triggers on UP. Add a check for the cpumask being empty to prevent this.
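For reference, the UP stub of for_each_cpu() at the time had roughly this shape (paraphrased from include/linux/cpumask.h; treat the exact spelling as an assumption), which is why even an empty mask still yields CPU0:

	/* Historic UP definition, roughly: the mask is cast to void and
	 * the loop unconditionally visits CPU0 exactly once. */
	#define for_each_cpu(cpu, mask)			\
		for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)(mask))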
Fixes: 2f75d9e1c905 ("genirq: Implement bitmap matrix allocator") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/irq/matrix.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 30cc217b8631..651a4ad6d711 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -380,6 +380,13 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, unsigned int cpu, bit; struct cpumap *cm; + /* + * Not required in theory, but matrix_find_best_cpu() uses + * for_each_cpu() which ignores the cpumask on UP . + */ + if (cpumask_empty(msk)) + return -EINVAL; + cpu = matrix_find_best_cpu(m, msk); if (cpu == UINT_MAX) return -ENOSPC; -- cgit v1.2.3 From 4facb95b7adaf77e2da73aafb9ba60996fe42a12 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Sep 2020 01:50:54 +0200 Subject: x86/entry: Unbreak 32bit fast syscall Andy reported that the syscall tracing for 32bit fast syscall fails: # ./tools/testing/selftests/x86/ptrace_syscall_32 ... [RUN] SYSEMU [FAIL] Initial args are wrong (nr=224, args=10 11 12 13 14 4289172732) ... [RUN] SYSCALL [FAIL] Initial args are wrong (nr=29, args=0 0 0 0 0 4289172732) The reason is that the conversion to generic entry code moved the retrieval of the sixth argument (EBP) after the point where the syscall entry work runs, i.e. ptrace, seccomp, audit... Unbreak it by providing a split-up version of syscall_enter_from_user_mode(). - syscall_enter_from_user_mode_prepare() establishes state and enables interrupts - syscall_enter_from_user_mode_work() runs the entry work Replace the call to syscall_enter_from_user_mode() in the 32bit fast syscall C-entry with the split functions and stick the EBP retrieval between them. Fixes: 27d6b4d14f5c ("x86/entry: Use generic syscall entry function") Reported-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87k0xdjbtt.fsf@nanos.tec.linutronix.de --- kernel/entry/common.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index fcae019158ca..18683598edbc 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -69,22 +69,45 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret ?
        return ret ? : syscall_get_nr(current, regs);
 }

-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+static __always_inline long
+__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
 {
        unsigned long ti_work;

-       enter_from_user_mode(regs);
-       instrumentation_begin();
-
-       local_irq_enable();
        ti_work = READ_ONCE(current_thread_info()->flags);
        if (ti_work & SYSCALL_ENTER_WORK)
                syscall = syscall_trace_enter(regs, syscall, ti_work);
-       instrumentation_end();

        return syscall;
 }

+long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
+{
+       return __syscall_enter_from_user_work(regs, syscall);
+}
+
+noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+       long ret;
+
+       enter_from_user_mode(regs);
+
+       instrumentation_begin();
+       local_irq_enable();
+       ret = __syscall_enter_from_user_work(regs, syscall);
+       instrumentation_end();
+
+       return ret;
+}
+
+noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+{
+       enter_from_user_mode(regs);
+       instrumentation_begin();
+       local_irq_enable();
+       instrumentation_end();
+}
+
 /**
  * exit_to_user_mode - Fixup state when exiting to user mode
  *

-- cgit v1.2.3

From cfc905f158eaa099d6258031614d11869e7ef71c Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Fri, 4 Sep 2020 18:58:08 +0300
Subject: gcov: Disable gcov build with GCC 10

GCOV built with GCC 10 doesn't initialize the n_function variable. This
produces different kernel panics, as was seen by Colin in Ubuntu and by
me in FC 32.

As a workaround, let's disable the GCOV build for the broken GCC 10
version.

Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1891288
Link: https://lore.kernel.org/lkml/20200827133932.3338519-1-leon@kernel.org
Link: https://lore.kernel.org/lkml/CAHk-=whbijeSdSvx-Xcr0DPMj0BiwhJ+uiNnDSVZcr_h_kg7UA@mail.gmail.com/
Cc: Colin Ian King
Signed-off-by: Leon Romanovsky
Signed-off-by: Linus Torvalds
---
 kernel/gcov/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3110c77230c7..bb4b680e8455 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -4,6 +4,7 @@ menu "GCOV-based kernel profiling"
 config GCOV_KERNEL
 	bool "Enable gcov-based kernel profiling"
 	depends on DEBUG_FS
+	depends on !CC_IS_GCC || GCC_VERSION < 100000
 	select CONSTRUCTORS if !UML
 	default n
 	help

-- cgit v1.2.3

From b0daa2c73f3a8ab9183a9d298495b583b6c1bd19 Mon Sep 17 00:00:00 2001
From: Tobias Klauser
Date: Fri, 4 Sep 2020 16:35:49 -0700
Subject: fork: adjust sysctl_max_threads definition to match prototype

Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler")
changed ctl_table.proc_handler to take a kernel pointer. Adjust the
definition of sysctl_max_threads to match its prototype in
linux/sysctl.h, which fixes the following sparse error/warning:

kernel/fork.c:3050:47: warning: incorrect type in argument 3 (different address spaces)
kernel/fork.c:3050:47:    expected void *
kernel/fork.c:3050:47:    got void [noderef] __user *buffer
kernel/fork.c:3036:5: error: symbol 'sysctl_max_threads' redeclared with different type (incompatible argument 3 (different address spaces)):
kernel/fork.c:3036:5:    int extern [addressable] [signed] [toplevel] sysctl_max_threads( ... )
kernel/fork.c: note: in included file (through include/linux/key.h, include/linux/cred.h, include/linux/sched/signal.h, include/linux/sched/cputime.h):
include/linux/sysctl.h:242:5: note: previously declared as:
include/linux/sysctl.h:242:5:    int extern [addressable] [signed] [toplevel] sysctl_max_threads( ... )

Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler")
Signed-off-by: Tobias Klauser
Signed-off-by: Andrew Morton
Cc: Christoph Hellwig
Cc: Al Viro
Link: https://lkml.kernel.org/r/20200825093647.24263-1-tklauser@distanz.ch
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 4d32190861bd..49677d668de4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3014,7 +3014,7 @@ int unshare_files(struct files_struct **displaced)
 }

 int sysctl_max_threads(struct ctl_table *table, int write,
-                      void __user *buffer, size_t *lenp, loff_t *ppos)
+                      void *buffer, size_t *lenp, loff_t *ppos)
 {
        struct ctl_table t;
        int ret;

-- cgit v1.2.3

From cd1752d34ef33d68d82ef9dcc699b4eaa17c07fc Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Wed, 26 Aug 2020 18:37:50 +0100
Subject: genirq: Walk the irq_data hierarchy when resending an interrupt

On resending an interrupt, we only check the outermost irqchip for an
irq_retrigger callback. However, this callback could be implemented at
an inner level. Use irq_chip_retrigger_hierarchy() in this case.

Reviewed-by: Valentin Schneider
Signed-off-by: Marc Zyngier
---
 kernel/irq/resend.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index c48ce19a257f..8ccd32a0cc80 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -86,6 +86,18 @@ static int irq_sw_resend(struct irq_desc *desc)
 }
 #endif

+static int try_retrigger(struct irq_desc *desc)
+{
+       if (desc->irq_data.chip->irq_retrigger)
+               return desc->irq_data.chip->irq_retrigger(&desc->irq_data);
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+       return irq_chip_retrigger_hierarchy(&desc->irq_data);
+#else
+       return 0;
+#endif
+}
+
 /*
  * IRQ resend
  *
@@ -113,8 +125,7 @@ int check_irq_resend(struct irq_desc *desc, bool inject)

        desc->istate &= ~IRQS_PENDING;

-       if (!desc->irq_data.chip->irq_retrigger ||
-           !desc->irq_data.chip->irq_retrigger(&desc->irq_data))
+       if (!try_retrigger(desc))
                err = irq_sw_resend(desc);

        /* If the retrigger was successfull, mark it with the REPLAY bit */

-- cgit v1.2.3

From c5e5ec033c4ab25c53f1fd217849e75deb0bf7bf Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Tue, 19 May 2020 10:41:00 +0100
Subject: genirq: Add fasteoi IPI flow

For irqchips using the fasteoi flow, IPIs are a bit special. They need
to be EOI'd early (before calling the handler), as funny things may
happen in the handler (they do not necessarily behave like a normal
interrupt).
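To illustrate how this flow is meant to be used, an irqchip driver could route its IPIs to it from the domain mapping path roughly as below. This is a sketch only: foo_chip, FOO_NR_IPIS and foo_irq_domain_map() are placeholder names, not part of this patch.

/* Sketch: IPI hwirqs get the early-EOI flow, other per-CPU IRQs the normal one */
static int foo_irq_domain_map(struct irq_domain *d, unsigned int irq,
                              irq_hw_number_t hw)
{
        irq_set_percpu_devid(irq);
        irq_domain_set_info(d, irq, hw, &foo_chip, d->host_data,
                            hw < FOO_NR_IPIS ? handle_percpu_devid_fasteoi_ipi
                                             : handle_percpu_devid_irq,
                            NULL, NULL);
        return 0;
}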
Reviewed-by: Valentin Schneider
Signed-off-by: Marc Zyngier
---
 kernel/irq/chip.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 857f5f4c8098..bed517d5aa4d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -944,6 +944,33 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
                chip->irq_eoi(&desc->irq_data);
 }

+/**
+ * handle_percpu_devid_fasteoi_ipi - Per CPU local IPI handler with per cpu
+ *                                   dev ids
+ * @desc:      the interrupt description structure for this irq
+ *
+ * The biggest difference with the IRQ version is that the interrupt is
+ * EOIed early, as the IPI could result in a context switch, and we need to
+ * make sure the IPI can fire again. We also assume that the arch code has
+ * registered an action. If not, we are positively doomed.
+ */
+void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc)
+{
+       struct irq_chip *chip = irq_desc_get_chip(desc);
+       struct irqaction *action = desc->action;
+       unsigned int irq = irq_desc_get_irq(desc);
+       irqreturn_t res;
+
+       __kstat_incr_irqs_this_cpu(desc);
+
+       if (chip->irq_eoi)
+               chip->irq_eoi(&desc->irq_data);
+
+       trace_irq_handler_entry(irq, action);
+       res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
+       trace_irq_handler_exit(irq, action, res);
+}
+
 /**
  * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu
  *                                   dev ids

-- cgit v1.2.3

From 83cfac95c01817819c2a51f0931d798d851f8a08 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Tue, 19 May 2020 14:58:13 +0100
Subject: genirq: Allow interrupts to be excluded from /proc/interrupts

A number of architectures implement IPI statistics directly, duplicating
the core kstat_irqs accounting. As we move IPIs to being actual IRQs, we
would end up with a confusing display in /proc/interrupts (where the IPIs
would appear twice).

In order to solve this, allow interrupts to be flagged as "hidden", which
excludes them from /proc/interrupts.
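A user of the new flag would mark the IPIs it accounts for itself, for example (sketch; ipi_base, nr_ipi and the surrounding function are illustrative, not part of this patch):

        unsigned int i;

        /*
         * The arch code keeps its own IPI statistics, so hide these
         * interrupts from /proc/interrupts to avoid a double listing.
         */
        for (i = 0; i < nr_ipi; i++)
                irq_set_status_flags(ipi_base + i, IRQ_HIDDEN);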
Reviewed-by: Valentin Schneider
Signed-off-by: Marc Zyngier
---
 kernel/irq/debugfs.c  | 1 +
 kernel/irq/proc.c     | 2 +-
 kernel/irq/settings.h | 7 +++++++
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index b95ff5d5f4bd..acabc0c0e46b 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -136,6 +136,7 @@ static const struct irq_bit_descr irqdesc_states[] = {
        BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID),
        BIT_MASK_DESCR(_IRQ_IS_POLLED),
        BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY),
+       BIT_MASK_DESCR(_IRQ_HIDDEN),
 };

 static const struct irq_bit_descr irqdesc_istates[] = {
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 32c071d7bc03..72513ed2a5fc 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -485,7 +485,7 @@ int show_interrupts(struct seq_file *p, void *v)

        rcu_read_lock();
        desc = irq_to_desc(i);
-       if (!desc)
+       if (!desc || irq_settings_is_hidden(desc))
                goto outsparse;

        if (desc->kstat_irqs)
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index e43795cd2ccf..403378b9947b 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -17,6 +17,7 @@ enum {
        _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
        _IRQ_IS_POLLED          = IRQ_IS_POLLED,
        _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
+       _IRQ_HIDDEN             = IRQ_HIDDEN,
        _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
 };

@@ -31,6 +32,7 @@ enum {
 #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
 #define IRQ_IS_POLLED          GOT_YOU_MORON
 #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
+#define IRQ_HIDDEN             GOT_YOU_MORON
 #undef IRQF_MODIFY_MASK
 #define IRQF_MODIFY_MASK       GOT_YOU_MORON

@@ -167,3 +169,8 @@ static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc)
 {
        desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY;
 }
+
+static inline bool irq_settings_is_hidden(struct irq_desc *desc)
+{
+       return desc->status_use_accessors & _IRQ_HIDDEN;
+}

-- cgit v1.2.3

From 90428a8eb4947f9c7c905a178f3520dc7e2ee6d2 Mon Sep 17 00:00:00 2001
From: Maulik Shah
Date: Mon, 28 Sep 2020 10:02:01 +0530
Subject: genirq/PM: Introduce IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND flag

An interrupt that is disabled/masked but set for wakeup may still need
to be able to wake up the system from sleep states like "suspend to RAM".

To that effect, introduce the IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND flag. If
the irqchip has this flag set, the irq PM code will enable/unmask the
irqs that are marked for wakeup, but that are in a disabled state. On
resume, such irqs will be restored to their disabled state.
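An irqchip opts in by setting the new flag, and a consumer arms wakeup as usual with enable_irq_wake(). A sketch with placeholder foo_* callbacks (illustrative only, not taken from this patch):

static void foo_mask(struct irq_data *d);                      /* placeholder */
static void foo_unmask(struct irq_data *d);                    /* placeholder */
static int foo_set_wake(struct irq_data *d, unsigned int on);  /* placeholder */

static struct irq_chip foo_irq_chip = {
        .name           = "foo",
        .irq_mask       = foo_mask,
        .irq_unmask     = foo_unmask,
        .irq_set_wake   = foo_set_wake,
        .flags          = IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND,
};

/*
 * In a client driver: the line may well be disabled when
 * suspend_device_irqs() runs, but marking it for wakeup lets the PM
 * core unmask it across suspend.
 */
ret = enable_irq_wake(client_irq);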
Suggested-by: Thomas Gleixner
Signed-off-by: Maulik Shah
[maz: commit message fix-up]
Signed-off-by: Marc Zyngier
Tested-by: Stephen Boyd
Reviewed-by: Thomas Gleixner
Reviewed-by: Douglas Anderson
Link: https://lore.kernel.org/r/1601267524-20199-4-git-send-email-mkshah@codeaurora.org
---
 kernel/irq/debugfs.c |  3 +++
 kernel/irq/pm.c      | 34 ++++++++++++++++++++++++++++++----
 2 files changed, 33 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index b95ff5d5f4bd..b6e10945e8be 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -57,6 +57,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
        BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
        BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
        BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
+       BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND),
 };

 static void
@@ -125,6 +126,8 @@ static const struct irq_bit_descr irqdata_states[] = {
        BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET),

        BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
+
+       BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND),
 };

 static const struct irq_bit_descr irqdesc_states[] = {
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index c6c7e187ae74..ce0adb22ee96 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -69,12 +69,26 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)

 static bool suspend_device_irq(struct irq_desc *desc)
 {
+       unsigned long chipflags = irq_desc_get_chip(desc)->flags;
+       struct irq_data *irqd = &desc->irq_data;
+
        if (!desc->action || irq_desc_is_chained(desc) ||
            desc->no_suspend_depth)
                return false;

-       if (irqd_is_wakeup_set(&desc->irq_data)) {
-               irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+       if (irqd_is_wakeup_set(irqd)) {
+               irqd_set(irqd, IRQD_WAKEUP_ARMED);
+
+               if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) &&
+                   irqd_irq_disabled(irqd)) {
+                       /*
+                        * Interrupt marked for wakeup is in disabled state.
+                        * Enable interrupt here to unmask/enable in irqchip
+                        * to be able to resume with such interrupts.
+                        */
+                       __enable_irq(desc);
+                       irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND);
+               }
                /*
                 * We return true here to force the caller to issue
                 * synchronize_irq(). We need to make sure that the
@@ -93,7 +107,7 @@ static bool suspend_device_irq(struct irq_desc *desc)
         * chip level. The chip implementation indicates that with
         * IRQCHIP_MASK_ON_SUSPEND.
         */
-       if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+       if (chipflags & IRQCHIP_MASK_ON_SUSPEND)
                mask_irq(desc);
        return true;
 }
@@ -137,7 +151,19 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);

 static void resume_irq(struct irq_desc *desc)
 {
-       irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+       struct irq_data *irqd = &desc->irq_data;
+
+       irqd_clear(irqd, IRQD_WAKEUP_ARMED);
+
+       if (irqd_is_enabled_on_suspend(irqd)) {
+               /*
+                * Interrupt marked for wakeup was enabled during suspend
+                * entry. Disable such interrupts to restore them back to
+                * original state.
+                */
+               __disable_irq(desc);
+               irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND);
+       }

        if (desc->istate & IRQS_SUSPENDED)
                goto resume;

-- cgit v1.2.3

From 55567976629e58fde28fb70612ca73228271eef2 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Tue, 6 Oct 2020 10:10:20 +0100
Subject: genirq/irqdomain: Allow partial trimming of irq_data hierarchy

It appears that some HW is ugly enough that not all the interrupts
connected to a particular interrupt controller end up with the same
hierarchy depth (some of them are terminated early).
This leaves the irqchip hacker with only two choices, both equally bad:

- create discrete domain chains, one for each "hierarchy depth", which
  is very hard to maintain

- create fake hierarchy levels for the shallow paths, leading to all
  kinds of problems (what are the safe hwirq values for these fake
  levels?)

Implement the ability to cut short a single interrupt hierarchy from a
level marked as being disconnected by using the new
irq_domain_disconnect_hierarchy() helper. The irqdomain allocation code
will then perform the trimming.

Signed-off-by: Marc Zyngier
---
 kernel/irq/irqdomain.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 76cd7ebd1178..cf8b374b892d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1136,6 +1136,17 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
        return irq_data;
 }

+static void __irq_domain_free_hierarchy(struct irq_data *irq_data)
+{
+       struct irq_data *tmp;
+
+       while (irq_data) {
+               tmp = irq_data;
+               irq_data = irq_data->parent_data;
+               kfree(tmp);
+       }
+}
+
 static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
 {
        struct irq_data *irq_data, *tmp;
@@ -1147,12 +1158,83 @@ static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
                irq_data->parent_data = NULL;
                irq_data->domain = NULL;

-               while (tmp) {
-                       irq_data = tmp;
-                       tmp = tmp->parent_data;
-                       kfree(irq_data);
+               __irq_domain_free_hierarchy(tmp);
+       }
+}
+
+/**
+ * irq_domain_disconnect_hierarchy - Mark the first unused level of a hierarchy
+ * @domain:    IRQ domain from which the hierarchy is to be disconnected
+ * @virq:      IRQ number where the hierarchy is to be trimmed
+ *
+ * Marks the @virq level belonging to @domain as disconnected.
+ * Returns -EINVAL if @virq doesn't have a valid irq_data pointing
+ * to @domain.
+ *
+ * Its only use is to be able to trim levels of hierarchy that do not
+ * have any real meaning for this interrupt, and that the driver marks
+ * as such from its .alloc() callback.
+ */
+int irq_domain_disconnect_hierarchy(struct irq_domain *domain,
+                                   unsigned int virq)
+{
+       struct irq_data *irqd;
+
+       irqd = irq_domain_get_irq_data(domain, virq);
+       if (!irqd)
+               return -EINVAL;
+
+       irqd->chip = ERR_PTR(-ENOTCONN);
+       return 0;
+}
+
+static int irq_domain_trim_hierarchy(unsigned int virq)
+{
+       struct irq_data *tail, *irqd, *irq_data;
+
+       irq_data = irq_get_irq_data(virq);
+       tail = NULL;
+
+       /* The first entry must have a valid irqchip */
+       if (!irq_data->chip || IS_ERR(irq_data->chip))
+               return -EINVAL;
+
+       /*
+        * Validate that the irq_data chain is sane in the presence of
+        * a hierarchy trimming marker.
+        */
+       for (irqd = irq_data->parent_data; irqd; irq_data = irqd, irqd = irqd->parent_data) {
+               /* Can't have a valid irqchip after a trim marker */
+               if (irqd->chip && tail)
+                       return -EINVAL;
+
+               /* Can't have an empty irqchip before a trim marker */
+               if (!irqd->chip && !tail)
+                       return -EINVAL;
+
+               if (IS_ERR(irqd->chip)) {
+                       /* Only -ENOTCONN is a valid trim marker */
+                       if (PTR_ERR(irqd->chip) != -ENOTCONN)
+                               return -EINVAL;
+
+                       tail = irq_data;
                }
        }
+
+       /* No trim marker, nothing to do */
+       if (!tail)
+               return 0;
+
+       pr_info("IRQ%d: trimming hierarchy from %s\n",
+               virq, tail->parent_data->domain->name);
+
+       /* Sever the inner part of the hierarchy...  */
+       irqd = tail;
+       tail = tail->parent_data;
+       irqd->parent_data = NULL;
+       __irq_domain_free_hierarchy(tail);
+
+       return 0;
 }

 static int irq_domain_alloc_irq_data(struct irq_domain *domain,
@@ -1362,6 +1444,15 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
                mutex_unlock(&irq_domain_mutex);
                goto out_free_irq_data;
        }
+
+       for (i = 0; i < nr_irqs; i++) {
+               ret = irq_domain_trim_hierarchy(virq + i);
+               if (ret) {
+                       mutex_unlock(&irq_domain_mutex);
+                       goto out_free_irq_data;
+               }
+       }
+
        for (i = 0; i < nr_irqs; i++)
                irq_domain_insert_irq(virq + i);
        mutex_unlock(&irq_domain_mutex);

-- cgit v1.2.3

From 151a535171be6ff824a0a3875553ea38570f4c05 Mon Sep 17 00:00:00 2001
From: Marc Zyngier
Date: Thu, 15 Oct 2020 21:41:44 +0100
Subject: genirq: Let GENERIC_IRQ_IPI select IRQ_DOMAIN_HIERARCHY

kernel/irq/ipi.c otherwise fails to compile if nothing else selects it.

Fixes: 379b656446a3 ("genirq: Add GENERIC_IRQ_IPI Kconfig symbol")
Reported-by: Pavel Machek
Tested-by: Pavel Machek
Signed-off-by: Marc Zyngier
Link: https://lore.kernel.org/r/20201015101222.GA32747@amd
---
 kernel/irq/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 10a5aff4eecc..164a031cfdb6 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -82,6 +82,7 @@ config IRQ_FASTEOI_HIERARCHY_HANDLERS
 # Generic IRQ IPI support
 config GENERIC_IRQ_IPI
 	bool
+	select IRQ_DOMAIN_HIERARCHY

 # Generic MSI interrupt support
 config GENERIC_MSI_IRQ

-- cgit v1.2.3
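Returning to the irq_data hierarchy trimming above: the intended use is for a driver's .alloc() callback to mark the unused upper level when a given line terminates early. A sketch under assumed foo_* helpers (illustrative only; the real decision logic is hardware specific):

static int foo_domain_alloc(struct irq_domain *domain, unsigned int virq,
                            unsigned int nr_irqs, void *data)
{
        irq_hw_number_t hwirq = foo_spec_to_hwirq(data);        /* hypothetical */

        irq_domain_set_info(domain, virq, hwirq, &foo_chip,
                            domain->host_data, handle_level_irq, NULL, NULL);

        /*
         * This line terminates at this level: mark the parent level as
         * disconnected so that the core trims the unused hierarchy.
         */
        if (!foo_hwirq_has_parent(hwirq))
                return irq_domain_disconnect_hierarchy(domain->parent, virq);

        return irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, data);
}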