From f14c234b4bc5184fd40d9a47830e5b32c3b36d49 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:53 +1000 Subject: clone3: switch to copy_struct_from_user() Switch clone3() syscall from it's own copying struct clone_args from userspace to the new dedicated copy_struct_from_user() helper. The change is very straightforward, and helps unify the syscall interface for struct-from-userspace syscalls. Additionally, explicitly define CLONE_ARGS_SIZE_VER0 to match the other users of the struct-extension pattern. Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner [christian.brauner@ubuntu.com: improve commit message] Link: https://lore.kernel.org/r/20191001011055.19283-3-cyphar@cyphar.com Signed-off-by: Christian Brauner --- kernel/fork.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f9572f416126..2ef529869c64 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2525,39 +2525,19 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #ifdef __ARCH_WANT_SYS_CLONE3 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, - size_t size) + size_t usize) { + int err; struct clone_args args; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(usize > PAGE_SIZE)) return -E2BIG; - - if (unlikely(size < sizeof(struct clone_args))) + if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) return -EINVAL; - if (unlikely(!access_ok(uargs, size))) - return -EFAULT; - - if (size > sizeof(struct clone_args)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uargs + sizeof(struct clone_args); - end = (void __user *)uargs + size; - - for (; addr < end; addr++) { - if (get_user(val, addr)) - return -EFAULT; - if (val) - return -E2BIG; - } - - size = sizeof(struct clone_args); - } - - if (copy_from_user(&args, uargs, size)) - return -EFAULT; + err = copy_struct_from_user(&args, sizeof(args), uargs, usize); + if (err) + return err; /* * Verify that higher 32bits of exit_signal are unset and that -- cgit v1.2.3 From dff3a85feceade213daf153556fd32a3b0a73c63 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:54 +1000 Subject: sched_setattr: switch to copy_struct_from_user() Switch sched_setattr() syscall from it's own copying struct sched_attr from userspace to the new dedicated copy_struct_from_user() helper. The change is very straightforward, and helps unify the syscall interface for struct-from-userspace syscalls. Ideally we could also unify sched_getattr(2)-style syscalls as well, but unfortunately the correct semantics for such syscalls are much less clear (see [1] for more detail). In future we could come up with a more sane idea for how the syscall interface should look. [1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code") Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner [christian.brauner@ubuntu.com: improve commit message] Link: https://lore.kernel.org/r/20191001011055.19283-4-cyphar@cyphar.com Signed-off-by: Christian Brauner --- kernel/sched/core.c | 43 +++++++------------------------------------ 1 file changed, 7 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7880f4f64d0e..dd05a378631a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5106,9 +5106,6 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a u32 size; int ret; - if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0)) - return -EFAULT; - /* Zero the full structure, so that a short copy will be nice: */ memset(attr, 0, sizeof(*attr)); @@ -5116,45 +5113,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a if (ret) return ret; - /* Bail out on silly large: */ - if (size > PAGE_SIZE) - goto err_size; - /* ABI compatibility quirk: */ if (!size) size = SCHED_ATTR_SIZE_VER0; - - if (size < SCHED_ATTR_SIZE_VER0) + if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && size < SCHED_ATTR_SIZE_VER1) return -EINVAL; @@ -5354,7 +5325,7 @@ sched_attr_copy_to_user(struct sched_attr __user *uattr, * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. - * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility. + * @usize: sizeof(attr) for fwd/bwd comp. * @flags: for future extension. */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -- cgit v1.2.3 From c2ba8f41ad366dc12840dd87d8f1565185845126 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:55 +1000 Subject: perf_event_open: switch to copy_struct_from_user() Switch perf_event_open() syscall from it's own copying struct perf_event_attr from userspace to the new dedicated copy_struct_from_user() helper. The change is very straightforward, and helps unify the syscall interface for struct-from-userspace syscalls. Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner [christian.brauner@ubuntu.com: improve commit message] Link: https://lore.kernel.org/r/20191001011055.19283-5-cyphar@cyphar.com Signed-off-by: Christian Brauner --- kernel/events/core.c | 47 +++++++++-------------------------------------- 1 file changed, 9 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4655adbbae10..3f0cb82e4fbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10586,55 +10586,26 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, u32 size; int ret; - if (!access_ok(uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ + /* Zero the full structure, so that a short copy will be nice. */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ + /* ABI compatibility quirk: */ + if (!size) size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) + if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - attr->size = size; if (attr->__reserved_1) -- cgit v1.2.3 From 501bd0166eb949820c8e4204e62aaee5c89c84d9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 27 Sep 2019 17:28:42 +0200 Subject: fork: add kernel-doc for clone3 Add kernel-doc for the clone3() syscall. Link: https://lore.kernel.org/r/20191001114701.24661-2-christian.brauner@ubuntu.com Signed-off-by: Christian Brauner --- kernel/fork.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f9572f416126..bf11cf39579a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2604,6 +2604,17 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs) return true; } +/** + * clone3 - create a new process with specific properties + * @uargs: argument structure + * @size: size of @uargs + * + * clone3() is the extensible successor to clone()/clone2(). + * It takes a struct as argument that is versioned by its size. + * + * Return: On success, a positive PID for the child process. + * On error, a negative errno number. + */ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) { int err; -- cgit v1.2.3 From 86cdd2fdc4e39c388d39c7ba2396d1a9dfd66226 Mon Sep 17 00:00:00 2001 From: Dmitry Goldin Date: Fri, 4 Oct 2019 10:40:07 +0000 Subject: kheaders: make headers archive reproducible In commit 43d8ce9d65a5 ("Provide in-kernel headers to make extending kernel easier") a new mechanism was introduced, for kernels >=5.2, which embeds the kernel headers in the kernel image or a module and exposes them in procfs for use by userland tools. The archive containing the header files has nondeterminism caused by header files metadata. This patch normalizes the metadata and utilizes KBUILD_BUILD_TIMESTAMP if provided and otherwise falls back to the default behaviour. In commit f7b101d33046 ("kheaders: Move from proc to sysfs") it was modified to use sysfs and the script for generation of the archive was renamed to what is being patched. Signed-off-by: Dmitry Goldin Reviewed-by: Greg Kroah-Hartman Reviewed-by: Joel Fernandes (Google) Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 9ff449888d9c..aff79e461fc9 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -71,7 +71,10 @@ done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 find $cpio_dir -type f -print0 | xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' -tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null +# Create archive and try to normalize metadata for reproducibility +tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ + --owner=0 --group=0 --sort=name --numeric-owner \ + -Jcf $tarfile -C $cpio_dir/ . > /dev/null echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 -- cgit v1.2.3 From 2cf2aa6a69db0b17b3979144287af8775c1c1534 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Sat, 5 Oct 2019 10:23:30 +0200 Subject: dma-mapping: fix false positivse warnings in dma_common_free_remap() Commit 5cf4537975bb ("dma-mapping: introduce a dma_common_find_pages helper") changed invalid input check in dma_common_free_remap() from: if (!area || !area->flags != VM_DMA_COHERENT) to if (!area || !area->flags != VM_DMA_COHERENT || !area->pages) which seem to produce false positives for memory obtained via dma_common_contiguous_remap() This triggers the following warning message when doing "reboot" on ZII VF610 Dev Board Rev B: WARNING: CPU: 0 PID: 1 at kernel/dma/remap.c:112 dma_common_free_remap+0x88/0x8c trying to free invalid coherent area: 9ef82980 Modules linked in: CPU: 0 PID: 1 Comm: systemd-shutdow Not tainted 5.3.0-rc6-next-20190820 #119 Hardware name: Freescale Vybrid VF5xx/VF6xx (Device Tree) Backtrace: [<8010d1ec>] (dump_backtrace) from [<8010d588>] (show_stack+0x20/0x24) r7:8015ed78 r6:00000009 r5:00000000 r4:9f4d9b14 [<8010d568>] (show_stack) from [<8077e3f0>] (dump_stack+0x24/0x28) [<8077e3cc>] (dump_stack) from [<801197a0>] (__warn.part.3+0xcc/0xe4) [<801196d4>] (__warn.part.3) from [<80119830>] (warn_slowpath_fmt+0x78/0x94) r6:00000070 r5:808e540c r4:81c03048 [<801197bc>] (warn_slowpath_fmt) from [<8015ed78>] (dma_common_free_remap+0x88/0x8c) r3:9ef82980 r2:808e53e0 r7:00001000 r6:a0b1e000 r5:a0b1e000 r4:00001000 [<8015ecf0>] (dma_common_free_remap) from [<8010fa9c>] (remap_allocator_free+0x60/0x68) r5:81c03048 r4:9f4d9b78 [<8010fa3c>] (remap_allocator_free) from [<801100d0>] (__arm_dma_free.constprop.3+0xf8/0x148) r5:81c03048 r4:9ef82900 [<8010ffd8>] (__arm_dma_free.constprop.3) from [<80110144>] (arm_dma_free+0x24/0x2c) r5:9f563410 r4:80110120 [<80110120>] (arm_dma_free) from [<8015d80c>] (dma_free_attrs+0xa0/0xdc) [<8015d76c>] (dma_free_attrs) from [<8020f3e4>] (dma_pool_destroy+0xc0/0x154) r8:9efa8860 r7:808f02f0 r6:808f02d0 r5:9ef82880 r4:9ef82780 [<8020f324>] (dma_pool_destroy) from [<805525d0>] (ehci_mem_cleanup+0x6c/0x150) r7:9f563410 r6:9efa8810 r5:00000000 r4:9efd0148 [<80552564>] (ehci_mem_cleanup) from [<80558e0c>] (ehci_stop+0xac/0xc0) r5:9efd0148 r4:9efd0000 [<80558d60>] (ehci_stop) from [<8053c4bc>] (usb_remove_hcd+0xf4/0x1b0) r7:9f563410 r6:9efd0074 r5:81c03048 r4:9efd0000 [<8053c3c8>] (usb_remove_hcd) from [<8056361c>] (host_stop+0x48/0xb8) r7:9f563410 r6:9efd0000 r5:9f5f4040 r4:9f5f5040 [<805635d4>] (host_stop) from [<80563d0c>] (ci_hdrc_host_destroy+0x34/0x38) r7:9f563410 r6:9f5f5040 r5:9efa8800 r4:9f5f4040 [<80563cd8>] (ci_hdrc_host_destroy) from [<8055ef18>] (ci_hdrc_remove+0x50/0x10c) [<8055eec8>] (ci_hdrc_remove) from [<804a2ed8>] (platform_drv_remove+0x34/0x4c) r7:9f563410 r6:81c4f99c r5:9efa8810 r4:9efa8810 [<804a2ea4>] (platform_drv_remove) from [<804a18a8>] (device_release_driver_internal+0xec/0x19c) r5:00000000 r4:9efa8810 [<804a17bc>] (device_release_driver_internal) from [<804a1978>] (device_release_driver+0x20/0x24) r7:9f563410 r6:81c41ed0 r5:9efa8810 r4:9f4a1dac [<804a1958>] (device_release_driver) from [<804a01b8>] (bus_remove_device+0xdc/0x108) [<804a00dc>] (bus_remove_device) from [<8049c204>] (device_del+0x150/0x36c) r7:9f563410 r6:81c03048 r5:9efa8854 r4:9efa8810 [<8049c0b4>] (device_del) from [<804a3368>] (platform_device_del.part.2+0x20/0x84) r10:9f563414 r9:809177e0 r8:81cb07dc r7:81c78320 r6:9f563454 r5:9efa8800 r4:9efa8800 [<804a3348>] (platform_device_del.part.2) from [<804a3420>] (platform_device_unregister+0x28/0x34) r5:9f563400 r4:9efa8800 [<804a33f8>] (platform_device_unregister) from [<8055dce0>] (ci_hdrc_remove_device+0x1c/0x30) r5:9f563400 r4:00000001 [<8055dcc4>] (ci_hdrc_remove_device) from [<805652ac>] (ci_hdrc_imx_remove+0x38/0x118) r7:81c78320 r6:9f563454 r5:9f563410 r4:9f541010 [<8056538c>] (ci_hdrc_imx_shutdown) from [<804a2970>] (platform_drv_shutdown+0x2c/0x30) [<804a2944>] (platform_drv_shutdown) from [<8049e4fc>] (device_shutdown+0x158/0x1f0) [<8049e3a4>] (device_shutdown) from [<8013ac80>] (kernel_restart_prepare+0x44/0x48) r10:00000058 r9:9f4d8000 r8:fee1dead r7:379ce700 r6:81c0b280 r5:81c03048 r4:00000000 [<8013ac3c>] (kernel_restart_prepare) from [<8013ad14>] (kernel_restart+0x1c/0x60) [<8013acf8>] (kernel_restart) from [<8013af84>] (__do_sys_reboot+0xe0/0x1d8) r5:81c03048 r4:00000000 [<8013aea4>] (__do_sys_reboot) from [<8013b0ec>] (sys_reboot+0x18/0x1c) r8:80101204 r7:00000058 r6:00000000 r5:00000000 r4:00000000 [<8013b0d4>] (sys_reboot) from [<80101000>] (ret_fast_syscall+0x0/0x54) Exception stack(0x9f4d9fa8 to 0x9f4d9ff0) 9fa0: 00000000 00000000 fee1dead 28121969 01234567 379ce700 9fc0: 00000000 00000000 00000000 00000058 00000000 00000000 00000000 00016d04 9fe0: 00028e0c 7ec87c64 000135ec 76c1f410 Restore original invalid input check in dma_common_free_remap() to avoid this problem. Fixes: 5cf4537975bb ("dma-mapping: introduce a dma_common_find_pages helper") Signed-off-by: Andrey Smirnov [hch: just revert the offending hunk instead of creating a new helper] Signed-off-by: Christoph Hellwig --- kernel/dma/remap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index ca4e5d44b571..c00b9258fa6a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -87,9 +87,9 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, */ void dma_common_free_remap(void *cpu_addr, size_t size) { - struct page **pages = dma_common_find_pages(cpu_addr); + struct vm_struct *area = find_vm_area(cpu_addr); - if (!pages) { + if (!area || area->flags != VM_DMA_COHERENT) { WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); return; } -- cgit v1.2.3 From 0e48f51cbbfbdb79149806de14dcb8bf0f01ca94 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Fri, 4 Oct 2019 13:00:25 +0300 Subject: Revert "libata, freezer: avoid block device removal while system is frozen" This reverts commit 85fbd722ad0f5d64d1ad15888cd1eb2188bfb557. The commit was added as a quick band-aid for a hang that happened when a block device was removed during system suspend. Now that bdi_wq is not freezable anymore the hang should not be possible and we can get rid of this hack by reverting it. Acked-by: Rafael J. Wysocki Signed-off-by: Mika Westerberg Signed-off-by: Jens Axboe --- kernel/freezer.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index c0738424bb43..dc520f01f99d 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -22,12 +22,6 @@ EXPORT_SYMBOL(system_freezing_cnt); bool pm_freezing; bool pm_nosig_freezing; -/* - * Temporary export for the deadlock workaround in ata_scsi_hotplug(). - * Remove once the hack becomes unnecessary. - */ -EXPORT_SYMBOL_GPL(pm_freezing); - /* protects freezing and frozen transitions */ static DEFINE_SPINLOCK(freezer_lock); -- cgit v1.2.3 From f733c6b508bcaa3441ba1eacf16efb9abd47489f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 4 Oct 2019 15:57:29 +0300 Subject: perf/core: Fix inheritance of aux_output groups Commit: ab43762ef010 ("perf: Allow normal events to output AUX data") forgets to configure aux_output relation in the inherited groups, which results in child PEBS events forever failing to schedule. Fix this by setting up the AUX output link in the inheritance path. Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191004125729.32397-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3f0cb82e4fbc..f953dd16a5e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11862,6 +11862,10 @@ static int inherit_group(struct perf_event *parent_event, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); + + if (sub->aux_event == parent_event && + !perf_get_aux_event(child_ctr, leader)) + return -EINVAL; } return 0; } -- cgit v1.2.3 From 20bb759a66be52cf4a9ddd17fddaf509e11490cd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Sun, 6 Oct 2019 17:58:00 -0700 Subject: panic: ensure preemption is disabled during panic() Calling 'panic()' on a kernel with CONFIG_PREEMPT=y can leave the calling CPU in an infinite loop, but with interrupts and preemption enabled. From this state, userspace can continue to be scheduled, despite the system being "dead" as far as the kernel is concerned. This is easily reproducible on arm64 when booting with "nosmp" on the command line; a couple of shell scripts print out a periodic "Ping" message whilst another triggers a crash by writing to /proc/sysrq-trigger: | sysrq: Trigger a crash | Kernel panic - not syncing: sysrq triggered crash | CPU: 0 PID: 1 Comm: init Not tainted 5.2.15 #1 | Hardware name: linux,dummy-virt (DT) | Call trace: | dump_backtrace+0x0/0x148 | show_stack+0x14/0x20 | dump_stack+0xa0/0xc4 | panic+0x140/0x32c | sysrq_handle_reboot+0x0/0x20 | __handle_sysrq+0x124/0x190 | write_sysrq_trigger+0x64/0x88 | proc_reg_write+0x60/0xa8 | __vfs_write+0x18/0x40 | vfs_write+0xa4/0x1b8 | ksys_write+0x64/0xf0 | __arm64_sys_write+0x14/0x20 | el0_svc_common.constprop.0+0xb0/0x168 | el0_svc_handler+0x28/0x78 | el0_svc+0x8/0xc | Kernel Offset: disabled | CPU features: 0x0002,24002004 | Memory Limit: none | ---[ end Kernel panic - not syncing: sysrq triggered crash ]--- | Ping 2! | Ping 1! | Ping 1! | Ping 2! The issue can also be triggered on x86 kernels if CONFIG_SMP=n, otherwise local interrupts are disabled in 'smp_send_stop()'. Disable preemption in 'panic()' before re-enabling interrupts. Link: http://lkml.kernel.org/r/20191002123538.22609-1-will@kernel.org Link: https://lore.kernel.org/r/BX1W47JXPMR8.58IYW53H6M5N@dragonstone Signed-off-by: Will Deacon Reported-by: Xogium Reviewed-by: Kees Cook Cc: Russell King Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Petr Mladek Cc: Feng Tang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 47e8ebccc22b..f470a038b05b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -180,6 +180,7 @@ void panic(const char *fmt, ...) * after setting panic_cpu) from invoking panic() again. */ local_irq_disable(); + preempt_disable_notrace(); /* * It's possible to come here directly from a panic-assertion and -- cgit v1.2.3 From b0f53dbc4bc4c371f38b14c391095a3bb8a0bb40 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Sun, 6 Oct 2019 17:58:19 -0700 Subject: kernel/sysctl.c: do not override max_threads provided by userspace Partially revert 16db3d3f1170 ("kernel/sysctl.c: threads-max observe limits") because the patch is causing a regression to any workload which needs to override the auto-tuning of the limit provided by kernel. set_max_threads is implementing a boot time guesstimate to provide a sensible limit of the concurrently running threads so that runaways will not deplete all the memory. This is a good thing in general but there are workloads which might need to increase this limit for an application to run (reportedly WebSpher MQ is affected) and that is simply not possible after the mentioned change. It is also very dubious to override an admin decision by an estimation that doesn't have any direct relation to correctness of the kernel operation. Fix this by dropping set_max_threads from sysctl_max_threads so any value is accepted as long as it fits into MAX_THREADS which is important to check because allowing more threads could break internal robust futex restriction. While at it, do not use MIN_THREADS as the lower boundary because it is also only a heuristic for automatic estimation and admin might have a good reason to stop new threads to be created even when below this limit. This became more severe when we switched x86 from 4k to 8k kernel stacks. Starting since 6538b8ea886e ("x86_64: expand kernel stack to 16K") (3.16) we use THREAD_SIZE_ORDER = 2 and that halved the auto-tuned value. In the particular case 3.12 kernel.threads-max = 515561 4.4 kernel.threads-max = 200000 Neither of the two values is really insane on 32GB machine. I am not sure we want/need to tune the max_thread value further. If anything the tuning should be removed altogether if proven not useful in general. But we definitely need a way to override this auto-tuning. Link: http://lkml.kernel.org/r/20190922065801.GB18814@dhcp22.suse.cz Fixes: 16db3d3f1170 ("kernel/sysctl.c: threads-max observe limits") Signed-off-by: Michal Hocko Reviewed-by: "Eric W. Biederman" Cc: Heinrich Schuchardt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1f6c45f6a734..bcdf53125210 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2925,7 +2925,7 @@ int sysctl_max_threads(struct ctl_table *table, int write, struct ctl_table t; int ret; int threads = max_threads; - int min = MIN_THREADS; + int min = 1; int max = MAX_THREADS; t = *table; @@ -2937,7 +2937,7 @@ int sysctl_max_threads(struct ctl_table *table, int write, if (ret || !write) return ret; - set_max_threads(threads); + max_threads = threads; return 0; } -- cgit v1.2.3 From 4929a4e6faa0f13289a67cae98139e727f0d4a97 Mon Sep 17 00:00:00 2001 From: Xuewei Zhang Date: Thu, 3 Oct 2019 17:12:43 -0700 Subject: sched/fair: Scale bandwidth quota and period without losing quota/period ratio precision The quota/period ratio is used to ensure a child task group won't get more bandwidth than the parent task group, and is calculated as: normalized_cfs_quota() = [(quota_us << 20) / period_us] If the quota/period ratio was changed during this scaling due to precision loss, it will cause inconsistency between parent and child task groups. See below example: A userspace container manager (kubelet) does three operations: 1) Create a parent cgroup, set quota to 1,000us and period to 10,000us. 2) Create a few children cgroups. 3) Set quota to 1,000us and period to 10,000us on a child cgroup. These operations are expected to succeed. However, if the scaling of 147/128 happens before step 3, quota and period of the parent cgroup will be changed: new_quota: 1148437ns, 1148us new_period: 11484375ns, 11484us And when step 3 comes in, the ratio of the child cgroup will be 104857, which will be larger than the parent cgroup ratio (104821), and will fail. Scaling them by a factor of 2 will fix the problem. Tested-by: Phil Auld Signed-off-by: Xuewei Zhang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Phil Auld Cc: Anton Blanchard Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vincent Guittot Fixes: 2e8e19226398 ("sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup") Link: https://lkml.kernel.org/r/20191004001243.140897-1-xueweiz@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 83ab35e2374f..682a754ea3e1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4926,20 +4926,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); - new = (old * 147) / 128; /* ~115% */ - new = min(new, max_cfs_quota_period); - - cfs_b->period = ns_to_ktime(new); - - /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ - cfs_b->quota *= new; - cfs_b->quota = div64_u64(cfs_b->quota, old); - - pr_warn_ratelimited( - "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", - smp_processor_id(), - div_u64(new, NSEC_PER_USEC), - div_u64(cfs_b->quota, NSEC_PER_USEC)); + /* + * Grow period by a factor of 2 to avoid losing precision. + * Precision loss in the quota/period ratio can cause __cfs_schedulable + * to fail. + */ + new = old * 2; + if (new < max_cfs_quota_period) { + cfs_b->period = ns_to_ktime(new); + cfs_b->quota *= 2; + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } else { + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(old, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } /* reset count so we don't come right back in here */ count = 0; -- cgit v1.2.3 From 68e7a4d66b0ce04bf18ff2ffded5596ab3618585 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 25 Sep 2019 23:42:42 +0200 Subject: sched/vtime: Fix guest/system mis-accounting on task switch vtime_account_system() assumes that the target task to account cputime to is always the current task. This is most often true indeed except on task switch where we call: vtime_common_task_switch(prev) vtime_account_system(prev) Here prev is the scheduling-out task where we account the cputime to. It doesn't match current that is already the scheduling-in task at this stage of the context switch. So we end up checking the wrong task flags to determine if we are accounting guest or system time to the previous task. As a result the wrong task is used to check if the target is running in guest mode. We may then spuriously account or leak either system or guest time on task switch. Fix this assumption and also turn vtime_guest_enter/exit() to use the task passed in parameter as well to avoid future similar issues. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Fixes: 2a42eb9594a1 ("sched/cputime: Accumulate vtime on top of nsec clocksource") Link: https://lkml.kernel.org/r/20190925214242.21873-1-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2305ce89a26c..46ed4e1383e2 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -740,7 +740,7 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); /* We might have scheduled out from guest path */ - if (current->flags & PF_VCPU) + if (tsk->flags & PF_VCPU) vtime_account_guest(tsk, vtime); else __vtime_account_system(tsk, vtime); @@ -783,7 +783,7 @@ void vtime_guest_enter(struct task_struct *tsk) */ write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk, vtime); - current->flags |= PF_VCPU; + tsk->flags |= PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); @@ -794,7 +794,7 @@ void vtime_guest_exit(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); vtime_account_guest(tsk, vtime); - current->flags &= ~PF_VCPU; + tsk->flags &= ~PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); -- cgit v1.2.3 From d44248a41337731a111374822d7d4451b64e73e4 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 4 Sep 2019 14:46:18 -0700 Subject: perf/core: Rework memory accounting in perf_mmap() perf_mmap() always increases user->locked_vm. As a result, "extra" could grow bigger than "user_extra", which doesn't make sense. Here is an example case: (Note: Assume "user_lock_limit" is very small.) | # of perf_mmap calls |vma->vm_mm->pinned_vm|user->locked_vm| | 0 | 0 | 0 | | 1 | user_extra | user_extra | | 2 | 3 * user_extra | 2 * user_extra| | 3 | 6 * user_extra | 3 * user_extra| | 4 | 10 * user_extra | 4 * user_extra| Fix this by maintaining proper user_extra and extra. Reviewed-By: Hechao Li Reported-by: Hechao Li Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Jie Meng Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190904214618.3795672-1-songliubraving@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f953dd16a5e2..2b8265ad7bf5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5668,7 +5668,8 @@ again: * undo the VM accounting. */ - atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, + &mmap_user->locked_vm); atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); @@ -5812,8 +5813,20 @@ accounting: user_locked = atomic_long_read(&user->locked_vm) + user_extra; - if (user_locked > user_lock_limit) + if (user_locked <= user_lock_limit) { + /* charge all to locked_vm */ + } else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) { + /* charge all to pinned_vm */ + extra = user_extra; + user_extra = 0; + } else { + /* + * charge locked_vm until it hits user_lock_limit; + * charge the rest from pinned_vm + */ extra = user_locked - user_lock_limit; + user_extra -= extra; + } lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; -- cgit v1.2.3 From 7fa343b7fdc4f351de4e3f28d5c285937dd1f42f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 8 Oct 2019 09:59:49 -0700 Subject: perf/core: Fix corner case in perf_rotate_context() In perf_rotate_context(), when the first cpu flexible event fail to schedule, cpu_rotate is 1, while cpu_event is NULL. Since cpu_event is NULL, perf_rotate_context will _NOT_ call cpu_ctx_sched_out(), thus cpuctx->ctx.is_active will have EVENT_FLEXIBLE set. Then, the next perf_event_sched_in() will skip all cpu flexible events because of the EVENT_FLEXIBLE bit. In the next call of perf_rotate_context(), cpu_rotate stays 1, and cpu_event stays NULL, so this process repeats. The end result is, flexible events on this cpu will not be scheduled (until another event being added to the cpuctx). Here is an easy repro of this issue. On Intel CPUs, where ref-cycles could only use one counter, run one pinned event for ref-cycles, one flexible event for ref-cycles, and one flexible event for cycles. The flexible ref-cycles is never scheduled, which is expected. However, because of this issue, the cycles event is never scheduled either. $ perf stat -e ref-cycles:D,ref-cycles,cycles -C 5 -I 1000 time counts unit events 1.000152973 15,412,480 ref-cycles:D 1.000152973 ref-cycles (0.00%) 1.000152973 cycles (0.00%) 2.000486957 18,263,120 ref-cycles:D 2.000486957 ref-cycles (0.00%) 2.000486957 cycles (0.00%) To fix this, when the flexible_active list is empty, try rotate the first event in the flexible_groups. Also, rename ctx_first_active() to ctx_event_to_rotate(), which is more accurate. Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Fixes: 8d5bce0c37fa ("perf/core: Optimize perf_rotate_context() event scheduling") Link: https://lkml.kernel.org/r/20191008165949.920548-1-songliubraving@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2b8265ad7bf5..9ec0b0bfddbd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3779,11 +3779,23 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) perf_event_groups_insert(&ctx->flexible_groups, event); } +/* pick an event from the flexible_groups to rotate */ static inline struct perf_event * -ctx_first_active(struct perf_event_context *ctx) +ctx_event_to_rotate(struct perf_event_context *ctx) { - return list_first_entry_or_null(&ctx->flexible_active, - struct perf_event, active_list); + struct perf_event *event; + + /* pick the first active flexible event */ + event = list_first_entry_or_null(&ctx->flexible_active, + struct perf_event, active_list); + + /* if no active flexible event, pick the first event */ + if (!event) { + event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree), + typeof(*event), group_node); + } + + return event; } static bool perf_rotate_context(struct perf_cpu_context *cpuctx) @@ -3808,9 +3820,9 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_disable(cpuctx->ctx.pmu); if (task_rotate) - task_event = ctx_first_active(task_ctx); + task_event = ctx_event_to_rotate(task_ctx); if (cpu_rotate) - cpu_event = ctx_first_active(&cpuctx->ctx); + cpu_event = ctx_event_to_rotate(&cpuctx->ctx); /* * As per the order given at ctx_resched() first 'pop' task flexible -- cgit v1.2.3 From 9ef16693aff8137faa21d16ffe65bb9832d24d71 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 17:56:57 -0400 Subject: ftrace: Get a reference counter for the trace_array on filter files The ftrace set_ftrace_filter and set_ftrace_notrace files are specific for an instance now. They need to take a reference to the instance otherwise there could be a race between accessing the files and deleting the instance. It wasn't until the :mod: caching where these file operations started referencing the trace_array directly. Cc: stable@vger.kernel.org Fixes: 673feb9d76ab3 ("ftrace: Add :mod: caching infrastructure to trace_array") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 62a50bf399d6..32c2eb167de0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3540,21 +3540,22 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, struct ftrace_hash *hash; struct list_head *mod_head; struct trace_array *tr = ops->private; - int ret = 0; + int ret = -ENOMEM; ftrace_ops_init(ops); if (unlikely(ftrace_disabled)) return -ENODEV; + if (tr && trace_array_get(tr) < 0) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) - return -ENOMEM; + goto out; - if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { - kfree(iter); - return -ENOMEM; - } + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) + goto out; iter->ops = ops; iter->flags = flag; @@ -3584,13 +3585,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, if (!iter->hash) { trace_parser_put(&iter->parser); - kfree(iter); - ret = -ENOMEM; goto out_unlock; } } else iter->hash = hash; + ret = 0; + if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; @@ -3602,7 +3603,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, /* Failed */ free_ftrace_hash(iter->hash); trace_parser_put(&iter->parser); - kfree(iter); } } else file->private_data = iter; @@ -3610,6 +3610,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, out_unlock: mutex_unlock(&ops->func_hash->regex_lock); + out: + if (ret) { + kfree(iter); + if (tr) + trace_array_put(tr); + } + return ret; } @@ -5037,6 +5044,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_unlock(&iter->ops->func_hash->regex_lock); free_ftrace_hash(iter->hash); + if (iter->tr) + trace_array_put(iter->tr); kfree(iter); return 0; -- cgit v1.2.3 From 194c2c74f5532e62c218adeb8e2b683119503907 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 18:19:17 -0400 Subject: tracing: Get trace_array reference for available_tracers files As instances may have different tracers available, we need to look at the trace_array descriptor that shows the list of the available tracers for the instance. But there's a race between opening the file and an admin deleting the instance. The trace_array_get() needs to be called before accessing the trace_array. Cc: stable@vger.kernel.org Fixes: 607e2ea167e56 ("tracing: Set up infrastructure to allow tracers for instances") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 252f79c435f8..fa7d813b04c6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4355,9 +4355,14 @@ static int show_traces_open(struct inode *inode, struct file *file) if (tracing_disabled) return -ENODEV; + if (trace_array_get(tr) < 0) + return -ENODEV; + ret = seq_open(file, &show_traces_seq_ops); - if (ret) + if (ret) { + trace_array_put(tr); return ret; + } m = file->private_data; m->private = tr; @@ -4365,6 +4370,14 @@ static int show_traces_open(struct inode *inode, struct file *file) return 0; } +static int show_traces_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + return seq_release(inode, file); +} + static ssize_t tracing_write_stub(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) @@ -4395,8 +4408,8 @@ static const struct file_operations tracing_fops = { static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, - .release = seq_release, .llseek = seq_lseek, + .release = show_traces_release, }; static ssize_t -- cgit v1.2.3 From aa07d71f1bc7ea20e442e812b5de9d632b7f84c6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 19:12:21 -0400 Subject: tracing: Have trace events system open call tracing_open_generic_tr() Instead of having the trace events system open call open code the taking of the trace_array descriptor (with trace_array_get()) and then calling trace_open_generic(), have it use the tracing_open_generic_tr() that does the combination of the two. This requires making tracing_open_generic_tr() global. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 17 +++-------------- 3 files changed, 5 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa7d813b04c6..94f1b9124939 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4156,7 +4156,7 @@ bool tracing_is_disabled(void) * Open and update trace_array ref count. * Must have the current trace_array passed to it. */ -static int tracing_open_generic_tr(struct inode *inode, struct file *filp) +int tracing_open_generic_tr(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f801d154ff6a..854dbf4050f8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -681,6 +681,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); +int tracing_open_generic_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b89cdfe20bc1..9613a757c902 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1440,28 +1440,17 @@ static int system_tr_open(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; int ret; - if (tracing_is_disabled()) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; - /* Make a temporary dir that has no system but points to tr */ dir = kzalloc(sizeof(*dir), GFP_KERNEL); - if (!dir) { - trace_array_put(tr); + if (!dir) return -ENOMEM; - } - - dir->tr = tr; - ret = tracing_open_generic(inode, filp); + ret = tracing_open_generic_tr(inode, filp); if (ret < 0) { - trace_array_put(tr); kfree(dir); return ret; } - + dir->tr = tr; filp->private_data = dir; return 0; -- cgit v1.2.3 From 8530dec63e7b486e3761cc3d74a22de301845ff5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 17:39:57 -0400 Subject: tracing: Add tracing_check_open_get_tr() Currently, most files in the tracefs directory test if tracing_disabled is set. If so, it should return -ENODEV. The tracing_disabled is called when tracing is found to be broken. Originally it was done in case the ring buffer was found to be corrupted, and we wanted to prevent reading it from crashing the kernel. But it's also called if a tracing selftest fails on boot. It's a one way switch. That is, once it is triggered, tracing is disabled until reboot. As most tracefs files can also be used by instances in the tracefs directory, they need to be carefully done. Each instance has a trace_array associated to it, and when the instance is removed, the trace_array is freed. But if an instance is opened with a reference to the trace_array, then it requires looking up the trace_array to get its ref counter (as there could be a race with it being deleted and the open itself). Once it is found, a reference is added to prevent the instance from being removed (and the trace_array associated with it freed). Combine the two checks (tracing_disabled and trace_array_get()) into a single helper function. This will also make it easier to add lockdown to tracefs later. Link: http://lkml.kernel.org/r/20191011135458.7399da44@gandalf.local.home Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 ++- kernel/trace/trace.c | 117 ++++++++++++++++++++++----------------- kernel/trace/trace.h | 1 + kernel/trace/trace_dynevent.c | 4 ++ kernel/trace/trace_events.c | 10 ++-- kernel/trace/trace_events_hist.c | 2 +- 6 files changed, 81 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 32c2eb167de0..8b765a55e01c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3547,7 +3547,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, if (unlikely(ftrace_disabled)) return -ENODEV; - if (tr && trace_array_get(tr) < 0) + if (tracing_check_open_get_tr(tr)) return -ENODEV; iter = kzalloc(sizeof(*iter), GFP_KERNEL); @@ -6546,8 +6546,9 @@ ftrace_pid_open(struct inode *inode, struct file *file) struct seq_file *m; int ret = 0; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 94f1b9124939..26ee280f852b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -304,6 +304,17 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } +int tracing_check_open_get_tr(struct trace_array *tr) +{ + if (tracing_disabled) + return -ENODEV; + + if (tr && trace_array_get(tr) < 0) + return -ENODEV; + + return 0; +} + int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) @@ -4140,8 +4151,11 @@ release: int tracing_open_generic(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; filp->private_data = inode->i_private; return 0; @@ -4159,12 +4173,11 @@ bool tracing_is_disabled(void) int tracing_open_generic_tr(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; + int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; filp->private_data = inode->i_private; @@ -4233,10 +4246,11 @@ static int tracing_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; - int ret = 0; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { @@ -4352,11 +4366,9 @@ static int show_traces_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = seq_open(file, &show_traces_seq_ops); if (ret) { @@ -4710,11 +4722,9 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = single_open(file, tracing_trace_options_show, inode->i_private); if (ret < 0) @@ -5051,8 +5061,11 @@ static const struct seq_operations tracing_saved_tgids_seq_ops = { static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; return seq_open(filp, &tracing_saved_tgids_seq_ops); } @@ -5128,8 +5141,11 @@ static const struct seq_operations tracing_saved_cmdlines_seq_ops = { static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; return seq_open(filp, &tracing_saved_cmdlines_seq_ops); } @@ -5293,8 +5309,11 @@ static const struct seq_operations tracing_eval_map_seq_ops = { static int tracing_eval_map_open(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; return seq_open(filp, &tracing_eval_map_seq_ops); } @@ -5817,13 +5836,11 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; - int ret = 0; - - if (tracing_disabled) - return -ENODEV; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; mutex_lock(&trace_types_lock); @@ -6560,11 +6577,9 @@ static int tracing_clock_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr)) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = single_open(file, tracing_clock_show, inode->i_private); if (ret < 0) @@ -6594,11 +6609,9 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr)) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private); if (ret < 0) @@ -6651,10 +6664,11 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; struct trace_iterator *iter; struct seq_file *m; - int ret = 0; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, true); @@ -7118,8 +7132,9 @@ static int tracing_err_log_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret = 0; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; /* If this file was opened for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -7170,11 +7185,9 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) struct ftrace_buffer_info *info; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 854dbf4050f8..d685c61085c0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -338,6 +338,7 @@ extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); extern void trace_array_put(struct trace_array *tr); +extern int tracing_check_open_get_tr(struct trace_array *tr); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index a41fed46c285..89779eb84a07 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -174,6 +174,10 @@ static int dyn_event_open(struct inode *inode, struct file *file) { int ret; + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(NULL); if (ret < 0) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9613a757c902..71ab0a3660f4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1794,8 +1794,9 @@ ftrace_event_set_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -1814,8 +1815,9 @@ ftrace_event_set_pid_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9468bd8d44a2..dd18d76bf1bd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1680,7 +1680,7 @@ static int save_hist_vars(struct hist_trigger_data *hist_data) if (var_data) return 0; - if (trace_array_get(tr) < 0) + if (tracing_check_open_get_tr(tr)) return -ENODEV; var_data = kzalloc(sizeof(*var_data), GFP_KERNEL); -- cgit v1.2.3 From 17911ff38aa58d3c95c07589dbf5d3564c4cf3c5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 11 Oct 2019 17:22:50 -0400 Subject: tracing: Add locked_down checks to the open calls of files created for tracefs Added various checks on open tracefs calls to see if tracefs is in lockdown mode, and if so, to return -EPERM. Note, the event format files (which are basically standard on all machines) as well as the enabled_functions file (which shows what is currently being traced) are not lockde down. Perhaps they should be, but it seems counter intuitive to lockdown information to help you know if the system has been modified. Link: http://lkml.kernel.org/r/CAHk-=wj7fGPKUspr579Cii-w_y60PtRaiDgKuxVtBAMK0VNNkA@mail.gmail.com Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 23 ++++++++++++++++++++++- kernel/trace/trace.c | 8 ++++++++ kernel/trace/trace_events.c | 8 ++++++++ kernel/trace/trace_events_hist.c | 11 +++++++++++ kernel/trace/trace_events_trigger.c | 8 +++++++- kernel/trace/trace_kprobe.c | 12 +++++++++++- kernel/trace/trace_printk.c | 7 +++++++ kernel/trace/trace_stack.c | 8 ++++++++ kernel/trace/trace_stat.c | 6 +++++- kernel/trace/trace_uprobe.c | 11 +++++++++++ 10 files changed, 98 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8b765a55e01c..f296d89be757 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -3486,6 +3487,11 @@ static int ftrace_avail_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -3505,6 +3511,15 @@ ftrace_enabled_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; + /* + * This shows us what functions are currently being + * traced and by what. Not sure if we want lockdown + * to hide such critical information for an admin. + * Although, perhaps it can show information we don't + * want people to see, but if something is tracing + * something, we probably want to know about it. + */ + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); if (!iter) return -ENOMEM; @@ -3625,6 +3640,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) { struct ftrace_ops *ops = inode->i_private; + /* Checks for tracefs lockdown */ return ftrace_regex_open(ops, FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES, inode, file); @@ -3635,6 +3651,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) { struct ftrace_ops *ops = inode->i_private; + /* Checks for tracefs lockdown */ return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE, inode, file); } @@ -5203,9 +5220,13 @@ static int __ftrace_graph_open(struct inode *inode, struct file *file, struct ftrace_graph_data *fgd) { - int ret = 0; + int ret; struct ftrace_hash *new_hash = NULL; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 26ee280f852b..2b4eff383505 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -306,6 +307,12 @@ void trace_array_put(struct trace_array *this_tr) int tracing_check_open_get_tr(struct trace_array *tr) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if (tracing_disabled) return -ENODEV; @@ -6813,6 +6820,7 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp) struct ftrace_buffer_info *info; int ret; + /* The following checks for tracefs lockdown */ ret = tracing_buffers_open(inode, filp); if (ret < 0) return ret; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 71ab0a3660f4..fba87d10f0c1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) fmt #include +#include #include #include #include @@ -1294,6 +1295,8 @@ static int trace_format_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; + /* Do we want to hide event format files on tracefs lockdown? */ + ret = seq_open(file, &trace_format_seq_ops); if (ret < 0) return ret; @@ -1760,6 +1763,10 @@ ftrace_event_open(struct inode *inode, struct file *file, struct seq_file *m; int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = seq_open(file, seq_ops); if (ret < 0) return ret; @@ -1784,6 +1791,7 @@ ftrace_event_avail_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_event_seq_ops; + /* Checks for tracefs lockdown */ return ftrace_event_open(inode, file, seq_ops); } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index dd18d76bf1bd..57648c5aa679 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -1448,6 +1449,10 @@ static int synth_events_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(&synth_event_ops); if (ret < 0) @@ -5515,6 +5520,12 @@ static int hist_show(struct seq_file *m, void *v) static int event_hist_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return single_open(file, hist_show, file); } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 2a2912cb4533..2cd53ca21b51 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -5,6 +5,7 @@ * Copyright (C) 2013 Tom Zanussi */ +#include #include #include #include @@ -173,7 +174,11 @@ static const struct seq_operations event_triggers_seq_ops = { static int event_trigger_regex_open(struct inode *inode, struct file *file) { - int ret = 0; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; mutex_lock(&event_mutex); @@ -292,6 +297,7 @@ event_trigger_write(struct file *filp, const char __user *ubuf, static int event_trigger_open(struct inode *inode, struct file *filp) { + /* Checks for tracefs lockdown */ return event_trigger_regex_open(inode, filp); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 324ffbea3556..1552a95c743b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -7,11 +7,11 @@ */ #define pr_fmt(fmt) "trace_kprobe: " fmt +#include #include #include #include #include -#include #include /* for COMMAND_LINE_SIZE */ @@ -936,6 +936,10 @@ static int probes_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(&trace_kprobe_ops); if (ret < 0) @@ -988,6 +992,12 @@ static const struct seq_operations profile_seq_op = { static int profile_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &profile_seq_op); } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index c3fd849d4a8f..d4e31e969206 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -6,6 +6,7 @@ * */ #include +#include #include #include #include @@ -348,6 +349,12 @@ static const struct seq_operations show_format_seq_ops = { static int ftrace_formats_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &show_format_seq_ops); } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index ec9a34a97129..4df9a209f7ca 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include @@ -470,6 +471,12 @@ static const struct seq_operations stack_trace_seq_ops = { static int stack_trace_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &stack_trace_seq_ops); } @@ -487,6 +494,7 @@ stack_trace_filter_open(struct inode *inode, struct file *file) { struct ftrace_ops *ops = inode->i_private; + /* Checks for tracefs lockdown */ return ftrace_regex_open(ops, FTRACE_ITER_FILTER, inode, file); } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 75bf1bcb4a8a..9ab0a1a7ad5e 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -9,7 +9,7 @@ * */ - +#include #include #include #include @@ -238,6 +238,10 @@ static int tracing_stat_open(struct inode *inode, struct file *file) struct seq_file *m; struct stat_session *session = inode->i_private; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = stat_seq_init(session); if (ret) return ret; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index dd884341f5c5..352073d36585 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -7,6 +7,7 @@ */ #define pr_fmt(fmt) "trace_uprobe: " fmt +#include #include #include #include @@ -769,6 +770,10 @@ static int probes_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(&trace_uprobe_ops); if (ret) @@ -818,6 +823,12 @@ static const struct seq_operations profile_seq_op = { static int profile_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &profile_seq_op); } -- cgit v1.2.3 From 98dc19c11470ee6048aba723d77079ad2cda8a52 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Thu, 10 Oct 2019 11:50:46 -0700 Subject: tracing/hwlat: Report total time spent in all NMIs during the sample nmi_total_ts is supposed to record the total time spent in *all* NMIs that occur on the given CPU during the (active portion of the) sampling window. However, the code seems to be overwriting this variable for each NMI, thereby only recording the time spent in the most recent NMI. Fix it by accumulating the duration instead. Link: http://lkml.kernel.org/r/157073343544.17189.13911783866738671133.stgit@srivatsa-ubuntu Fixes: 7b2c86250122 ("tracing: Add NMI tracing in hwlat detector") Cc: stable@vger.kernel.org Signed-off-by: Srivatsa S. Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index fa95139445b2..a0251a78807d 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -150,7 +150,7 @@ void trace_hwlat_callback(bool enter) if (enter) nmi_ts_start = time_get(); else - nmi_total_ts = time_get() - nmi_ts_start; + nmi_total_ts += time_get() - nmi_ts_start; } if (enter) -- cgit v1.2.3 From fc64e4ad80d4b72efce116f87b3174f0b7196f8e Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Thu, 10 Oct 2019 11:51:01 -0700 Subject: tracing/hwlat: Don't ignore outer-loop duration when calculating max_latency max_latency is intended to record the maximum ever observed hardware latency, which may occur in either part of the loop (inner/outer). So we need to also consider the outer-loop sample when updating max_latency. Link: http://lkml.kernel.org/r/157073345463.17189.18124025522664682811.stgit@srivatsa-ubuntu Fixes: e7c15cd8a113 ("tracing: Added hardware latency tracer") Cc: stable@vger.kernel.org Signed-off-by: Srivatsa S. Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index a0251a78807d..862f4b0139fc 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -256,6 +256,8 @@ static int get_sample(void) /* Keep a running maximum ever recorded hardware latency */ if (sample > tr->max_latency) tr->max_latency = sample; + if (outer_sample > tr->max_latency) + tr->max_latency = outer_sample; } out: -- cgit v1.2.3 From d303de1fcf344ff7c15ed64c3f48a991c9958775 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 11 Oct 2019 16:21:34 +0200 Subject: tracing: Initialize iter->seq after zeroing in tracing_read_pipe() A customer reported the following softlockup: [899688.160002] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [test.sh:16464] [899688.160002] CPU: 0 PID: 16464 Comm: test.sh Not tainted 4.12.14-6.23-azure #1 SLE12-SP4 [899688.160002] RIP: 0010:up_write+0x1a/0x30 [899688.160002] Kernel panic - not syncing: softlockup: hung tasks [899688.160002] RIP: 0010:up_write+0x1a/0x30 [899688.160002] RSP: 0018:ffffa86784d4fde8 EFLAGS: 00000257 ORIG_RAX: ffffffffffffff12 [899688.160002] RAX: ffffffff970fea00 RBX: 0000000000000001 RCX: 0000000000000000 [899688.160002] RDX: ffffffff00000001 RSI: 0000000000000080 RDI: ffffffff970fea00 [899688.160002] RBP: ffffffffffffffff R08: ffffffffffffffff R09: 0000000000000000 [899688.160002] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8b59014720d8 [899688.160002] R13: ffff8b59014720c0 R14: ffff8b5901471090 R15: ffff8b5901470000 [899688.160002] tracing_read_pipe+0x336/0x3c0 [899688.160002] __vfs_read+0x26/0x140 [899688.160002] vfs_read+0x87/0x130 [899688.160002] SyS_read+0x42/0x90 [899688.160002] do_syscall_64+0x74/0x160 It caught the process in the middle of trace_access_unlock(). There is no loop. So, it must be looping in the caller tracing_read_pipe() via the "waitagain" label. Crashdump analyze uncovered that iter->seq was completely zeroed at this point, including iter->seq.seq.size. It means that print_trace_line() was never able to print anything and there was no forward progress. The culprit seems to be in the code: /* reset all but tr, trace, and overruns */ memset(&iter->seq, 0, sizeof(struct trace_iterator) - offsetof(struct trace_iterator, seq)); It was added by the commit 53d0aa773053ab182877 ("ftrace: add logic to record overruns"). It was v2.6.27-rc1. It was the time when iter->seq looked like: struct trace_seq { unsigned char buffer[PAGE_SIZE]; unsigned int len; }; There was no "size" variable and zeroing was perfectly fine. The solution is to reinitialize the structure after or without zeroing. Link: http://lkml.kernel.org/r/20191011142134.11997-1-pmladek@suse.com Signed-off-by: Petr Mladek Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2b4eff383505..6a0ee9178365 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6036,6 +6036,7 @@ waitagain: sizeof(struct trace_iterator) - offsetof(struct trace_iterator, seq)); cpumask_clear(iter->started); + trace_seq_init(&iter->seq); iter->pos = -1; trace_event_read_lock(); -- cgit v1.2.3