From 7f07e5f1f778605e98cf2156d4db1ff3a3a1a74a Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Tue, 26 Mar 2019 11:48:57 +0200 Subject: net: mii: Fix PAUSE cap advertisement from linkmode_adv_to_lcl_adv_t() helper With a recent link mode advertisement code update this helper providing local pause capability translation used for flow control link mode negotiation got broken. For eth drivers using this helper, the issue is apparent only if either PAUSE or ASYM_PAUSE is being advertised. Fixes: 3c1bcc8614db ("net: ethernet: Convert phydev advertize and supported from u32 to link mode") Signed-off-by: Claudiu Manoil Signed-off-by: David S. Miller --- include/linux/mii.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index 6fee8b1a4400..5cd824c1c0ca 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -469,7 +469,7 @@ static inline u32 linkmode_adv_to_lcl_adv_t(unsigned long *advertising) if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_CAP; - if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_ASYM; -- cgit v1.2.3 From 80a2a9026b24c6bd34b8d58256973e22270bedec Mon Sep 17 00:00:00 2001 From: Yuval Avnery Date: Mon, 11 Mar 2019 06:18:24 +0200 Subject: net/mlx5e: Add a lock on tir list Refresh tirs is looping over a global list of tirs while netdevs are adding and removing tirs from that list. That is why a lock is required. Fixes: 724b2aa15126 ("net/mlx5e: TIRs management refactoring") Signed-off-by: Yuval Avnery Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 022541dc5dbf..0d0729648844 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -594,6 +594,8 @@ enum mlx5_pagefault_type_flags { }; struct mlx5_td { + /* protects tirs list changes while tirs refresh */ + struct mutex list_lock; struct list_head tirs_list; u32 tdn; }; -- cgit v1.2.3 From a0fe2c6479aab5723239b315ef1b552673f434a3 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 29 Mar 2019 22:46:49 +0100 Subject: linux/kernel.h: Use parentheses around argument in u64_to_user_ptr() Use parentheses around uses of the argument in u64_to_user_ptr() to ensure that the cast doesn't apply to part of the argument. There are existing uses of the macro of the form u64_to_user_ptr(A + B) which expands to (void __user *)(uintptr_t)A + B (the cast applies to the first operand of the addition, the addition is a pointer addition). This happens to still work as intended, the semantic difference doesn't cause a difference in behavior. But I want to use u64_to_user_ptr() with a ternary operator in the argument, like so: u64_to_user_ptr(A ? B : C) This currently doesn't work as intended. Signed-off-by: Jann Horn Signed-off-by: Borislav Petkov Reviewed-by: Mukesh Ojha Cc: Andrei Vagin Cc: Andrew Morton Cc: Dan Carpenter Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jani Nikula Cc: Kees Cook Cc: Masahiro Yamada Cc: NeilBrown Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190329214652.258477-1-jannh@google.com --- include/linux/kernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 34a5036debd3..2d14e21c16c0 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -47,8 +47,8 @@ #define u64_to_user_ptr(x) ( \ { \ - typecheck(u64, x); \ - (void __user *)(uintptr_t)x; \ + typecheck(u64, (x)); \ + (void __user *)(uintptr_t)(x); \ } \ ) -- cgit v1.2.3 From 631b7abacd02b88f4b0795c08b54ad4fc3e7c7c0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 7 Nov 2016 16:26:35 -0500 Subject: ptrace: Remove maxargs from task_current_syscall() task_current_syscall() has a single user that passes in 6 for maxargs, which is the maximum arguments that can be used to get system calls from syscall_get_arguments(). Instead of passing in a number of arguments to grab, just get 6 arguments. The args argument even specifies that it's an array of 6 items. This will also allow changing syscall_get_arguments() to not get a variable number of arguments, but always grab 6. Linus also suggested not passing in a bunch of arguments to task_current_syscall() but to instead pass in a pointer to a structure, and just fill the structure. struct seccomp_data has almost all the parameters that is needed except for the stack pointer (sp). As seccomp_data is part of uapi, and I'm afraid to change it, a new structure was created "syscall_info", which includes seccomp_data and adds the "sp" field. Link: http://lkml.kernel.org/r/20161107213233.466776454@goodmis.org Cc: Andy Lutomirski Cc: Alexey Dobriyan Cc: Oleg Nesterov Cc: Kees Cook Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Steven Rostedt (VMware) --- include/linux/ptrace.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index edb9b040c94c..d5084ebd9f03 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -9,6 +9,13 @@ #include /* For BUG_ON. */ #include /* For task_active_pid_ns. */ #include +#include + +/* Add sp to seccomp_data, as seccomp is user API, we don't want to modify it */ +struct syscall_info { + __u64 sp; + struct seccomp_data data; +}; extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); @@ -407,9 +414,7 @@ static inline void user_single_step_report(struct pt_regs *regs) #define current_user_stack_pointer() user_stack_pointer(current_pt_regs()) #endif -extern int task_current_syscall(struct task_struct *target, long *callno, - unsigned long args[6], unsigned int maxargs, - unsigned long *sp, unsigned long *pc); +extern int task_current_syscall(struct task_struct *target, struct syscall_info *info); extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact); #endif -- cgit v1.2.3 From 5f074f3e192f10c9fade898b9b3b8812e3d83342 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 5 Apr 2019 18:38:45 -0700 Subject: lib/string.c: implement a basic bcmp A recent optimization in Clang (r355672) lowers comparisons of the return value of memcmp against zero to comparisons of the return value of bcmp against zero. This helps some platforms that implement bcmp more efficiently than memcmp. glibc simply aliases bcmp to memcmp, but an optimized implementation is in the works. This results in linkage failures for all targets with Clang due to the undefined symbol. For now, just implement bcmp as a tailcail to memcmp to unbreak the build. This routine can be further optimized in the future. Other ideas discussed: * A weak alias was discussed, but breaks for architectures that define their own implementations of memcmp since aliases to declarations are not permitted (only definitions). Arch-specific memcmp implementations typically declare memcmp in C headers, but implement them in assembly. * -ffreestanding also is used sporadically throughout the kernel. * -fno-builtin-bcmp doesn't work when doing LTO. Link: https://bugs.llvm.org/show_bug.cgi?id=41035 Link: https://code.woboq.org/userspace/glibc/string/memcmp.c.html#bcmp Link: https://github.com/llvm/llvm-project/commit/8e16d73346f8091461319a7dfc4ddd18eedcff13 Link: https://github.com/ClangBuiltLinux/linux/issues/416 Link: http://lkml.kernel.org/r/20190313211335.165605-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Reported-by: Nathan Chancellor Reported-by: Adhemerval Zanella Suggested-by: Arnd Bergmann Suggested-by: James Y Knight Suggested-by: Masahiro Yamada Suggested-by: Nathan Chancellor Suggested-by: Rasmus Villemoes Acked-by: Steven Rostedt (VMware) Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Reviewed-by: Masahiro Yamada Reviewed-by: Andy Shevchenko Cc: David Laight Cc: Rasmus Villemoes Cc: Namhyung Kim Cc: Greg Kroah-Hartman Cc: Alexander Shishkin Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 7927b875f80c..6ab0a6fa512e 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -150,6 +150,9 @@ extern void * memscan(void *,int,__kernel_size_t); #ifndef __HAVE_ARCH_MEMCMP extern int memcmp(const void *,const void *,__kernel_size_t); #endif +#ifndef __HAVE_ARCH_BCMP +extern int bcmp(const void *,const void *,__kernel_size_t); +#endif #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif -- cgit v1.2.3 From 6147e136ff5071609b54f18982dea87706288e21 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Apr 2019 18:38:53 -0700 Subject: include/linux/bitrev.h: fix constant bitrev clang points out with hundreds of warnings that the bitrev macros have a problem with constant input: drivers/hwmon/sht15.c:187:11: error: variable '__x' is uninitialized when used within its own initialization [-Werror,-Wuninitialized] u8 crc = bitrev8(data->val_status & 0x0F); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/bitrev.h:102:21: note: expanded from macro 'bitrev8' __constant_bitrev8(__x) : \ ~~~~~~~~~~~~~~~~~~~^~~~ include/linux/bitrev.h:67:11: note: expanded from macro '__constant_bitrev8' u8 __x = x; \ ~~~ ^ Both the bitrev and the __constant_bitrev macros use an internal variable named __x, which goes horribly wrong when passing one to the other. The obvious fix is to rename one of the variables, so this adds an extra '_'. It seems we got away with this because - there are only a few drivers using bitrev macros - usually there are no constant arguments to those - when they are constant, they tend to be either 0 or (unsigned)-1 (drivers/isdn/i4l/isdnhdlc.o, drivers/iio/amplifiers/ad8366.c) and give the correct result by pure chance. In fact, the only driver that I could find that gets different results with this is drivers/net/wan/slic_ds26522.c, which in turn is a driver for fairly rare hardware (adding the maintainer to Cc for testing). Link: http://lkml.kernel.org/r/20190322140503.123580-1-arnd@arndb.de Fixes: 556d2f055bf6 ("ARM: 8187/1: add CONFIG_HAVE_ARCH_BITREVERSE to support rbit instruction") Signed-off-by: Arnd Bergmann Reviewed-by: Nick Desaulniers Cc: Zhao Qiang Cc: Yalin Wang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitrev.h | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h index 50fb0dee23e8..d35b8ec1c485 100644 --- a/include/linux/bitrev.h +++ b/include/linux/bitrev.h @@ -34,41 +34,41 @@ static inline u32 __bitrev32(u32 x) #define __constant_bitrev32(x) \ ({ \ - u32 __x = x; \ - __x = (__x >> 16) | (__x << 16); \ - __x = ((__x & (u32)0xFF00FF00UL) >> 8) | ((__x & (u32)0x00FF00FFUL) << 8); \ - __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ - __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ - __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ - __x; \ + u32 ___x = x; \ + ___x = (___x >> 16) | (___x << 16); \ + ___x = ((___x & (u32)0xFF00FF00UL) >> 8) | ((___x & (u32)0x00FF00FFUL) << 8); \ + ___x = ((___x & (u32)0xF0F0F0F0UL) >> 4) | ((___x & (u32)0x0F0F0F0FUL) << 4); \ + ___x = ((___x & (u32)0xCCCCCCCCUL) >> 2) | ((___x & (u32)0x33333333UL) << 2); \ + ___x = ((___x & (u32)0xAAAAAAAAUL) >> 1) | ((___x & (u32)0x55555555UL) << 1); \ + ___x; \ }) #define __constant_bitrev16(x) \ ({ \ - u16 __x = x; \ - __x = (__x >> 8) | (__x << 8); \ - __x = ((__x & (u16)0xF0F0U) >> 4) | ((__x & (u16)0x0F0FU) << 4); \ - __x = ((__x & (u16)0xCCCCU) >> 2) | ((__x & (u16)0x3333U) << 2); \ - __x = ((__x & (u16)0xAAAAU) >> 1) | ((__x & (u16)0x5555U) << 1); \ - __x; \ + u16 ___x = x; \ + ___x = (___x >> 8) | (___x << 8); \ + ___x = ((___x & (u16)0xF0F0U) >> 4) | ((___x & (u16)0x0F0FU) << 4); \ + ___x = ((___x & (u16)0xCCCCU) >> 2) | ((___x & (u16)0x3333U) << 2); \ + ___x = ((___x & (u16)0xAAAAU) >> 1) | ((___x & (u16)0x5555U) << 1); \ + ___x; \ }) #define __constant_bitrev8x4(x) \ ({ \ - u32 __x = x; \ - __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ - __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ - __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ - __x; \ + u32 ___x = x; \ + ___x = ((___x & (u32)0xF0F0F0F0UL) >> 4) | ((___x & (u32)0x0F0F0F0FUL) << 4); \ + ___x = ((___x & (u32)0xCCCCCCCCUL) >> 2) | ((___x & (u32)0x33333333UL) << 2); \ + ___x = ((___x & (u32)0xAAAAAAAAUL) >> 1) | ((___x & (u32)0x55555555UL) << 1); \ + ___x; \ }) #define __constant_bitrev8(x) \ ({ \ - u8 __x = x; \ - __x = (__x >> 4) | (__x << 4); \ - __x = ((__x & (u8)0xCCU) >> 2) | ((__x & (u8)0x33U) << 2); \ - __x = ((__x & (u8)0xAAU) >> 1) | ((__x & (u8)0x55U) << 1); \ - __x; \ + u8 ___x = x; \ + ___x = (___x >> 4) | (___x << 4); \ + ___x = ((___x & (u8)0xCCU) >> 2) | ((___x & (u8)0x33U) << 2); \ + ___x = ((___x & (u8)0xAAU) >> 1) | ((___x & (u8)0x55U) << 1); \ + ___x; \ }) #define bitrev32(x) \ -- cgit v1.2.3 From fcae96ff96538f66e7acd5d4e0f2e7516ff8cbd0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 5 Apr 2019 18:39:01 -0700 Subject: mm: fix vm_fault_t cast in VM_FAULT_GET_HINDEX() Symmetrically to VM_FAULT_SET_HINDEX(), we need a force-cast in VM_FAULT_GET_HINDEX() to tell sparse that this is intentional. Sparse complains about the current code when building a kernel with CONFIG_MEMORY_FAILURE: arch/x86/mm/fault.c:1058:53: warning: restricted vm_fault_t degrades to integer Link: http://lkml.kernel.org/r/20190327204117.35215-1-jannh@google.com Fixes: 3d3539018d2c ("mm: create the new vm_fault_t type") Signed-off-by: Jann Horn Reviewed-by: Andrew Morton Cc: Souptick Joarder Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7eade9132f02..4ef4bbe78a1d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -671,7 +671,7 @@ enum vm_fault_reason { /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16)) -#define VM_FAULT_GET_HINDEX(x) (((x) >> 16) & 0xf) +#define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf) #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \ VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \ -- cgit v1.2.3 From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 5 Apr 2019 18:39:18 -0700 Subject: mm: writeback: use exact memcg dirty counts Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting") memcg dirty and writeback counters are managed as: 1) per-memcg per-cpu values in range of [-32..32] 2) per-memcg atomic counter When a per-cpu counter cannot fit in [-32..32] it's flushed to the atomic. Stat readers only check the atomic. Thus readers such as balance_dirty_pages() may see a nontrivial error margin: 32 pages per cpu. Assuming 100 cpus: 4k x86 page_size: 13 MiB error per memcg 64k ppc page_size: 200 MiB error per memcg Considering that dirty+writeback are used together for some decisions the errors double. This inaccuracy can lead to undeserved oom kills. One nasty case is when all per-cpu counters hold positive values offsetting an atomic negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32). balance_dirty_pages() only consults the atomic and does not consider throttling the next n_cpu*32 dirty pages. If the file_lru is in the 13..200 MiB range then there's absolutely no dirty throttling, which burdens vmscan with only dirty+writeback pages thus resorting to oom kill. It could be argued that tiny containers are not supported, but it's more subtle. It's the amount the space available for file lru that matters. If a container has memory.max-200MiB of non reclaimable memory, then it will also suffer such oom kills on a 100 cpu machine. The following test reliably ooms without this patch. This patch avoids oom kills. $ cat test mount -t cgroup2 none /dev/cgroup cd /dev/cgroup echo +io +memory > cgroup.subtree_control mkdir test cd test echo 10M > memory.max (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo) (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100) $ cat memcg-writeback-stress.c /* * Dirty pages from all but one cpu. * Clean pages from the non dirtying cpu. * This is to stress per cpu counter imbalance. * On a 100 cpu machine: * - per memcg per cpu dirty count is 32 pages for each of 99 cpus * - per memcg atomic is -99*32 pages * - thus the complete dirty limit: sum of all counters 0 * - balance_dirty_pages() only sees atomic count -99*32 pages, which * it max()s to 0. * - So a workload can dirty -99*32 pages before balance_dirty_pages() * cares. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include static char *buf; static int bufSize; static void set_affinity(int cpu) { cpu_set_t affinity; CPU_ZERO(&affinity); CPU_SET(cpu, &affinity); if (sched_setaffinity(0, sizeof(affinity), &affinity)) err(1, "sched_setaffinity"); } static void dirty_on(int output_fd, int cpu) { int i, wrote; set_affinity(cpu); for (i = 0; i < 32; i++) { for (wrote = 0; wrote < bufSize; ) { int ret = write(output_fd, buf+wrote, bufSize-wrote); if (ret == -1) err(1, "write"); wrote += ret; } } } int main(int argc, char **argv) { int cpu, flush_cpu = 1, output_fd; const char *output; if (argc != 2) errx(1, "usage: output_file"); output = argv[1]; bufSize = getpagesize(); buf = malloc(getpagesize()); if (buf == NULL) errx(1, "malloc failed"); output_fd = open(output, O_CREAT|O_RDWR); if (output_fd == -1) err(1, "open(%s)", output); for (cpu = 0; cpu < get_nprocs(); cpu++) { if (cpu != flush_cpu) dirty_on(output_fd, cpu); } set_affinity(flush_cpu); if (fsync(output_fd)) err(1, "fsync(%s)", output); if (close(output_fd)) err(1, "close(%s)", output); free(buf); } Make balance_dirty_pages() and wb_over_bg_thresh() work harder to collect exact per memcg counters. This avoids the aforementioned oom kills. This does not affect the overhead of memory.stat, which still reads the single atomic counter. Why not use percpu_counter? memcg already handles cpus going offline, so no need for that overhead from percpu_counter. And the percpu_counter spinlocks are more heavyweight than is required. It probably also makes sense to use exact dirty and writeback counters in memcg oom reports. But that is saved for later. Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com Signed-off-by: Greg Thelen Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Cc: [4.16+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1f3d880b7ca1..dbb6118370c1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -566,7 +566,10 @@ struct mem_cgroup *lock_page_memcg(struct page *page); void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); -/* idx can be of type enum memcg_stat_item or node_stat_item */ +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page_state(). + */ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { -- cgit v1.2.3 From 10dce8af34226d90fa56746a934f8da5dcdba3df Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Tue, 26 Mar 2019 22:20:43 +0000 Subject: fs: stream_open - opener for stream-like files so that read and write can run simultaneously without deadlock Commit 9c225f2655e3 ("vfs: atomic f_pos accesses as per POSIX") added locking for file.f_pos access and in particular made concurrent read and write not possible - now both those functions take f_pos lock for the whole run, and so if e.g. a read is blocked waiting for data, write will deadlock waiting for that read to complete. This caused regression for stream-like files where previously read and write could run simultaneously, but after that patch could not do so anymore. See e.g. commit 581d21a2d02a ("xenbus: fix deadlock on writes to /proc/xen/xenbus") which fixes such regression for particular case of /proc/xen/xenbus. The patch that added f_pos lock in 2014 did so to guarantee POSIX thread safety for read/write/lseek and added the locking to file descriptors of all regular files. In 2014 that thread-safety problem was not new as it was already discussed earlier in 2006. However even though 2006'th version of Linus's patch was adding f_pos locking "only for files that are marked seekable with FMODE_LSEEK (thus avoiding the stream-like objects like pipes and sockets)", the 2014 version - the one that actually made it into the tree as 9c225f2655e3 - is doing so irregardless of whether a file is seekable or not. See https://lore.kernel.org/lkml/53022DB1.4070805@gmail.com/ https://lwn.net/Articles/180387 https://lwn.net/Articles/180396 for historic context. The reason that it did so is, probably, that there are many files that are marked non-seekable, but e.g. their read implementation actually depends on knowing current position to correctly handle the read. Some examples: kernel/power/user.c snapshot_read fs/debugfs/file.c u32_array_read fs/fuse/control.c fuse_conn_waiting_read + ... drivers/hwmon/asus_atk0110.c atk_debugfs_ggrp_read arch/s390/hypfs/inode.c hypfs_read_iter ... Despite that, many nonseekable_open users implement read and write with pure stream semantics - they don't depend on passed ppos at all. And for those cases where read could wait for something inside, it creates a situation similar to xenbus - the write could be never made to go until read is done, and read is waiting for some, potentially external, event, for potentially unbounded time -> deadlock. Besides xenbus, there are 14 such places in the kernel that I've found with semantic patch (see below): drivers/xen/evtchn.c:667:8-24: ERROR: evtchn_fops: .read() can deadlock .write() drivers/isdn/capi/capi.c:963:8-24: ERROR: capi_fops: .read() can deadlock .write() drivers/input/evdev.c:527:1-17: ERROR: evdev_fops: .read() can deadlock .write() drivers/char/pcmcia/cm4000_cs.c:1685:7-23: ERROR: cm4000_fops: .read() can deadlock .write() net/rfkill/core.c:1146:8-24: ERROR: rfkill_fops: .read() can deadlock .write() drivers/s390/char/fs3270.c:488:1-17: ERROR: fs3270_fops: .read() can deadlock .write() drivers/usb/misc/ldusb.c:310:1-17: ERROR: ld_usb_fops: .read() can deadlock .write() drivers/hid/uhid.c:635:1-17: ERROR: uhid_fops: .read() can deadlock .write() net/batman-adv/icmp_socket.c:80:1-17: ERROR: batadv_fops: .read() can deadlock .write() drivers/media/rc/lirc_dev.c:198:1-17: ERROR: lirc_fops: .read() can deadlock .write() drivers/leds/uleds.c:77:1-17: ERROR: uleds_fops: .read() can deadlock .write() drivers/input/misc/uinput.c:400:1-17: ERROR: uinput_fops: .read() can deadlock .write() drivers/infiniband/core/user_mad.c:985:7-23: ERROR: umad_fops: .read() can deadlock .write() drivers/gnss/core.c:45:1-17: ERROR: gnss_fops: .read() can deadlock .write() In addition to the cases above another regression caused by f_pos locking is that now FUSE filesystems that implement open with FOPEN_NONSEEKABLE flag, can no longer implement bidirectional stream-like files - for the same reason as above e.g. read can deadlock write locking on file.f_pos in the kernel. FUSE's FOPEN_NONSEEKABLE was added in 2008 in a7c1b990f715 ("fuse: implement nonseekable open") to support OSSPD. OSSPD implements /dev/dsp in userspace with FOPEN_NONSEEKABLE flag, with corresponding read and write routines not depending on current position at all, and with both read and write being potentially blocking operations: See https://github.com/libfuse/osspd https://lwn.net/Articles/308445 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1406 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1438-L1477 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1479-L1510 Corresponding libfuse example/test also describes FOPEN_NONSEEKABLE as "somewhat pipe-like files ..." with read handler not using offset. However that test implements only read without write and cannot exercise the deadlock scenario: https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L124-L131 https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L146-L163 https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L209-L216 I've actually hit the read vs write deadlock for real while implementing my FUSE filesystem where there is /head/watch file, for which open creates separate bidirectional socket-like stream in between filesystem and its user with both read and write being later performed simultaneously. And there it is semantically not easy to split the stream into two separate read-only and write-only channels: https://lab.nexedi.com/kirr/wendelin.core/blob/f13aa600/wcfs/wcfs.go#L88-169 Let's fix this regression. The plan is: 1. We can't change nonseekable_open to include &~FMODE_ATOMIC_POS - doing so would break many in-kernel nonseekable_open users which actually use ppos in read/write handlers. 2. Add stream_open() to kernel to open stream-like non-seekable file descriptors. Read and write on such file descriptors would never use nor change ppos. And with that property on stream-like files read and write will be running without taking f_pos lock - i.e. read and write could be running simultaneously. 3. With semantic patch search and convert to stream_open all in-kernel nonseekable_open users for which read and write actually do not depend on ppos and where there is no other methods in file_operations which assume @offset access. 4. Add FOPEN_STREAM to fs/fuse/ and open in-kernel file-descriptors via steam_open if that bit is present in filesystem open reply. It was tempting to change fs/fuse/ open handler to use stream_open instead of nonseekable_open on just FOPEN_NONSEEKABLE flags, but grepping through Debian codesearch shows users of FOPEN_NONSEEKABLE, and in particular GVFS which actually uses offset in its read and write handlers https://codesearch.debian.net/search?q=-%3Enonseekable+%3D https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1080 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1247-1346 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1399-1481 so if we would do such a change it will break a real user. 5. Add stream_open and FOPEN_STREAM handling to stable kernels starting from v3.14+ (the kernel where 9c225f2655 first appeared). This will allow to patch OSSPD and other FUSE filesystems that provide stream-like files to return FOPEN_STREAM | FOPEN_NONSEEKABLE in their open handler and this way avoid the deadlock on all kernel versions. This should work because fs/fuse/ ignores unknown open flags returned from a filesystem and so passing FOPEN_STREAM to a kernel that is not aware of this flag cannot hurt. In turn the kernel that is not aware of FOPEN_STREAM will be < v3.14 where just FOPEN_NONSEEKABLE is sufficient to implement streams without read vs write deadlock. This patch adds stream_open, converts /proc/xen/xenbus to it and adds semantic patch to automatically locate in-kernel places that are either required to be converted due to read vs write deadlock, or that are just safe to be converted because read and write do not use ppos and there are no other funky methods in file_operations. Regarding semantic patch I've verified each generated change manually - that it is correct to convert - and each other nonseekable_open instance left - that it is either not correct to convert there, or that it is not converted due to current stream_open.cocci limitations. The script also does not convert files that should be valid to convert, but that currently have .llseek = noop_llseek or generic_file_llseek for unknown reason despite file being opened with nonseekable_open (e.g. drivers/input/mousedev.c) Cc: Michael Kerrisk Cc: Yongzhi Pan Cc: Jonathan Corbet Cc: David Vrabel Cc: Juergen Gross Cc: Miklos Szeredi Cc: Tejun Heo Cc: Kirill Tkhai Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: Julia Lawall Cc: Nikolaus Rath Cc: Han-Wen Nienhuys Signed-off-by: Kirill Smelkov Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b42df09b04c..dd28e7679089 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -158,6 +158,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_OPENED ((__force fmode_t)0x80000) #define FMODE_CREATED ((__force fmode_t)0x100000) +/* File is stream-like */ +#define FMODE_STREAM ((__force fmode_t)0x200000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) @@ -3074,6 +3077,7 @@ extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); extern loff_t no_seek_end_llseek(struct file *, loff_t, int); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); +extern int stream_open(struct inode * inode, struct file * filp); #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(struct bio *bio, struct inode *inode, -- cgit v1.2.3 From 1200e07f3ad4b9d976cf2fff3a0c3d9a1faecb3e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 8 Apr 2019 19:02:38 +0800 Subject: block: don't use for-inside-for in bio_for_each_segment_all Commit 6dc4f100c175 ("block: allow bio_for_each_segment_all() to iterate over multi-page bvec") changes bio_for_each_segment_all() to use for-inside-for. This way breaks all bio_for_each_segment_all() call with error out branch via 'break', since now 'break' can only break from the inner loop. Fixes this issue by implementing bio_for_each_segment_all() via single 'for' loop, and now the logic is very similar with normal bvec iterator. Cc: Qu Wenruo Cc: linux-btrfs@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: Omar Sandoval Reviewed-by: Johannes Thumshirn Reported-and-Tested-by: Qu Wenruo Fixes: 6dc4f100c175 ("block: allow bio_for_each_segment_all() to iterate over multi-page bvec") Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 20 ++++++++++++-------- include/linux/bvec.h | 14 ++++++++++---- 2 files changed, 22 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index bb6090aa165d..e584673c1881 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -120,19 +120,23 @@ static inline bool bio_full(struct bio *bio) return bio->bi_vcnt >= bio->bi_max_vecs; } -#define mp_bvec_for_each_segment(bv, bvl, i, iter_all) \ - for (bv = bvec_init_iter_all(&iter_all); \ - (iter_all.done < (bvl)->bv_len) && \ - (mp_bvec_next_segment((bvl), &iter_all), 1); \ - iter_all.done += bv->bv_len, i += 1) +static inline bool bio_next_segment(const struct bio *bio, + struct bvec_iter_all *iter) +{ + if (iter->idx >= bio->bi_vcnt) + return false; + + bvec_advance(&bio->bi_io_vec[iter->idx], iter); + return true; +} /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it */ -#define bio_for_each_segment_all(bvl, bio, i, iter_all) \ - for (i = 0, iter_all.idx = 0; iter_all.idx < (bio)->bi_vcnt; iter_all.idx++) \ - mp_bvec_for_each_segment(bvl, &((bio)->bi_io_vec[iter_all.idx]), i, iter_all) +#define bio_for_each_segment_all(bvl, bio, i, iter) \ + for (i = 0, bvl = bvec_init_iter_all(&iter); \ + bio_next_segment((bio), &iter); i++) static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, unsigned bytes) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index f6275c4da13a..3bc91879e1e2 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -145,18 +145,18 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all) { - iter_all->bv.bv_page = NULL; iter_all->done = 0; + iter_all->idx = 0; return &iter_all->bv; } -static inline void mp_bvec_next_segment(const struct bio_vec *bvec, - struct bvec_iter_all *iter_all) +static inline void bvec_advance(const struct bio_vec *bvec, + struct bvec_iter_all *iter_all) { struct bio_vec *bv = &iter_all->bv; - if (bv->bv_page) { + if (iter_all->done) { bv->bv_page = nth_page(bv->bv_page, 1); bv->bv_offset = 0; } else { @@ -165,6 +165,12 @@ static inline void mp_bvec_next_segment(const struct bio_vec *bvec, } bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset, bvec->bv_len - iter_all->done); + iter_all->done += bv->bv_len; + + if (iter_all->done == bvec->bv_len) { + iter_all->idx++; + iter_all->done = 0; + } } /* -- cgit v1.2.3 From cf94db21905333e610e479688add629397a4b384 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 8 Apr 2019 14:33:22 +0200 Subject: virtio: Honour 'may_reduce_num' in vring_create_virtqueue vring_create_virtqueue() allows the caller to specify via the may_reduce_num parameter whether the vring code is allowed to allocate a smaller ring than specified. However, the split ring allocation code tries to allocate a smaller ring on allocation failure regardless of what the caller specified. This may cause trouble for e.g. virtio-pci in legacy mode, which does not support ring resizing. (The packed ring code does not resize in any case.) Let's fix this by bailing out immediately in the split ring code if the requested size cannot be allocated and may_reduce_num has not been specified. While at it, fix a typo in the usage instructions. Fixes: 2a2d1382fe9d ("virtio: Add improved queue allocation API") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin Reviewed-by: Halil Pasic Reviewed-by: Jens Freimann --- include/linux/virtio_ring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index fab02133a919..3dc70adfe5f5 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -63,7 +63,7 @@ struct virtqueue; /* * Creates a virtqueue and allocates the descriptor ring. If * may_reduce_num is set, then this may allocate a smaller ring than - * expected. The caller should query virtqueue_get_ring_size to learn + * expected. The caller should query virtqueue_get_vring_size to learn * the actual size of the ring. */ struct virtqueue *vring_create_virtqueue(unsigned int index, -- cgit v1.2.3 From 1b8f21b74c3c9c82fce5a751d7aefb7cc0b8d33d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Apr 2019 06:31:21 +0800 Subject: blk-mq: introduce blk_mq_complete_request_sync() In NVMe's error handler, follows the typical steps of tearing down hardware for recovering controller: 1) stop blk_mq hw queues 2) stop the real hw queues 3) cancel in-flight requests via blk_mq_tagset_busy_iter(tags, cancel_request, ...) cancel_request(): mark the request as abort blk_mq_complete_request(req); 4) destroy real hw queues However, there may be race between #3 and #4, because blk_mq_complete_request() may run q->mq_ops->complete(rq) remotelly and asynchronously, and ->complete(rq) may be run after #4. This patch introduces blk_mq_complete_request_sync() for fixing the above race. Cc: Sagi Grimberg Cc: Bart Van Assche Cc: James Smart Cc: linux-nvme@lists.infradead.org Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index cb2aa7ecafff..db29928de467 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -302,6 +302,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); bool blk_mq_complete_request(struct request *rq); +void blk_mq_complete_request_sync(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio); bool blk_mq_queue_stopped(struct request_queue *q); -- cgit v1.2.3 From 7c2e07130090ae001a97a6b65597830d6815e93e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=BCller?= Date: Mon, 8 Apr 2019 15:33:54 +0200 Subject: clk: x86: Add system specific quirk to mark clocks as critical MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 648e921888ad ("clk: x86: Stop marking clocks as CLK_IS_CRITICAL"), the pmc_plt_clocks of the Bay Trail SoC are unconditionally gated off. Unfortunately this will break systems where these clocks are used for external purposes beyond the kernel's knowledge. Fix it by implementing a system specific quirk to mark the necessary pmc_plt_clks as critical. Fixes: 648e921888ad ("clk: x86: Stop marking clocks as CLK_IS_CRITICAL") Signed-off-by: David Müller Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Signed-off-by: Stephen Boyd --- include/linux/platform_data/x86/clk-pmc-atom.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/clk-pmc-atom.h b/include/linux/platform_data/x86/clk-pmc-atom.h index 3ab892208343..7a37ac27d0fb 100644 --- a/include/linux/platform_data/x86/clk-pmc-atom.h +++ b/include/linux/platform_data/x86/clk-pmc-atom.h @@ -35,10 +35,13 @@ struct pmc_clk { * * @base: PMC clock register base offset * @clks: pointer to set of registered clocks, typically 0..5 + * @critical: flag to indicate if firmware enabled pmc_plt_clks + * should be marked as critial or not */ struct pmc_clk_data { void __iomem *base; const struct pmc_clk *clks; + bool critical; }; #endif /* __PLATFORM_DATA_X86_CLK_PMC_ATOM_H */ -- cgit v1.2.3 From 8065a779f17e94536a1c4dcee4f9d88011672f97 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Mon, 8 Apr 2019 19:45:27 -0400 Subject: failover: allow name change on IFF_UP slave interfaces When a netdev appears through hot plug then gets enslaved by a failover master that is already up and running, the slave will be opened right away after getting enslaved. Today there's a race that userspace (udev) may fail to rename the slave if the kernel (net_failover) opens the slave earlier than when the userspace rename happens. Unlike bond or team, the primary slave of failover can't be renamed by userspace ahead of time, since the kernel initiated auto-enslavement is unable to, or rather, is never meant to be synchronized with the rename request from userspace. As the failover slave interfaces are not designed to be operated directly by userspace apps: IP configuration, filter rules with regard to network traffic passing and etc., should all be done on master interface. In general, userspace apps only care about the name of master interface, while slave names are less important as long as admin users can see reliable names that may carry other information describing the netdev. For e.g., they can infer that "ens3nsby" is a standby slave of "ens3", while for a name like "eth0" they can't tell which master it belongs to. Historically the name of IFF_UP interface can't be changed because there might be admin script or management software that is already relying on such behavior and assumes that the slave name can't be changed once UP. But failover is special: with the in-kernel auto-enslavement mechanism, the userspace expectation for device enumeration and bring-up order is already broken. Previously initramfs and various userspace config tools were modified to bypass failover slaves because of auto-enslavement and duplicate MAC address. Similarly, in case that users care about seeing reliable slave name, the new type of failover slaves needs to be taken care of specifically in userspace anyway. It's less risky to lift up the rename restriction on failover slave which is already UP. Although it's possible this change may potentially break userspace component (most likely configuration scripts or management software) that assumes slave name can't be changed while UP, it's relatively a limited and controllable set among all userspace components, which can be fixed specifically to listen for the rename events on failover slaves. Userspace component interacting with slaves is expected to be changed to operate on failover master interface instead, as the failover slave is dynamic in nature which may come and go at any point. The goal is to make the role of failover slaves less relevant, and userspace components should only deal with failover master in the long run. Fixes: 30c8bd5aa8b2 ("net: Introduce generic failover module") Signed-off-by: Si-Wei Liu Reviewed-by: Liran Alon Acked-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 26f69cf763f4..324e872c91d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1500,6 +1500,7 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device + * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1532,6 +1533,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, + IFF_LIVE_RENAME_OK = 1<<30, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1563,6 +1565,7 @@ enum netdev_priv_flags { #define IFF_FAILOVER IFF_FAILOVER #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER +#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK /** * struct net_device - The DEVICE structure. -- cgit v1.2.3 From d808b7f759b50acf0784ce6230ffa63e12ef465d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 9 Apr 2019 10:03:59 -0600 Subject: nvmet: fix discover log page when offsets are used The nvme target hadn't been taking the Get Log Page offset parameter into consideration, and so has been returning corrupted log pages when offsets are used. Since many tools, including nvme-cli, split the log request to 4k, we've been breaking discovery log responses when more than 3 subsystems exist. Fix the returned data by internally generating the entire discovery log page and copying only the requested bytes into the user buffer. The command log page offset type has been modified to a native __le64 to make it easier to extract the value from a command. Signed-off-by: Keith Busch Tested-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index baa49e6a23cc..c40720cb59ac 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -967,8 +967,13 @@ struct nvme_get_log_page_command { __le16 numdl; __le16 numdu; __u16 rsvd11; - __le32 lpol; - __le32 lpou; + union { + struct { + __le32 lpol; + __le32 lpou; + }; + __le64 lpo; + }; __u32 rsvd14[2]; }; -- cgit v1.2.3 From af6b61d7ef58099c82d854395a0e002be6bd036c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 11 Apr 2019 15:16:52 -0400 Subject: Revert "SUNRPC: Micro-optimise when the task is known not to be sleeping" This reverts commit 009a82f6437490c262584d65a14094a818bcb747. The ability to optimise here relies on compiler being able to optimise away tail calls to avoid stack overflows. Unfortunately, we are seeing reports of problems, so let's just revert. Reported-by: Daniel Mack Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ec861cd0cfe8..52d41d0c1ae1 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -304,12 +304,4 @@ rpc_clnt_swap_deactivate(struct rpc_clnt *clnt) } #endif /* CONFIG_SUNRPC_SWAP */ -static inline bool -rpc_task_need_resched(const struct rpc_task *task) -{ - if (RPC_IS_QUEUED(task) || task->tk_callback) - return true; - return false; -} - #endif /* _LINUX_SUNRPC_SCHED_H_ */ -- cgit v1.2.3 From 77f1e0a52d26242b6c2dba019f6ebebfb9ff701e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 18 Jan 2019 10:34:16 -0700 Subject: bfq: update internal depth state when queue depth changes A previous commit moved the shallow depth and BFQ depth map calculations to be done at init time, moving it outside of the hotter IO path. This potentially causes hangs if the users changes the depth of the scheduler map, by writing to the 'nr_requests' sysfs file for that device. Add a blk-mq-sched hook that allows blk-mq to inform the scheduler if the depth changes, so that the scheduler can update its internal state. Tested-by: Kai Krakow Reported-by: Paolo Valente Fixes: f0635b8a416e ("bfq: calculate shallow depths at init time") Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 2e9e2763bf47..6e8bc53740f0 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -31,6 +31,7 @@ struct elevator_mq_ops { void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*depth_updated)(struct blk_mq_hw_ctx *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); -- cgit v1.2.3 From f958d7b528b1b40c44cfda5eabe2d82760d868c3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 11 Apr 2019 10:06:20 -0700 Subject: mm: make page ref count overflow check tighter and more explicit We have a VM_BUG_ON() to check that the page reference count doesn't underflow (or get close to overflow) by checking the sign of the count. That's all fine, but we actually want to allow people to use a "get page ref unless it's already very high" helper function, and we want that one to use the sign of the page ref (without triggering this VM_BUG_ON). Change the VM_BUG_ON to only check for small underflows (or _very_ close to overflowing), and ignore overflows which have strayed into negative territory. Acked-by: Matthew Wilcox Cc: Jann Horn Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bb6408fe73..541d99b86aea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -965,6 +965,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page) } #endif /* CONFIG_DEV_PAGEMAP_OPS */ +/* 127: arbitrary random number, small enough to assemble well */ +#define page_ref_zero_or_close_to_overflow(page) \ + ((unsigned int) page_ref_count(page) + 127u <= 127u) + static inline void get_page(struct page *page) { page = compound_head(page); @@ -972,7 +976,7 @@ static inline void get_page(struct page *page) * Getting a normal page or the head of a compound page * requires to already have an elevated page->_refcount. */ - VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); + VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); page_ref_inc(page); } -- cgit v1.2.3 From 88b1a17dfc3ed7728316478fae0f5ad508f50397 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 11 Apr 2019 10:14:59 -0700 Subject: mm: add 'try_get_page()' helper function This is the same as the traditional 'get_page()' function, but instead of unconditionally incrementing the reference count of the page, it only does so if the count was "safe". It returns whether the reference count was incremented (and is marked __must_check, since the caller obviously has to be aware of it). Also like 'get_page()', you can't use this function unless you already had a reference to the page. The intent is that you can use this exactly like get_page(), but in situations where you want to limit the maximum reference count. The code currently does an unconditional WARN_ON_ONCE() if we ever hit the reference count issues (either zero or negative), as a notification that the conditional non-increment actually happened. NOTE! The count access for the "safety" check is inherently racy, but that doesn't matter since the buffer we use is basically half the range of the reference count (ie we look at the sign of the count). Acked-by: Matthew Wilcox Cc: Jann Horn Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 541d99b86aea..7000ddd807e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -980,6 +980,15 @@ static inline void get_page(struct page *page) page_ref_inc(page); } +static inline __must_check bool try_get_page(struct page *page) +{ + page = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) + return false; + page_ref_inc(page); + return true; +} + static inline void put_page(struct page *page) { page = compound_head(page); -- cgit v1.2.3 From 15fab63e1e57be9fdb5eec1bbc5916e9825e9acb Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 5 Apr 2019 14:02:10 -0700 Subject: fs: prevent page refcount overflow in pipe_buf_get Change pipe_buf_get() to return a bool indicating whether it succeeded in raising the refcount of the page (if the thing in the pipe is a page). This removes another mechanism for overflowing the page refcount. All callers converted to handle a failure. Reported-by: Jann Horn Signed-off-by: Matthew Wilcox Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 5a3bb3b7c9ad..3f2a42c11e20 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -108,18 +108,20 @@ struct pipe_buf_operations { /* * Get a reference to the pipe buffer. */ - void (*get)(struct pipe_inode_info *, struct pipe_buffer *); + bool (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; /** * pipe_buf_get - get a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to + * + * Return: %true if the reference was successfully obtained. */ -static inline void pipe_buf_get(struct pipe_inode_info *pipe, +static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - buf->ops->get(pipe, buf); + return buf->ops->get(pipe, buf); } /** @@ -178,7 +180,7 @@ struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ -void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); +bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); -- cgit v1.2.3 From 0082517fa4bce073e7cf542633439f26538a14cc Mon Sep 17 00:00:00 2001 From: Jian-Hong Pan Date: Fri, 12 Apr 2019 16:01:53 +0800 Subject: x86/reboot, efi: Use EFI reboot for Acer TravelMate X514-51T Upon reboot, the Acer TravelMate X514-51T laptop appears to complete the shutdown process, but then it hangs in BIOS POST with a black screen. The problem is intermittent - at some points it has appeared related to Secure Boot settings or different kernel builds, but ultimately we have not been able to identify the exact conditions that trigger the issue to come and go. Besides, the EFI mode cannot be disabled in the BIOS of this model. However, after extensive testing, we observe that using the EFI reboot method reliably avoids the issue in all cases. So add a boot time quirk to use EFI reboot on such systems. Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=203119 Signed-off-by: Jian-Hong Pan Signed-off-by: Daniel Drake Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux@endlessm.com Link: http://lkml.kernel.org/r/20190412080152.3718-1-jian-hong@endlessm.com [ Fix !CONFIG_EFI build failure, clarify the code and the changelog a bit. ] Signed-off-by: Ingo Molnar --- include/linux/efi.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 54357a258b35..6ebc2098cfe1 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1611,7 +1611,12 @@ efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, struct screen_info *si, efi_guid_t *proto, unsigned long size); -bool efi_runtime_disabled(void); +#ifdef CONFIG_EFI +extern bool efi_runtime_disabled(void); +#else +static inline bool efi_runtime_disabled(void) { return true; } +#endif + extern void efi_call_virt_check_flags(unsigned long flags, const char *call); extern unsigned long efi_call_virt_save_flags(void); -- cgit v1.2.3 From 1d487e9bf8ba66a7174c56a0029c54b1eca8f99c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Apr 2019 11:16:47 +0200 Subject: KVM: fix spectrev1 gadgets These were found with smatch, and then generalized when applicable. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9d55c63db09b..640a03642766 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -513,10 +514,10 @@ static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { - /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu, in case - * the caller has read kvm->online_vcpus before (as is the case - * for kvm_for_each_vcpu, for example). - */ + int num_vcpus = atomic_read(&kvm->online_vcpus); + i = array_index_nospec(i, num_vcpus); + + /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu. */ smp_rmb(); return kvm->vcpus[i]; } @@ -600,6 +601,7 @@ void kvm_put_kvm(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { + as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM); return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); -- cgit v1.2.3 From 3ff9c075cc767b3060bdac12da72fc94dd7da1b8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 24 Feb 2019 01:49:52 +0900 Subject: x86/kprobes: Verify stack frame on kretprobe Verify the stack frame pointer on kretprobe trampoline handler, If the stack frame pointer does not match, it skips the wrong entry and tries to find correct one. This can happen if user puts the kretprobe on the function which can be used in the path of ftrace user-function call. Such functions should not be probed, so this adds a warning message that reports which function should be blacklisted. Tested-by: Andrea Righi Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/155094059185.6137.15527904013362842072.stgit@devbox Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 201f0f2683f2..9a897256e481 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -173,6 +173,7 @@ struct kretprobe_instance { struct kretprobe *rp; kprobe_opcode_t *ret_addr; struct task_struct *task; + void *fp; char data[0]; }; -- cgit v1.2.3 From af53d3e9e04024885de5b4fda51e5fa362ae2bd8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 18 Apr 2019 17:50:13 -0700 Subject: mm: swapoff: shmem_unuse() stop eviction without igrab() The igrab() in shmem_unuse() looks good, but we forgot that it gives no protection against concurrent unmounting: a point made by Konstantin Khlebnikov eight years ago, and then fixed in 2.6.39 by 778dd893ae78 ("tmpfs: fix race between umount and swapoff"). The current 5.1-rc swapoff is liable to hit "VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day..." followed by GPF. Once again, give up on using igrab(); but don't go back to making such heavy-handed use of shmem_swaplist_mutex as last time: that would spoil the new design, and I expect could deadlock inside shmem_swapin_page(). Instead, shmem_unuse() just raise a "stop_eviction" count in the shmem- specific inode, and shmem_evict_inode() wait for that to go down to 0. Call it "stop_eviction" rather than "swapoff_busy" because it can be put to use for others later (huge tmpfs patches expect to use it). That simplifies shmem_unuse(), protecting it from both unlink and unmount; and in practice lets it locate all the swap in its first try. But do not rely on that: there's still a theoretical case, when shmem_writepage() might have been preempted after its get_swap_page(), before making the swap entry visible to swapoff. [hughd@google.com: remove incorrect list_del()] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904091133570.1898@eggly.anvils Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904081259400.1523@eggly.anvils Fixes: b56a2d8af914 ("mm: rid swapoff of quadratic complexity") Signed-off-by: Hugh Dickins Cc: "Alex Xu (Hello71)" Cc: Huang Ying Cc: Kelley Nielsen Cc: Konstantin Khlebnikov Cc: Rik van Riel Cc: Vineeth Pillai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f3fb1edb3526..20d815a33145 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -21,6 +21,7 @@ struct shmem_inode_info { struct list_head swaplist; /* chain of maybes on swap */ struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ + atomic_t stop_eviction; /* hold when working on inode */ struct inode vfs_inode; }; -- cgit v1.2.3 From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 18 Apr 2019 17:50:52 -0700 Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping The core dumping code has always run without holding the mmap_sem for writing, despite that is the only way to ensure that the entire vma layout will not change from under it. Only using some signal serialization on the processes belonging to the mm is not nearly enough. This was pointed out earlier. For example in Hugh's post from Jul 2017: https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils "Not strictly relevant here, but a related note: I was very surprised to discover, only quite recently, how handle_mm_fault() may be called without down_read(mmap_sem) - when core dumping. That seems a misguided optimization to me, which would also be nice to correct" In particular because the growsdown and growsup can move the vm_start/vm_end the various loops the core dump does around the vma will not be consistent if page faults can happen concurrently. Pretty much all users calling mmget_not_zero()/get_task_mm() and then taking the mmap_sem had the potential to introduce unexpected side effects in the core dumping code. Adding mmap_sem for writing around the ->core_dump invocation is a viable long term fix, but it requires removing all copy user and page faults and to replace them with get_dump_page() for all binary formats which is not suitable as a short term fix. For the time being this solution manually covers the places that can confuse the core dump either by altering the vma layout or the vma flags while it runs. Once ->core_dump runs under mmap_sem for writing the function mmget_still_valid() can be dropped. Allowing mmap_sem protected sections to run in parallel with the coredump provides some minor parallelism advantage to the swapoff code (which seems to be safe enough by never mangling any vma field and can keep doing swapins in parallel to the core dumping) and to some other corner case. In order to facilitate the backporting I added "Fixes: 86039bd3b4e6" however the side effect of this same race condition in /proc/pid/mem should be reproducible since before 2.6.12-rc2 so I couldn't add any other "Fixes:" because there's no hash beyond the git genesis commit. Because find_extend_vma() is the only location outside of the process context that could modify the "mm" structures under mmap_sem for reading, by adding the mmget_still_valid() check to it, all other cases that take the mmap_sem for reading don't need the new check after mmget_not_zero()/get_task_mm(). The expand_stack() in page fault context also doesn't need the new check, because all tasks under core dumping are frozen. Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization") Signed-off-by: Andrea Arcangeli Reported-by: Jann Horn Suggested-by: Oleg Nesterov Acked-by: Peter Xu Reviewed-by: Mike Rapoport Reviewed-by: Oleg Nesterov Reviewed-by: Jann Horn Acked-by: Jason Gunthorpe Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0cd9f10423fb..a3fda9f024c3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,27 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +/* + * This has to be called after a get_task_mm()/mmget_not_zero() + * followed by taking the mmap_sem for writing before modifying the + * vmas or anything the coredump pretends not to change from under it. + * + * NOTE: find_extend_vma() called from GUP context is the only place + * that can modify the "mm" (notably the vm_start/end) under mmap_sem + * for reading and outside the context of the process, so it is also + * the only case that holds the mmap_sem for reading that must call + * this function. Generally if the mmap_sem is hold for reading + * there's no need of this check after get_task_mm()/mmget_not_zero(). + * + * This function can be obsoleted and the check can be removed, after + * the coredump code will hold the mmap_sem for writing before + * invoking the ->core_dump methods. + */ +static inline bool mmget_still_valid(struct mm_struct *mm) +{ + return likely(!mm->core_state); +} + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. -- cgit v1.2.3 From b40fabc05ea047f6af5933d26a5483873340b0d4 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 19 Apr 2019 10:31:27 +0800 Subject: block: kill all_q_node in request_queue all_q_node has not been used since commit 4b855ad37194 ("blk-mq: Create hctx for each present CPU"), so remove it. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Signed-off-by: Hou Tao Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c58a3b2bf00..317ab30d2904 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -548,7 +548,6 @@ struct request_queue { struct rcu_head rcu_head; wait_queue_head_t mq_freeze_wq; struct percpu_ref q_usage_counter; - struct list_head all_q_node; struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; -- cgit v1.2.3 From 6bedf00e55e5dd0a4ed1ad3f06131edd6fb56ec8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 17 Apr 2019 09:11:26 +0800 Subject: block: make sure that bvec length can't be overflow bvec->bv_offset may be bigger than PAGE_SIZE sometimes, such as, when one bio is splitted in the middle of one bvec via bio_split(), and bi_iter.bi_bvec_done is used to build offset of the 1st bvec of remained bio. And the remained bio's bvec may be re-submitted to fs layer via ITER_IBVEC, such as loop and nvme-loop. So we have to make sure that every bvec's offset is less than PAGE_SIZE from bio_for_each_segment_all() because some drivers(loop, nvme-loop) passes the splitted bvec to fs layer via ITER_BVEC. This patch fixes this issue reported by Zhang Yi When running nvme/011. Cc: Christoph Hellwig Cc: Yi Zhang Reported-by: Yi Zhang Reviewed-by: Christoph Hellwig Fixes: 6dc4f100c175 ("block: allow bio_for_each_segment_all() to iterate over multi-page bvec") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bvec.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 3bc91879e1e2..ff13cbc1887d 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -160,8 +160,9 @@ static inline void bvec_advance(const struct bio_vec *bvec, bv->bv_page = nth_page(bv->bv_page, 1); bv->bv_offset = 0; } else { - bv->bv_page = bvec->bv_page; - bv->bv_offset = bvec->bv_offset; + bv->bv_page = bvec_nth_page(bvec->bv_page, bvec->bv_offset / + PAGE_SIZE); + bv->bv_offset = bvec->bv_offset & ~PAGE_MASK; } bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset, bvec->bv_len - iter_all->done); -- cgit v1.2.3 From c2b71462d294cf517a0bc6e4fd6424d7cee5596f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 19 Apr 2019 13:52:38 -0400 Subject: USB: core: Fix bug caused by duplicate interface PM usage counter The syzkaller fuzzer reported a bug in the USB hub driver which turned out to be caused by a negative runtime-PM usage counter. This allowed a hub to be runtime suspended at a time when the driver did not expect it. The symptom is a WARNING issued because the hub's status URB is submitted while it is already active: URB 0000000031fb463e submitted while active WARNING: CPU: 0 PID: 2917 at drivers/usb/core/urb.c:363 The negative runtime-PM usage count was caused by an unfortunate design decision made when runtime PM was first implemented for USB. At that time, USB class drivers were allowed to unbind from their interfaces without balancing the usage counter (i.e., leaving it with a positive count). The core code would take care of setting the counter back to 0 before allowing another driver to bind to the interface. Later on when runtime PM was implemented for the entire kernel, the opposite decision was made: Drivers were required to balance their runtime-PM get and put calls. In order to maintain backward compatibility, however, the USB subsystem adapted to the new implementation by keeping an independent usage counter for each interface and using it to automatically adjust the normal usage counter back to 0 whenever a driver was unbound. This approach involves duplicating information, but what is worse, it doesn't work properly in cases where a USB class driver delays decrementing the usage counter until after the driver's disconnect() routine has returned and the counter has been adjusted back to 0. Doing so would cause the usage counter to become negative. There's even a warning about this in the USB power management documentation! As it happens, this is exactly what the hub driver does. The kick_hub_wq() routine increments the runtime-PM usage counter, and the corresponding decrement is carried out by hub_event() in the context of the hub_wq work-queue thread. This work routine may sometimes run after the driver has been unbound from its interface, and when it does it causes the usage counter to go negative. It is not possible for hub_disconnect() to wait for a pending hub_event() call to finish, because hub_disconnect() is called with the device lock held and hub_event() acquires that lock. The only feasible fix is to reverse the original design decision: remove the duplicate interface-specific usage counter and require USB drivers to balance their runtime PM gets and puts. As far as I know, all existing drivers currently do this. Signed-off-by: Alan Stern Reported-and-tested-by: syzbot+7634edaea4d0b341c625@syzkaller.appspotmail.com CC: Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 5e49e82c4368..ff010d1fd1c7 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -200,7 +200,6 @@ usb_find_last_int_out_endpoint(struct usb_host_interface *alt, * @dev: driver model's view of this device * @usb_dev: if an interface is bound to the USB major, this will point * to the sysfs representation for that device. - * @pm_usage_cnt: PM usage counter for this interface * @reset_ws: Used for scheduling resets from atomic context. * @resetting_device: USB core reset the device, so use alt setting 0 as * current; needs bandwidth alloc after reset. @@ -257,7 +256,6 @@ struct usb_interface { struct device dev; /* interface specific device info */ struct device *usb_dev; - atomic_t pm_usage_cnt; /* usage counter for autosuspend */ struct work_struct reset_ws; /* for resets in atomic context */ }; #define to_usb_interface(d) container_of(d, struct usb_interface, dev) -- cgit v1.2.3 From 1c5c12ee308aacf635c8819cd4baa3bd58f8a8b7 Mon Sep 17 00:00:00 2001 From: Tao Ren Date: Wed, 24 Apr 2019 01:43:32 +0000 Subject: net/ncsi: handle overflow when incrementing mac address Previously BMC's MAC address is calculated by simply adding 1 to the last byte of network controller's MAC address, and it produces incorrect result when network controller's MAC address ends with 0xFF. The problem can be fixed by calling eth_addr_inc() function to increment MAC address; besides, the MAC address is also validated before assigning to BMC. Fixes: cb10c7c0dfd9 ("net/ncsi: Add NCSI Broadcom OEM command") Signed-off-by: Tao Ren Acked-by: Jakub Kicinski Acked-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index e2f3b21cd72a..aa8bfd6f738c 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -448,6 +448,18 @@ static inline void eth_addr_dec(u8 *addr) u64_to_ether_addr(u, addr); } +/** + * eth_addr_inc() - Increment the given MAC address. + * @addr: Pointer to a six-byte array containing Ethernet address to increment. + */ +static inline void eth_addr_inc(u8 *addr) +{ + u64 u = ether_addr_to_u64(addr); + + u++; + u64_to_ether_addr(u, addr); +} + /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure -- cgit v1.2.3 From 0edd6b64d1939e9e9168ff27947995bb7751db5d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Apr 2019 21:55:59 +0200 Subject: bpf: Fix preempt_enable_no_resched() abuse Unless the very next line is schedule(), or implies it, one must not use preempt_enable_no_resched(). It can cause a preemption to go missing and thereby cause arbitrary delays, breaking the PREEMPT=y invariant. Cc: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f02367faa58d..944ccc310201 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -510,7 +510,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, } \ _out: \ rcu_read_unlock(); \ - preempt_enable_no_resched(); \ + preempt_enable(); \ _ret; \ }) -- cgit v1.2.3 From b987222654f84f7b4ca95b3a55eca784cb30235b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Apr 2019 23:59:25 +0200 Subject: tracing: Fix buffer_ref pipe ops This fixes multiple issues in buffer_pipe_buf_ops: - The ->steal() handler must not return zero unless the pipe buffer has the only reference to the page. But generic_pipe_buf_steal() assumes that every reference to the pipe is tracked by the page's refcount, which isn't true for these buffers - buffer_pipe_buf_get(), which duplicates a buffer, doesn't touch the page's refcount. Fix it by using generic_pipe_buf_nosteal(), which refuses every attempted theft. It should be easy to actually support ->steal, but the only current users of pipe_buf_steal() are the virtio console and FUSE, and they also only use it as an optimization. So it's probably not worth the effort. - The ->get() and ->release() handlers can be invoked concurrently on pipe buffers backed by the same struct buffer_ref. Make them safe against concurrency by using refcount_t. - The pointers stored in ->private were only zeroed out when the last reference to the buffer_ref was dropped. As far as I know, this shouldn't be necessary anyway, but if we do it, let's always do it. Link: http://lkml.kernel.org/r/20190404215925.253531-1-jannh@google.com Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Al Viro Cc: stable@vger.kernel.org Fixes: 73a757e63114d ("ring-buffer: Return reader page back into existing ring buffer") Signed-off-by: Jann Horn Signed-off-by: Steven Rostedt (VMware) --- include/linux/pipe_fs_i.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 787d224ff43e..a830e9a00eb9 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -174,6 +174,7 @@ void free_pipe_info(struct pipe_inode_info *); void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); +int generic_pipe_buf_nosteal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); void pipe_buf_mark_unmergeable(struct pipe_buffer *buf); -- cgit v1.2.3 From f5eb4d3b92a6a1096ef3480b54782a9409281300 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 26 Apr 2019 18:45:21 +0800 Subject: iov_iter: fix iov_iter_type Commit 875f1d0769cd ("iov_iter: add ITER_BVEC_FLAG_NO_REF flag") introduces one extra flag of ITER_BVEC_FLAG_NO_REF, and this flag is stored into iter->type. However, iov_iter_type() doesn't consider the new added flag, fix it by masking this flag in iov_iter_type(). Fixes: 875f1d0769cd ("iov_iter: add ITER_BVEC_FLAG_NO_REF flag") Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/uio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index f184af1999a8..2d0131ad4604 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -60,7 +60,7 @@ struct iov_iter { static inline enum iter_type iov_iter_type(const struct iov_iter *i) { - return i->type & ~(READ | WRITE); + return i->type & ~(READ | WRITE | ITER_BVEC_FLAG_NO_REF); } static inline bool iter_is_iovec(const struct iov_iter *i) -- cgit v1.2.3