From f7757368e0f0b3e108088ca7b5b8abda6faa7ebc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Apr 2020 17:40:33 -0400 Subject: sched/core: Fix illegal RCU from offline CPUs [ Upstream commit bf2c59fce4074e55d622089b34be3a6bc95484fb ] In the CPU-offline process, it calls mmdrop() after idle entry and the subsequent call to cpuhp_report_idle_dead(). Once execution passes the call to rcu_report_dead(), RCU is ignoring the CPU, which results in lockdep complaining when mmdrop() uses RCU from either memcg or debugobjects below. Fix it by cleaning up the active_mm state from BP instead. Every arch which has CONFIG_HOTPLUG_CPU should have already called idle_task_exit() from AP. The only exception is parisc because it switches them to &init_mm unconditionally (see smp_boot_one_cpu() and smp_cpu_init()), but the patch will still work there because it calls mmgrab(&init_mm) in smp_cpu_init() and then should call mmdrop(&init_mm) in finish_cpu(). WARNING: suspicious RCU usage ----------------------------- kernel/workqueue.c:710 RCU or wq_pool_mutex should be held! other info that might help us debug this: RCU used illegally from offline CPU! Call Trace: dump_stack+0xf4/0x164 (unreliable) lockdep_rcu_suspicious+0x140/0x164 get_work_pool+0x110/0x150 __queue_work+0x1bc/0xca0 queue_work_on+0x114/0x120 css_release+0x9c/0xc0 percpu_ref_put_many+0x204/0x230 free_pcp_prepare+0x264/0x570 free_unref_page+0x38/0xf0 __mmdrop+0x21c/0x2c0 idle_task_exit+0x170/0x1b0 pnv_smp_cpu_kill_self+0x38/0x2e0 cpu_die+0x48/0x64 arch_cpu_idle_dead+0x30/0x50 do_idle+0x2f4/0x470 cpu_startup_entry+0x38/0x40 start_secondary+0x7a8/0xa80 start_secondary_resume+0x10/0x14 Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michael Ellerman (powerpc) Link: https://lkml.kernel.org/r/20200401214033.8448-1-cai@lca.pw Signed-off-by: Sasha Levin --- include/linux/sched/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index c49257a3b510..a132d875d351 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,8 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +void mmdrop(struct mm_struct *mm); + /* * This has to be called after a get_task_mm()/mmget_not_zero() * followed by taking the mmap_sem for writing before modifying the -- cgit v1.2.3 From 4f1a132115ef54eb4136ae24035ee62c9f41c201 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Wed, 6 May 2020 17:42:23 +0100 Subject: kgdb: Fix spurious true from in_dbg_master() [ Upstream commit 3fec4aecb311995189217e64d725cfe84a568de3 ] Currently there is a small window where a badly timed migration could cause in_dbg_master() to spuriously return true. Specifically if we migrate to a new core after reading the processor id and the previous core takes a breakpoint then we will evaluate true if we read kgdb_active before we get the IPI to bring us to halt. Fix this by checking irqs_disabled() first. Interrupts are always disabled when we are executing the kgdb trap so this is an acceptable prerequisite. This also allows us to replace raw_smp_processor_id() with smp_processor_id() since the short circuit logic will prevent warnings from PREEMPT_DEBUG. Fixes: dcc7871128e9 ("kgdb: core changes to support kdb") Suggested-by: Will Deacon Link: https://lore.kernel.org/r/20200506164223.2875760-1-daniel.thompson@linaro.org Reviewed-by: Douglas Anderson Signed-off-by: Daniel Thompson Signed-off-by: Sasha Levin --- include/linux/kgdb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index b072aeb1fd78..4d6fe87fd38f 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -323,7 +323,7 @@ extern void gdbstub_exit(int status); extern int kgdb_single_step; extern atomic_t kgdb_active; #define in_dbg_master() \ - (raw_smp_processor_id() == atomic_read(&kgdb_active)) + (irqs_disabled() && (smp_processor_id() == atomic_read(&kgdb_active))) extern bool dbg_is_early; extern void __init dbg_late_init(void); extern void kgdb_panic(const char *msg); -- cgit v1.2.3 From e7b1564a24e62b302ce9cb39765f0ac9814ffabd Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:06:59 -0700 Subject: bpf: Fix running sk_skb program types with ktls [ Upstream commit e91de6afa81c10e9f855c5695eb9a53168d96b73 ] KTLS uses a stream parser to collect TLS messages and send them to the upper layer tls receive handler. This ensures the tls receiver has a full TLS header to parse when it is run. However, when a socket has BPF_SK_SKB_STREAM_VERDICT program attached before KTLS is enabled we end up with two stream parsers running on the same socket. The result is both try to run on the same socket. First the KTLS stream parser runs and calls read_sock() which will tcp_read_sock which in turn calls tcp_rcv_skb(). This dequeues the skb from the sk_receive_queue. When this is done KTLS code then data_ready() callback which because we stacked KTLS on top of the bpf stream verdict program has been replaced with sk_psock_start_strp(). This will in turn kick the stream parser again and eventually do the same thing KTLS did above calling into tcp_rcv_skb() and dequeuing a skb from the sk_receive_queue. At this point the data stream is broke. Part of the stream was handled by the KTLS side some other bytes may have been handled by the BPF side. Generally this results in either missing data or more likely a "Bad Message" complaint from the kTLS receive handler as the BPF program steals some bytes meant to be in a TLS header and/or the TLS header length is no longer correct. We've already broke the idealized model where we can stack ULPs in any order with generic callbacks on the TX side to handle this. So in this patch we do the same thing but for RX side. We add a sk_psock_strp_enabled() helper so TLS can learn a BPF verdict program is running and add a tls_sw_has_ctx_rx() helper so BPF side can learn there is a TLS ULP on the socket. Then on BPF side we omit calling our stream parser to avoid breaking the data stream for the KTLS receiver. Then on the KTLS side we call BPF_SK_SKB_STREAM_VERDICT once the KTLS receiver is done with the packet but before it posts the msg to userspace. This gives us symmetry between the TX and RX halfs and IMO makes it usable again. On the TX side we process packets in this order BPF -> TLS -> TCP and on the receive side in the reverse order TCP -> TLS -> BPF. Discovered while testing OpenSSL 3.0 Alpha2.0 release. Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159079361946.5745.605854335665044485.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- include/linux/skmsg.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index a3adbe593505..4bdb5e4bbd6a 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -457,4 +457,12 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->skb_verdict, NULL); } +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); + +static inline bool sk_psock_strp_enabled(struct sk_psock *psock) +{ + if (!psock) + return false; + return psock->parser.enabled; +} #endif /* _LINUX_SKMSG_H */ -- cgit v1.2.3 From b008ae4cc74d3ab43074099746f6c32d353e01aa Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Wed, 3 Jun 2020 15:56:46 -0700 Subject: string.h: fix incompatibility between FORTIFY_SOURCE and KASAN [ Upstream commit 47227d27e2fcb01a9e8f5958d8997cf47a820afc ] The memcmp KASAN self-test fails on a kernel with both KASAN and FORTIFY_SOURCE. When FORTIFY_SOURCE is on, a number of functions are replaced with fortified versions, which attempt to check the sizes of the operands. However, these functions often directly invoke __builtin_foo() once they have performed the fortify check. Using __builtins may bypass KASAN checks if the compiler decides to inline it's own implementation as sequence of instructions, rather than emit a function call that goes out to a KASAN-instrumented implementation. Why is only memcmp affected? ============================ Of the string and string-like functions that kasan_test tests, only memcmp is replaced by an inline sequence of instructions in my testing on x86 with gcc version 9.2.1 20191008 (Ubuntu 9.2.1-9ubuntu2). I believe this is due to compiler heuristics. For example, if I annotate kmalloc calls with the alloc_size annotation (and disable some fortify compile-time checking!), the compiler will replace every memset except the one in kmalloc_uaf_memset with inline instructions. (I have some WIP patches to add this annotation.) Does this affect other functions in string.h? ============================================= Yes. Anything that uses __builtin_* rather than __real_* could be affected. This looks like: - strncpy - strcat - strlen - strlcpy maybe, under some circumstances? - strncat under some circumstances - memset - memcpy - memmove - memcmp (as noted) - memchr - strcpy Whether a function call is emitted always depends on the compiler. Most bugs should get caught by FORTIFY_SOURCE, but the missed memcmp test shows that this is not always the case. Isn't FORTIFY_SOURCE disabled with KASAN? ========================================- The string headers on all arches supporting KASAN disable fortify with kasan, but only when address sanitisation is _also_ disabled. For example from x86: #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) /* * For files that are not instrumented (e.g. mm/slub.c) we * should use not instrumented version of mem* functions. */ #define memcpy(dst, src, len) __memcpy(dst, src, len) #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) #ifndef __NO_FORTIFY #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ #endif #endif This comes from commit 6974f0c4555e ("include/linux/string.h: add the option of fortified string.h functions"), and doesn't work when KASAN is enabled and the file is supposed to be sanitised - as with test_kasan.c I'm pretty sure this is not wrong, but not as expansive it should be: * we shouldn't use __builtin_memcpy etc in files where we don't have instrumentation - it could devolve into a function call to memcpy, which will be instrumented. Rather, we should use __memcpy which by convention is not instrumented. * we also shouldn't be using __builtin_memcpy when we have a KASAN instrumented file, because it could be replaced with inline asm that will not be instrumented. What is correct behaviour? ========================== Firstly, there is some overlap between fortification and KASAN: both provide some level of _runtime_ checking. Only fortify provides compile-time checking. KASAN and fortify can pick up different things at runtime: - Some fortify functions, notably the string functions, could easily be modified to consider sub-object sizes (e.g. members within a struct), and I have some WIP patches to do this. KASAN cannot detect these because it cannot insert poision between members of a struct. - KASAN can detect many over-reads/over-writes when the sizes of both operands are unknown, which fortify cannot. So there are a couple of options: 1) Flip the test: disable fortify in santised files and enable it in unsanitised files. This at least stops us missing KASAN checking, but we lose the fortify checking. 2) Make the fortify code always call out to real versions. Do this only for KASAN, for fear of losing the inlining opportunities we get from __builtin_*. (We can't use kasan_check_{read,write}: because the fortify functions are _extern inline_, you can't include _static_ inline functions without a compiler warning. kasan_check_{read,write} are static inline so we can't use them even when they would otherwise be suitable.) Take approach 2 and call out to real versions when KASAN is enabled. Use __underlying_foo to distinguish from __real_foo: __real_foo always refers to the kernel's implementation of foo, __underlying_foo could be either the kernel implementation or the __builtin_foo implementation. This is sometimes enough to make the memcmp test succeed with FORTIFY_SOURCE enabled. It is at least enough to get the function call into the module. One more fix is needed to make it reliable: see the next patch. Fixes: 6974f0c4555e ("include/linux/string.h: add the option of fortified string.h functions") Signed-off-by: Daniel Axtens Signed-off-by: Andrew Morton Tested-by: David Gow Reviewed-by: Dmitry Vyukov Cc: Daniel Micay Cc: Andrey Ryabinin Cc: Alexander Potapenko Link: http://lkml.kernel.org/r/20200423154503.5103-3-dja@axtens.net Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/string.h | 60 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index b6ccdc2c7f02..b2264355272d 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -269,6 +269,31 @@ void __read_overflow3(void) __compiletime_error("detected read beyond size of ob void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + +#ifdef CONFIG_KASAN +extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); +extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); +extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); +extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); +extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); +extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); +extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); +extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); +extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); +extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); +#else +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp +#define __underlying_memcpy __builtin_memcpy +#define __underlying_memmove __builtin_memmove +#define __underlying_memset __builtin_memset +#define __underlying_strcat __builtin_strcat +#define __underlying_strcpy __builtin_strcpy +#define __underlying_strlen __builtin_strlen +#define __underlying_strncat __builtin_strncat +#define __underlying_strncpy __builtin_strncpy +#endif + __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); @@ -276,14 +301,14 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) __write_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_strncpy(p, q, size); + return __underlying_strncpy(p, q, size); } __FORTIFY_INLINE char *strcat(char *p, const char *q) { size_t p_size = __builtin_object_size(p, 0); if (p_size == (size_t)-1) - return __builtin_strcat(p, q); + return __underlying_strcat(p, q); if (strlcat(p, q, p_size) >= p_size) fortify_panic(__func__); return p; @@ -297,7 +322,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) /* Work around gcc excess stack consumption issue */ if (p_size == (size_t)-1 || (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) - return __builtin_strlen(p); + return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) fortify_panic(__func__); @@ -330,7 +355,7 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) __write_overflow(); if (len >= p_size) fortify_panic(__func__); - __builtin_memcpy(p, q, len); + __underlying_memcpy(p, q, len); p[len] = '\0'; } return ret; @@ -343,12 +368,12 @@ __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __builtin_strncat(p, q, count); + return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); if (p_size < p_len + copy_len + 1) fortify_panic(__func__); - __builtin_memcpy(p + p_len, q, copy_len); + __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } @@ -360,7 +385,7 @@ __FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size) __write_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_memset(p, c, size); + return __underlying_memset(p, c, size); } __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) @@ -375,7 +400,7 @@ __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memcpy(p, q, size); + return __underlying_memcpy(p, q, size); } __FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) @@ -390,7 +415,7 @@ __FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memmove(p, q, size); + return __underlying_memmove(p, q, size); } extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); @@ -416,7 +441,7 @@ __FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memcmp(p, q, size); + return __underlying_memcmp(p, q, size); } __FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) @@ -426,7 +451,7 @@ __FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) __read_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_memchr(p, c, size); + return __underlying_memchr(p, c, size); } void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); @@ -457,11 +482,22 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q) size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __builtin_strcpy(p, q); + return __underlying_strcpy(p, q); memcpy(p, q, strlen(q) + 1); return p; } +/* Don't use these outside the FORITFY_SOURCE implementation */ +#undef __underlying_memchr +#undef __underlying_memcmp +#undef __underlying_memcpy +#undef __underlying_memmove +#undef __underlying_memset +#undef __underlying_strcat +#undef __underlying_strcpy +#undef __underlying_strlen +#undef __underlying_strncat +#undef __underlying_strncpy #endif /** -- cgit v1.2.3 From c388f173ed8a6b8c7b8f90389d17f58d77d28e4f Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 3 Jun 2020 15:59:24 -0700 Subject: mm: initialize deferred pages with interrupts enabled commit 3d060856adfc59afb9d029c233141334cfaba418 upstream. Initializing struct pages is a long task and keeping interrupts disabled for the duration of this operation introduces a number of problems. 1. jiffies are not updated for long period of time, and thus incorrect time is reported. See proposed solution and discussion here: lkml/20200311123848.118638-1-shile.zhang@linux.alibaba.com 2. It prevents farther improving deferred page initialization by allowing intra-node multi-threading. We are keeping interrupts disabled to solve a rather theoretical problem that was never observed in real world (See 3a2d7fa8a3d5). Let's keep interrupts enabled. In case we ever encounter a scenario where an interrupt thread wants to allocate large amount of memory this early in boot we can deal with that by growing zone (see deferred_grow_zone()) by the needed amount before starting deferred_init_memmap() threads. Before: [ 1.232459] node 0 initialised, 12058412 pages in 1ms After: [ 1.632580] node 0 initialised, 12051227 pages in 436ms Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages") Reported-by: Shile Zhang Signed-off-by: Pavel Tatashin Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Dan Williams Cc: James Morris Cc: Kirill Tkhai Cc: Sasha Levin Cc: Yiqian Wei Cc: [4.17+] Link: http://lkml.kernel.org/r/20200403140952.17177-3-pasha.tatashin@soleen.com Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8b5f758942a2..85804ba62215 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -709,6 +709,8 @@ typedef struct pglist_data { /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. + * Also synchronizes pgdat->first_deferred_pfn during deferred page + * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG -- cgit v1.2.3 From b35415c0949a111f956dcc645b23d5ab03b9a5dc Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Tue, 12 Nov 2019 05:47:53 -0700 Subject: PCI: vmd: Add device id for VMD device 8086:9A0B [ Upstream commit ec11e5c213cc20cac5e8310728b06793448b9f6d ] This patch adds support for this VMD device which supports the bus restriction mode. Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Signed-off-by: Sasha Levin --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 228f66347620..dae9ffe4478f 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3008,6 +3008,7 @@ #define PCI_DEVICE_ID_INTEL_84460GX 0x84ea #define PCI_DEVICE_ID_INTEL_IXP4XX 0x8500 #define PCI_DEVICE_ID_INTEL_IXP2800 0x9004 +#define PCI_DEVICE_ID_INTEL_VMD_9A0B 0x9a0b #define PCI_DEVICE_ID_INTEL_S21152BB 0xb152 #define PCI_VENDOR_ID_SCALEMP 0x8686 -- cgit v1.2.3 From 9cfece5c0e2b774be5d613262b2df3789d2c5a3a Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 10 Jan 2020 01:56:49 +0000 Subject: x86/amd_nb: Add Family 19h PCI IDs [ Upstream commit b3f79ae45904ae987a7c06a9e8d6084d7b73e67f ] Add the new PCI Device 18h IDs for AMD Family 19h systems. Note that Family 19h systems will not have a new PCI root device ID. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200110015651.14887-4-Yazen.Ghannam@amd.com Signed-off-by: Sasha Levin --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index dae9ffe4478f..6693cf561cd1 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -549,6 +549,7 @@ #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F3 0x1493 #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443 +#define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653 #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 -- cgit v1.2.3 From 3c8938fb19edaa3542d9f3a22cd74e2d051a1ceb Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 10 Mar 2020 20:50:07 +0800 Subject: PCI: Add Loongson vendor ID [ Upstream commit 9acb9fe18d863aacc99948963f8d5d447dc311be ] Add the Loongson vendor ID to pci_ids.h to be used by the controller driver in the future. The Loongson vendor ID can be found at the following link: https://git.kernel.org/pub/scm/utils/pciutils/pciutils.git/tree/pci.ids Signed-off-by: Tiezhu Yang Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 6693cf561cd1..1dfc4e1dcb94 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -148,6 +148,8 @@ /* Vendors and devices. Sort key: vendor first, device next. */ +#define PCI_VENDOR_ID_LOONGSON 0x0014 + #define PCI_VENDOR_ID_TTTECH 0x0357 #define PCI_DEVICE_ID_TTTECH_MC322 0x000a -- cgit v1.2.3 From 44eec92cc4bff3cc9aa025706a625b940dd012c9 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 8 May 2020 14:53:40 +0800 Subject: serial: 8250_pci: Move Pericom IDs to pci_ids.h [ Upstream commit 62a7f3009a460001eb46984395280dd900bc4ef4 ] Move the IDs to pci_ids.h so it can be used by next patch. Link: https://lore.kernel.org/r/20200508065343.32751-1-kai.heng.feng@canonical.com Signed-off-by: Kai-Heng Feng Signed-off-by: Bjorn Helgaas Acked-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin --- include/linux/pci_ids.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 1dfc4e1dcb94..9a57e6717e5c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1832,6 +1832,12 @@ #define PCI_VENDOR_ID_NVIDIA_SGS 0x12d2 #define PCI_DEVICE_ID_NVIDIA_SGS_RIVA128 0x0018 +#define PCI_VENDOR_ID_PERICOM 0x12D8 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7951 0x7951 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7952 0x7952 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7954 0x7954 +#define PCI_DEVICE_ID_PERICOM_PI7C9X7958 0x7958 + #define PCI_SUBVENDOR_ID_CHASE_PCIFAST 0x12E0 #define PCI_SUBDEVICE_ID_CHASE_PCIFAST4 0x0031 #define PCI_SUBDEVICE_ID_CHASE_PCIFAST8 0x0021 -- cgit v1.2.3 From 70ce85319d450a8e566467eab3395d318a69f832 Mon Sep 17 00:00:00 2001 From: Alexander Monakov Date: Sun, 10 May 2020 20:48:40 +0000 Subject: x86/amd_nb: Add AMD family 17h model 60h PCI IDs [ Upstream commit a4e91825d7e1252f7cba005f1451e5464b23c15d ] Add PCI IDs for AMD Renoir (4000-series Ryzen CPUs). This is necessary to enable support for temperature sensors via the k10temp module. Signed-off-by: Alexander Monakov Signed-off-by: Borislav Petkov Acked-by: Yazen Ghannam Acked-by: Guenter Roeck Link: https://lkml.kernel.org/r/20200510204842.2603-2-amonakov@ispras.ru Signed-off-by: Sasha Levin --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 9a57e6717e5c..0ad57693f392 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -550,6 +550,7 @@ #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F3 0x1493 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443 #define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653 #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 -- cgit v1.2.3 From e5084eadf193b73e9e38b93f52fabad43d1fd268 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 22 May 2020 12:01:33 +1000 Subject: sunrpc: clean up properly in gss_mech_unregister() commit 24c5efe41c29ee3e55bcf5a1c9f61ca8709622e8 upstream. gss_mech_register() calls svcauth_gss_register_pseudoflavor() for each flavour, but gss_mech_unregister() does not call auth_domain_put(). This is unbalanced and makes it impossible to reload the module. Change svcauth_gss_register_pseudoflavor() to return the registered auth_domain, and save it for later release. Cc: stable@vger.kernel.org (v2.6.12+) Link: https://bugzilla.kernel.org/show_bug.cgi?id=206651 Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- include/linux/sunrpc/gss_api.h | 1 + include/linux/sunrpc/svcauth_gss.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index d4326d6662a4..b5a4eb14f809 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -85,6 +85,7 @@ struct pf_desc { u32 service; char *name; char *auth_domain_name; + struct auth_domain *domain; bool datatouch; }; diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h index a4528b26c8aa..d229d27ab19e 100644 --- a/include/linux/sunrpc/svcauth_gss.h +++ b/include/linux/sunrpc/svcauth_gss.h @@ -21,7 +21,8 @@ int gss_svc_init(void); void gss_svc_shutdown(void); int gss_svc_init_net(struct net *net); void gss_svc_shutdown_net(struct net *net); -int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name); +struct auth_domain *svcauth_gss_register_pseudoflavor(u32 pseudoflavor, + char *name); u32 svcauth_gss_flavor(struct auth_domain *dom); #endif /* __KERNEL__ */ -- cgit v1.2.3 From 805bae7592fa2cfede898b24e2a8d85fdd6317ee Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Sun, 10 May 2020 13:30:41 +0800 Subject: usb: gadget: core: sync interrupt before unbind the udc [ Upstream commit 3c73bc52195def14165c3a7d91bdbb33b51725f5 ] The threaded interrupt handler may still be called after the usb_gadget_disconnect is called, it causes the structures used at interrupt handler was freed before it uses, eg the usb_request. This issue usually occurs we remove the udc function during the transfer. Below is the example when doing stress test for android switch function, the EP0's request is freed by .unbind (configfs_composite_unbind -> composite_dev_cleanup), but the threaded handler accesses this request during handling setup packet request. In fact, there is no protection between unbind the udc and udc interrupt handling, so we have to avoid the interrupt handler is occurred or scheduled during the .unbind flow. init: Sending signal 9 to service 'adbd' (pid 18077) process group... android_work: did not send uevent (0 0 000000007bec2039) libprocessgroup: Successfully killed process cgroup uid 0 pid 18077 in 6ms init: Service 'adbd' (pid 18077) received signal 9 init: Sending signal 9 to service 'adbd' (pid 18077) process group... libprocessgroup: Successfully killed process cgroup uid 0 pid 18077 in 0ms init: processing action (init.svc.adbd=stopped) from (/init.usb.configfs.rc:14) init: Received control message 'start' for 'adbd' from pid: 399 (/vendor/bin/hw/android.hardware.usb@1. init: starting service 'adbd'... read descriptors read strings Unable to handle kernel read from unreadable memory at virtual address 000000000000002a android_work: sent uevent USB_STATE=CONNECTED Mem abort info: ESR = 0x96000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=00000000e97f1000 using random self ethernet address [000000000000002a] pgd=0000000000000000 Internal error: Oops: 96000004 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 232 Comm: irq/68-5b110000 Not tainted 5.4.24-06075-g94a6b52b5815 #92 Hardware name: Freescale i.MX8QXP MEK (DT) pstate: 00400085 (nzcv daIf +PAN -UAO) using random host ethernet address pc : composite_setup+0x5c/0x1730 lr : android_setup+0xc0/0x148 sp : ffff80001349bba0 x29: ffff80001349bba0 x28: ffff00083a50da00 x27: ffff8000124e6000 x26: ffff800010177950 x25: 0000000000000040 x24: ffff000834e18010 x23: 0000000000000000 x22: 0000000000000000 x21: ffff00083a50da00 x20: ffff00082e75ec40 x19: 0000000000000000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000001 x11: ffff80001180fb58 x10: 0000000000000040 x9 : ffff8000120fc980 x8 : 0000000000000000 x7 : ffff00083f98df50 x6 : 0000000000000100 x5 : 00000307e8978431 x4 : ffff800011386788 x3 : 0000000000000000 x2 : ffff800012342000 x1 : 0000000000000000 x0 : ffff800010c6d3a0 Call trace: composite_setup+0x5c/0x1730 android_setup+0xc0/0x148 cdns3_ep0_delegate_req+0x64/0x90 cdns3_check_ep0_interrupt_proceed+0x384/0x738 cdns3_device_thread_irq_handler+0x124/0x6e0 cdns3_thread_irq+0x94/0xa0 irq_thread_fn+0x30/0xa0 irq_thread+0x150/0x248 kthread+0xfc/0x128 ret_from_fork+0x10/0x18 Code: 910e8000 f9400693 12001ed7 79400f79 (3940aa61) ---[ end trace c685db37f8773fba ]--- Kernel panic - not syncing: Fatal exception SMP: stopping secondary CPUs Kernel Offset: disabled CPU features: 0x0002,20002008 Memory Limit: none Rebooting in 5 seconds.. Reviewed-by: Jun Li Signed-off-by: Peter Chen Signed-off-by: Felipe Balbi Signed-off-by: Sasha Levin --- include/linux/usb/gadget.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 124462d65eac..67f5adc9b875 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -373,6 +373,7 @@ struct usb_gadget_ops { * @connected: True if gadget is connected. * @lpm_capable: If the gadget max_speed is FULL or HIGH, this flag * indicates that it supports LPM as per the LPM ECN & errata. + * @irq: the interrupt number for device controller. * * Gadgets have a mostly-portable "gadget driver" implementing device * functions, handling all usb configurations and interfaces. Gadget @@ -427,6 +428,7 @@ struct usb_gadget { unsigned deactivated:1; unsigned connected:1; unsigned lpm_capable:1; + int irq; }; #define work_to_gadget(w) (container_of((w), struct usb_gadget, work)) -- cgit v1.2.3 From c707e18796c6a001c44254f7087e7ad9f39086d5 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Mon, 18 May 2020 12:08:45 +0200 Subject: usb: gadget: Fix issue with config_ep_by_speed function [ Upstream commit 5d363120aa548ba52d58907a295eee25f8207ed2 ] This patch adds new config_ep_by_speed_and_alt function which extends the config_ep_by_speed about alt parameter. This additional parameter allows to find proper usb_ss_ep_comp_descriptor. Problem has appeared during testing f_tcm (BOT/UAS) driver function. f_tcm function for SS use array of headers for both BOT/UAS alternate setting: static struct usb_descriptor_header *uasp_ss_function_desc[] = { (struct usb_descriptor_header *) &bot_intf_desc, (struct usb_descriptor_header *) &uasp_ss_bi_desc, (struct usb_descriptor_header *) &bot_bi_ep_comp_desc, (struct usb_descriptor_header *) &uasp_ss_bo_desc, (struct usb_descriptor_header *) &bot_bo_ep_comp_desc, (struct usb_descriptor_header *) &uasp_intf_desc, (struct usb_descriptor_header *) &uasp_ss_bi_desc, (struct usb_descriptor_header *) &uasp_bi_ep_comp_desc, (struct usb_descriptor_header *) &uasp_bi_pipe_desc, (struct usb_descriptor_header *) &uasp_ss_bo_desc, (struct usb_descriptor_header *) &uasp_bo_ep_comp_desc, (struct usb_descriptor_header *) &uasp_bo_pipe_desc, (struct usb_descriptor_header *) &uasp_ss_status_desc, (struct usb_descriptor_header *) &uasp_status_in_ep_comp_desc, (struct usb_descriptor_header *) &uasp_status_pipe_desc, (struct usb_descriptor_header *) &uasp_ss_cmd_desc, (struct usb_descriptor_header *) &uasp_cmd_comp_desc, (struct usb_descriptor_header *) &uasp_cmd_pipe_desc, NULL, }; The first 5 descriptors are associated with BOT alternate setting, and others are associated with UAS. During handling UAS alternate setting f_tcm driver invokes config_ep_by_speed and this function sets incorrect companion endpoint descriptor in usb_ep object. Instead setting ep->comp_desc to uasp_bi_ep_comp_desc function in this case set ep->comp_desc to uasp_ss_bi_desc. This is due to the fact that it searches endpoint based on endpoint address: for_each_ep_desc(speed_desc, d_spd) { chosen_desc = (struct usb_endpoint_descriptor *)*d_spd; if (chosen_desc->bEndpoitAddress == _ep->address) goto ep_found; } And in result it uses the descriptor from BOT alternate setting instead UAS. Finally, it causes that controller driver during enabling endpoints detect that just enabled endpoint for bot. Signed-off-by: Jayshri Pawar Signed-off-by: Pawel Laszczak Signed-off-by: Felipe Balbi Signed-off-by: Sasha Levin --- include/linux/usb/composite.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 8675e145ea8b..2040696d75b6 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -249,6 +249,9 @@ int usb_function_activate(struct usb_function *); int usb_interface_id(struct usb_configuration *, struct usb_function *); +int config_ep_by_speed_and_alt(struct usb_gadget *g, struct usb_function *f, + struct usb_ep *_ep, u8 alt); + int config_ep_by_speed(struct usb_gadget *g, struct usb_function *f, struct usb_ep *_ep); -- cgit v1.2.3 From 9669bf039ff67c183ac51721ecff366623337bb3 Mon Sep 17 00:00:00 2001 From: Amelie Delaunay Date: Wed, 22 Apr 2020 11:08:33 +0200 Subject: mfd: stmfx: Disable IRQ in suspend to avoid spurious interrupt [ Upstream commit 97eda5dcc2cde5dcc778bef7a9344db3b6bf8ef5 ] When STMFX supply is stopped, spurious interrupt can occur. To avoid that, disable the interrupt in suspend before disabling the regulator and re-enable it at the end of resume. Fixes: 06252ade9156 ("mfd: Add ST Multi-Function eXpander (STMFX) core driver") Signed-off-by: Amelie Delaunay Signed-off-by: Lee Jones Signed-off-by: Sasha Levin --- include/linux/mfd/stmfx.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/stmfx.h b/include/linux/mfd/stmfx.h index 3c67983678ec..744dce63946e 100644 --- a/include/linux/mfd/stmfx.h +++ b/include/linux/mfd/stmfx.h @@ -109,6 +109,7 @@ struct stmfx { struct device *dev; struct regmap *map; struct regulator *vdd; + int irq; struct irq_domain *irq_domain; struct mutex lock; /* IRQ bus lock */ u8 irq_src; -- cgit v1.2.3 From ece3a3337c50c8e4fdd76a9158ac2c5c1067d061 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 May 2020 14:06:17 -0700 Subject: /dev/mem: Revoke mappings when a driver claims the region [ Upstream commit 3234ac664a870e6ea69ae3a57d824cd7edbeacc5 ] Close the hole of holding a mapping over kernel driver takeover event of a given address range. Commit 90a545e98126 ("restrict /dev/mem to idle io memory ranges") introduced CONFIG_IO_STRICT_DEVMEM with the goal of protecting the kernel against scenarios where a /dev/mem user tramples memory that a kernel driver owns. However, this protection only prevents *new* read(), write() and mmap() requests. Established mappings prior to the driver calling request_mem_region() are left alone. Especially with persistent memory, and the core kernel metadata that is stored there, there are plentiful scenarios for a /dev/mem user to violate the expectations of the driver and cause amplified damage. Teach request_mem_region() to find and shoot down active /dev/mem mappings that it believes it has successfully claimed for the exclusive use of the driver. Effectively a driver call to request_mem_region() becomes a hole-punch on the /dev/mem device. The typical usage of unmap_mapping_range() is part of truncate_pagecache() to punch a hole in a file, but in this case the implementation is only doing the "first half" of a hole punch. Namely it is just evacuating current established mappings of the "hole", and it relies on the fact that /dev/mem establishes mappings in terms of absolute physical address offsets. Once existing mmap users are invalidated they can attempt to re-establish the mapping, or attempt to continue issuing read(2) / write(2) to the invalidated extent, but they will then be subject to the CONFIG_IO_STRICT_DEVMEM checking that can block those subsequent accesses. Cc: Arnd Bergmann Cc: Ingo Molnar Cc: Kees Cook Cc: Matthew Wilcox Cc: Russell King Cc: Andrew Morton Cc: Greg Kroah-Hartman Fixes: 90a545e98126 ("restrict /dev/mem to idle io memory ranges") Signed-off-by: Dan Williams Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/159009507306.847224.8502634072429766747.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- include/linux/ioport.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 7bddddfc76d6..fdc201d61460 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -300,5 +300,11 @@ struct resource *devm_request_free_mem_region(struct device *dev, struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name); +#ifdef CONFIG_IO_STRICT_DEVMEM +void revoke_devmem(struct resource *res); +#else +static inline void revoke_devmem(struct resource *res) { }; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ -- cgit v1.2.3 From 7fd59952282392e66cd92f33a7831a903a750254 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Jun 2020 16:50:30 -0700 Subject: include/linux/bitops.h: avoid clang shift-count-overflow warnings [ Upstream commit bd93f003b7462ae39a43c531abca37fe7073b866 ] Clang normally does not warn about certain issues in inline functions when it only happens in an eliminated code path. However if something else goes wrong, it does tend to complain about the definition of hweight_long() on 32-bit targets: include/linux/bitops.h:75:41: error: shift count >= width of type [-Werror,-Wshift-count-overflow] return sizeof(w) == 4 ? hweight32(w) : hweight64(w); ^~~~~~~~~~~~ include/asm-generic/bitops/const_hweight.h:29:49: note: expanded from macro 'hweight64' define hweight64(w) (__builtin_constant_p(w) ? __const_hweight64(w) : __arch_hweight64(w)) ^~~~~~~~~~~~~~~~~~~~ include/asm-generic/bitops/const_hweight.h:21:76: note: expanded from macro '__const_hweight64' define __const_hweight64(w) (__const_hweight32(w) + __const_hweight32((w) >> 32)) ^ ~~ include/asm-generic/bitops/const_hweight.h:20:49: note: expanded from macro '__const_hweight32' define __const_hweight32(w) (__const_hweight16(w) + __const_hweight16((w) >> 16)) ^ include/asm-generic/bitops/const_hweight.h:19:72: note: expanded from macro '__const_hweight16' define __const_hweight16(w) (__const_hweight8(w) + __const_hweight8((w) >> 8 )) ^ include/asm-generic/bitops/const_hweight.h:12:9: note: expanded from macro '__const_hweight8' (!!((w) & (1ULL << 2))) + \ Adding an explicit cast to __u64 avoids that warning and makes it easier to read other output. Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Acked-by: Christian Brauner Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Josh Poimboeuf Cc: Nick Desaulniers Link: http://lkml.kernel.org/r/20200505135513.65265-1-arnd@arndb.de Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index c94a9ff9f082..4f0e62cbf2ff 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -57,7 +57,7 @@ static inline int get_bitmask_order(unsigned int count) static __always_inline unsigned long hweight_long(unsigned long w) { - return sizeof(w) == 4 ? hweight32(w) : hweight64(w); + return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w); } /** -- cgit v1.2.3 From 57f71bb57f7c74ab06855dba30dd68935168fb7c Mon Sep 17 00:00:00 2001 From: Zheng Bin Date: Thu, 21 May 2020 17:17:21 +0800 Subject: nfs: set invalid blocks after NFSv4 writes [ Upstream commit 3a39e778690500066b31fe982d18e2e394d3bce2 ] Use the following command to test nfsv4(size of file1M is 1MB): mount -t nfs -o vers=4.0,actimeo=60 127.0.0.1/dir1 /mnt cp file1M /mnt du -h /mnt/file1M -->0 within 60s, then 1M When write is done(cp file1M /mnt), will call this: nfs_writeback_done nfs4_write_done nfs4_write_done_cb nfs_writeback_update_inode nfs_post_op_update_inode_force_wcc_locked(change, ctime, mtime nfs_post_op_update_inode_force_wcc_locked nfs_set_cache_invalid nfs_refresh_inode_locked nfs_update_inode nfsd write response contains change, ctime, mtime, the flag will be clear after nfs_update_inode. Howerver, write response does not contain space_used, previous open response contains space_used whose value is 0, so inode->i_blocks is still 0. nfs_getattr -->called by "du -h" do_update |= force_sync || nfs_attribute_cache_expired -->false in 60s cache_validity = READ_ONCE(NFS_I(inode)->cache_validity) do_update |= cache_validity & (NFS_INO_INVALID_ATTR -->false if (do_update) { __nfs_revalidate_inode } Within 60s, does not send getattr request to nfsd, thus "du -h /mnt/file1M" is 0. Add a NFS_INO_INVALID_BLOCKS flag, set it when nfsv4 write is done. Fixes: 16e143751727 ("NFS: More fine grained attribute tracking") Signed-off-by: Zheng Bin Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- include/linux/nfs_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 570a60c2f4f4..ad09c0cc5464 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -225,6 +225,7 @@ struct nfs4_copy_state { #define NFS_INO_INVALID_OTHER BIT(12) /* other attrs are invalid */ #define NFS_INO_DATA_INVAL_DEFER \ BIT(13) /* Deferred cache invalidation */ +#define NFS_INO_INVALID_BLOCKS BIT(14) /* cached blocks are invalid */ #define NFS_INO_INVALID_ATTR (NFS_INO_INVALID_CHANGE \ | NFS_INO_INVALID_CTIME \ -- cgit v1.2.3 From 21a45a1427dd16c7a07da2d53df190688f9f716c Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 27 Jan 2020 17:37:42 +0900 Subject: usb: host: ehci-platform: add a quirk to avoid stuck [ Upstream commit cc7eac1e4afdd151085be4d0341a155760388653 ] Since EHCI/OHCI controllers on R-Car Gen3 SoCs are possible to be getting stuck very rarely after a full/low usb device was disconnected. To detect/recover from such a situation, the controllers require a special way which poll the EHCI PORTSC register and changes the OHCI functional state. So, this patch adds a polling timer into the ehci-platform driver, and if the ehci driver detects the issue by the EHCI PORTSC register, the ehci driver removes a companion device (= the OHCI controller) to change the OHCI functional state to USB Reset once. And then, the ehci driver adds the companion device again. Signed-off-by: Yoshihiro Shimoda Acked-by: Alan Stern Link: https://lore.kernel.org/r/1580114262-25029-1-git-send-email-yoshihiro.shimoda.uh@renesas.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- include/linux/usb/ehci_def.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/ehci_def.h b/include/linux/usb/ehci_def.h index a15ce99dfc2d..78e006355557 100644 --- a/include/linux/usb/ehci_def.h +++ b/include/linux/usb/ehci_def.h @@ -151,7 +151,7 @@ struct ehci_regs { #define PORT_OWNER (1<<13) /* true: companion hc owns this port */ #define PORT_POWER (1<<12) /* true: has power (see PPC) */ #define PORT_USB11(x) (((x)&(3<<10)) == (1<<10)) /* USB 1.1 device */ -/* 11:10 for detecting lowspeed devices (reset vs release ownership) */ +#define PORT_LS_MASK (3<<10) /* Link status (SE0, K or J */ /* 9 reserved */ #define PORT_LPM (1<<9) /* LPM transaction */ #define PORT_RESET (1<<8) /* reset port */ -- cgit v1.2.3 From ef4f3b65d5a97fc8385dd8bdd54cc13c7df75b61 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Wed, 3 Jun 2020 15:48:19 +0800 Subject: libata: Use per port sync for detach [ Upstream commit b5292111de9bb70cba3489075970889765302136 ] Commit 130f4caf145c ("libata: Ensure ata_port probe has completed before detach") may cause system freeze during suspend. Using async_synchronize_full() in PM callbacks is wrong, since async callbacks that are already scheduled may wait for not-yet-scheduled callbacks, causes a circular dependency. Instead of using big hammer like async_synchronize_full(), use async cookie to make sure port probe are synced, without affecting other scheduled PM callbacks. Fixes: 130f4caf145c ("libata: Ensure ata_port probe has completed before detach") Suggested-by: John Garry Signed-off-by: Kai-Heng Feng Tested-by: John Garry BugLink: https://bugs.launchpad.net/bugs/1867983 Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- include/linux/libata.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index c44e4cfbcb16..b9970f5bab67 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -22,6 +22,7 @@ #include #include #include +#include /* * Define if arch has non-standard setup. This is a _PCI_ standard @@ -870,6 +871,8 @@ struct ata_port { struct timer_list fastdrain_timer; unsigned long fastdrain_cnt; + async_cookie_t cookie; + int em_message_type; void *private_data; -- cgit v1.2.3 From 0804b23d2ff15f6656d70291d5e07677ca897881 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 4 Dec 2019 20:46:14 +0800 Subject: jbd2: clean __jbd2_journal_abort_hard() and __journal_abort_soft() [ Upstream commit 7f6225e446cc8dfa4c3c7959a4de3dd03ec277bf ] __jbd2_journal_abort_hard() is no longer used, so now we can merge __jbd2_journal_abort_hard() and __journal_abort_soft() these two functions into jbd2_journal_abort() and remove them. Signed-off-by: zhangyi (F) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20191204124614.45424-5-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- include/linux/jbd2.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 10e6049c0ba9..b0e97e5de8ca 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1402,7 +1402,6 @@ extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, int); -extern void __jbd2_journal_abort_hard (journal_t *); extern void jbd2_journal_abort (journal_t *, int); extern int jbd2_journal_errno (journal_t *); extern void jbd2_journal_ack_err (journal_t *); -- cgit v1.2.3 From 859a0a9afee3159aeebdefa5febc0c96cf919f6b Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Wed, 3 Jun 2020 16:49:48 +0200 Subject: block: nr_sects_write(): Disable preemption on seqcount write [ Upstream commit 15b81ce5abdc4b502aa31dff2d415b79d2349d2f ] For optimized block readers not holding a mutex, the "number of sectors" 64-bit value is protected from tearing on 32-bit architectures by a sequence counter. Disable preemption before entering that sequence counter's write side critical section. Otherwise, the read side can preempt the write side section and spin for the entire scheduler tick. If the reader belongs to a real-time scheduling class, it can spin forever and the kernel will livelock. Fixes: c83f6bf98dc1 ("block: add partition resize function to blkpg ioctl") Cc: Signed-off-by: Ahmed S. Darwish Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- include/linux/genhd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 8b5330dd5ac0..62a2ec9f17df 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -750,9 +750,11 @@ static inline sector_t part_nr_sects_read(struct hd_struct *part) static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) + preempt_disable(); write_seqcount_begin(&part->nr_sects_seq); part->nr_sects = size; write_seqcount_end(&part->nr_sects_seq); + preempt_enable(); #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) preempt_disable(); part->nr_sects = size; -- cgit v1.2.3 From 3d390370d78c4ea967a7f8ce5d0ab0136199eb21 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 12 May 2020 17:03:18 +0900 Subject: kretprobe: Prevent triggering kretprobe from within kprobe_flush_task commit 9b38cc704e844e41d9cf74e647bff1d249512cb3 upstream. Ziqian reported lockup when adding retprobe on _raw_spin_lock_irqsave. My test was also able to trigger lockdep output: ============================================ WARNING: possible recursive locking detected 5.6.0-rc6+ #6 Not tainted -------------------------------------------- sched-messaging/2767 is trying to acquire lock: ffffffff9a492798 (&(kretprobe_table_locks[i].lock)){-.-.}, at: kretprobe_hash_lock+0x52/0xa0 but task is already holding lock: ffffffff9a491a18 (&(kretprobe_table_locks[i].lock)){-.-.}, at: kretprobe_trampoline+0x0/0x50 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(kretprobe_table_locks[i].lock)); lock(&(kretprobe_table_locks[i].lock)); *** DEADLOCK *** May be due to missing lock nesting notation 1 lock held by sched-messaging/2767: #0: ffffffff9a491a18 (&(kretprobe_table_locks[i].lock)){-.-.}, at: kretprobe_trampoline+0x0/0x50 stack backtrace: CPU: 3 PID: 2767 Comm: sched-messaging Not tainted 5.6.0-rc6+ #6 Call Trace: dump_stack+0x96/0xe0 __lock_acquire.cold.57+0x173/0x2b7 ? native_queued_spin_lock_slowpath+0x42b/0x9e0 ? lockdep_hardirqs_on+0x590/0x590 ? __lock_acquire+0xf63/0x4030 lock_acquire+0x15a/0x3d0 ? kretprobe_hash_lock+0x52/0xa0 _raw_spin_lock_irqsave+0x36/0x70 ? kretprobe_hash_lock+0x52/0xa0 kretprobe_hash_lock+0x52/0xa0 trampoline_handler+0xf8/0x940 ? kprobe_fault_handler+0x380/0x380 ? find_held_lock+0x3a/0x1c0 kretprobe_trampoline+0x25/0x50 ? lock_acquired+0x392/0xbc0 ? _raw_spin_lock_irqsave+0x50/0x70 ? __get_valid_kprobe+0x1f0/0x1f0 ? _raw_spin_unlock_irqrestore+0x3b/0x40 ? finish_task_switch+0x4b9/0x6d0 ? __switch_to_asm+0x34/0x70 ? __switch_to_asm+0x40/0x70 The code within the kretprobe handler checks for probe reentrancy, so we won't trigger any _raw_spin_lock_irqsave probe in there. The problem is in outside kprobe_flush_task, where we call: kprobe_flush_task kretprobe_table_lock raw_spin_lock_irqsave _raw_spin_lock_irqsave where _raw_spin_lock_irqsave triggers the kretprobe and installs kretprobe_trampoline handler on _raw_spin_lock_irqsave return. The kretprobe_trampoline handler is then executed with already locked kretprobe_table_locks, and first thing it does is to lock kretprobe_table_locks ;-) the whole lockup path like: kprobe_flush_task kretprobe_table_lock raw_spin_lock_irqsave _raw_spin_lock_irqsave ---> probe triggered, kretprobe_trampoline installed ---> kretprobe_table_locks locked kretprobe_trampoline trampoline_handler kretprobe_hash_lock(current, &head, &flags); <--- deadlock Adding kprobe_busy_begin/end helpers that mark code with fake probe installed to prevent triggering of another kprobe within this code. Using these helpers in kprobe_flush_task, so the probe recursion protection check is hit and the probe is never set to prevent above lockup. Link: http://lkml.kernel.org/r/158927059835.27680.7011202830041561604.stgit@devnote2 Fixes: ef53d9c5e4da ("kprobes: improve kretprobe scalability with hashed locking") Cc: Ingo Molnar Cc: "Gustavo A . R . Silva" Cc: Anders Roxell Cc: "Naveen N . Rao" Cc: Anil S Keshavamurthy Cc: David Miller Cc: Ingo Molnar Cc: Peter Zijlstra Cc: stable@vger.kernel.org Reported-by: "Ziqian SUN (Zamir)" Acked-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- include/linux/kprobes.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 04bdaf01112c..645fd401c856 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -350,6 +350,10 @@ static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void) return this_cpu_ptr(&kprobe_ctlblk); } +extern struct kprobe kprobe_busy; +void kprobe_busy_begin(void); +void kprobe_busy_end(void); + kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset); int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); -- cgit v1.2.3 From 9f217d6dd79663efeb89d307ec3edf77116c6577 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 16 Jun 2020 15:52:05 +0000 Subject: net: core: reduce recursion limit value [ Upstream commit fb7861d14c8d7edac65b2fcb6e8031cb138457b2 ] In the current code, ->ndo_start_xmit() can be executed recursively only 10 times because of stack memory. But, in the case of the vxlan, 10 recursion limit value results in a stack overflow. In the current code, the nested interface is limited by 8 depth. There is no critical reason that the recursion limitation value should be 10. So, it would be good to be the same value with the limitation value of nesting interface depth. Test commands: ip link add vxlan10 type vxlan vni 10 dstport 4789 srcport 4789 4789 ip link set vxlan10 up ip a a 192.168.10.1/24 dev vxlan10 ip n a 192.168.10.2 dev vxlan10 lladdr fc:22:33:44:55:66 nud permanent for i in {9..0} do let A=$i+1 ip link add vxlan$i type vxlan vni $i dstport 4789 srcport 4789 4789 ip link set vxlan$i up ip a a 192.168.$i.1/24 dev vxlan$i ip n a 192.168.$i.2 dev vxlan$i lladdr fc:22:33:44:55:66 nud permanent bridge fdb add fc:22:33:44:55:66 dev vxlan$A dst 192.168.$i.2 self done hping3 192.168.10.2 -2 -d 60000 Splat looks like: [ 103.814237][ T1127] ============================================================================= [ 103.871955][ T1127] BUG kmalloc-2k (Tainted: G B ): Padding overwritten. 0x00000000897a2e4f-0x000 [ 103.873187][ T1127] ----------------------------------------------------------------------------- [ 103.873187][ T1127] [ 103.874252][ T1127] INFO: Slab 0x000000005cccc724 objects=5 used=5 fp=0x0000000000000000 flags=0x10000000001020 [ 103.881323][ T1127] CPU: 3 PID: 1127 Comm: hping3 Tainted: G B 5.7.0+ #575 [ 103.882131][ T1127] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 103.883006][ T1127] Call Trace: [ 103.883324][ T1127] dump_stack+0x96/0xdb [ 103.883716][ T1127] slab_err+0xad/0xd0 [ 103.884106][ T1127] ? _raw_spin_unlock+0x1f/0x30 [ 103.884620][ T1127] ? get_partial_node.isra.78+0x140/0x360 [ 103.885214][ T1127] slab_pad_check.part.53+0xf7/0x160 [ 103.885769][ T1127] ? pskb_expand_head+0x110/0xe10 [ 103.886316][ T1127] check_slab+0x97/0xb0 [ 103.886763][ T1127] alloc_debug_processing+0x84/0x1a0 [ 103.887308][ T1127] ___slab_alloc+0x5a5/0x630 [ 103.887765][ T1127] ? pskb_expand_head+0x110/0xe10 [ 103.888265][ T1127] ? lock_downgrade+0x730/0x730 [ 103.888762][ T1127] ? pskb_expand_head+0x110/0xe10 [ 103.889244][ T1127] ? __slab_alloc+0x3e/0x80 [ 103.889675][ T1127] __slab_alloc+0x3e/0x80 [ 103.890108][ T1127] __kmalloc_node_track_caller+0xc7/0x420 [ ... ] Fixes: 11a766ce915f ("net: Increase xmit RECURSION_LIMIT to 10.") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b580a35f50ea..ec3081ab04c0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3043,7 +3043,7 @@ static inline int dev_recursion_level(void) return this_cpu_read(softnet_data.xmit.recursion); } -#define XMIT_RECURSION_LIMIT 10 +#define XMIT_RECURSION_LIMIT 8 static inline bool dev_xmit_recursion(void) { return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > -- cgit v1.2.3 From 8c236ac4376aa838f62fb86d21f73ffe1852b551 Mon Sep 17 00:00:00 2001 From: Fabian Vogt Date: Mon, 15 Jun 2020 09:16:36 +0200 Subject: efi/tpm: Verify event log header before parsing [ Upstream commit 7dfc06a0f25b593a9f51992f540c0f80a57f3629 ] It is possible that the first event in the event log is not actually a log header at all, but rather a normal event. This leads to the cast in __calc_tpm2_event_size being an invalid conversion, which means that the values read are effectively garbage. Depending on the first event's contents, this leads either to apparently normal behaviour, a crash or a freeze. While this behaviour of the firmware is not in accordance with the TCG Client EFI Specification, this happens on a Dell Precision 5510 with the TPM enabled but hidden from the OS ("TPM On" disabled, state otherwise untouched). The EFI firmware claims that the TPM is present and active and that it supports the TCG 2.0 event log format. Fortunately, this can be worked around by simply checking the header of the first event and the event log header signature itself. Commit b4f1874c6216 ("tpm: check event log version before reading final events") addressed a similar issue also found on Dell models. Fixes: 6b0326190205 ("efi: Attempt to get the TCG2 event log in the boot stub") Signed-off-by: Fabian Vogt Link: https://lore.kernel.org/r/1927248.evlx2EsYKh@linux-e202.suse.de Bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1165773 Signed-off-by: Ard Biesheuvel Signed-off-by: Sasha Levin --- include/linux/tpm_eventlog.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index 131ea1bad458..eccfd3a4e4c8 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -81,6 +81,8 @@ struct tcg_efi_specid_event_algs { u16 digest_size; } __packed; +#define TCG_SPECID_SIG "Spec ID Event03" + struct tcg_efi_specid_event_head { u8 signature[16]; u32 platform_class; @@ -171,6 +173,7 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, int i; int j; u32 count, event_type; + const u8 zero_digest[sizeof(event_header->digest)] = {0}; marker = event; marker_start = marker; @@ -198,10 +201,19 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, count = READ_ONCE(event->count); event_type = READ_ONCE(event->event_type); + /* Verify that it's the log header */ + if (event_header->pcr_idx != 0 || + event_header->event_type != NO_ACTION || + memcmp(event_header->digest, zero_digest, sizeof(zero_digest))) { + size = 0; + goto out; + } + efispecid = (struct tcg_efi_specid_event_head *)event_header->event; /* Check if event is malformed. */ - if (count > efispecid->num_algs) { + if (memcmp(efispecid->signature, TCG_SPECID_SIG, + sizeof(TCG_SPECID_SIG)) || count > efispecid->num_algs) { size = 0; goto out; } -- cgit v1.2.3 From 5cf7f0c68405ec6d08b9e45eb9cb072876537793 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 23 Jun 2020 16:51:29 +0300 Subject: net: qed: fix left elements count calculation [ Upstream commit 97dd1abd026ae4e6a82fa68645928404ad483409 ] qed_chain_get_element_left{,_u32} returned 0 when the difference between producer and consumer page count was equal to the total page count. Fix this by conditional expanding of producer value (vs unconditional). This allowed to eliminate normalizaton against total page count, which was the cause of this bug. Misc: replace open-coded constants with common defines. Fixes: a91eb52abb50 ("qed: Revisit chain implementation") Signed-off-by: Alexander Lobakin Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/qed/qed_chain.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index 733fad7dfbed..6d15040c642c 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -207,28 +207,34 @@ static inline u32 qed_chain_get_cons_idx_u32(struct qed_chain *p_chain) static inline u16 qed_chain_get_elem_left(struct qed_chain *p_chain) { + u16 elem_per_page = p_chain->elem_per_page; + u32 prod = p_chain->u.chain16.prod_idx; + u32 cons = p_chain->u.chain16.cons_idx; u16 used; - used = (u16) (((u32)0x10000 + - (u32)p_chain->u.chain16.prod_idx) - - (u32)p_chain->u.chain16.cons_idx); + if (prod < cons) + prod += (u32)U16_MAX + 1; + + used = (u16)(prod - cons); if (p_chain->mode == QED_CHAIN_MODE_NEXT_PTR) - used -= p_chain->u.chain16.prod_idx / p_chain->elem_per_page - - p_chain->u.chain16.cons_idx / p_chain->elem_per_page; + used -= prod / elem_per_page - cons / elem_per_page; return (u16)(p_chain->capacity - used); } static inline u32 qed_chain_get_elem_left_u32(struct qed_chain *p_chain) { + u16 elem_per_page = p_chain->elem_per_page; + u64 prod = p_chain->u.chain32.prod_idx; + u64 cons = p_chain->u.chain32.cons_idx; u32 used; - used = (u32) (((u64)0x100000000ULL + - (u64)p_chain->u.chain32.prod_idx) - - (u64)p_chain->u.chain32.cons_idx); + if (prod < cons) + prod += (u64)U32_MAX + 1; + + used = (u32)(prod - cons); if (p_chain->mode == QED_CHAIN_MODE_NEXT_PTR) - used -= p_chain->u.chain32.prod_idx / p_chain->elem_per_page - - p_chain->u.chain32.cons_idx / p_chain->elem_per_page; + used -= (u32)(prod / elem_per_page - cons / elem_per_page); return p_chain->capacity - used; } -- cgit v1.2.3 From 2a6c8d3d0dd0204fd38f302c3822e7bbb0613f24 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 11:49:23 -0700 Subject: kallsyms: Refactor kallsyms_show_value() to take cred commit 160251842cd35a75edfb0a1d76afa3eb674ff40a upstream. In order to perform future tests against the cred saved during open(), switch kallsyms_show_value() to operate on a cred, and have all current callers pass current_cred(). This makes it very obvious where callers are checking the wrong credential in their "read" contexts. These will be fixed in the coming patches. Additionally switch return value to bool, since it is always used as a direct permission check, not a 0-on-success, negative-on-error style function return. Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- include/linux/filter.h | 2 +- include/linux/kallsyms.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 3bbc72dbc69e..0a91c01a6613 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -858,7 +858,7 @@ static inline bool bpf_dump_raw_ok(void) /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. */ - return kallsyms_show_value() == 1; + return kallsyms_show_value(current_cred()); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 657a83b943f0..1f96ce2b47df 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -18,6 +18,7 @@ #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) +struct cred; struct module; static inline int is_kernel_inittext(unsigned long addr) @@ -98,7 +99,7 @@ int lookup_symbol_name(unsigned long addr, char *symname); int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); /* How and when do we show kallsyms values? */ -extern int kallsyms_show_value(void); +extern bool kallsyms_show_value(const struct cred *cred); #else /* !CONFIG_KALLSYMS */ @@ -158,7 +159,7 @@ static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size, u return -ERANGE; } -static inline int kallsyms_show_value(void) +static inline bool kallsyms_show_value(const struct cred *cred) { return false; } -- cgit v1.2.3 From baef8d1027b0c037128c8be56338e6e066317e68 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 15:45:23 -0700 Subject: bpf: Check correct cred for CAP_SYSLOG in bpf_dump_raw_ok() commit 63960260457a02af2a6cb35d75e6bdb17299c882 upstream. When evaluating access control over kallsyms visibility, credentials at open() time need to be used, not the "current" creds (though in BPF's case, this has likely always been the same). Plumb access to associated file->f_cred down through bpf_dump_raw_ok() and its callers now that kallsysm_show_value() has been refactored to take struct cred. Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: bpf@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 7105e828c087 ("bpf: allow for correlation of maps and helpers in dump") Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- include/linux/filter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 0a91c01a6613..79830bc9e45c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -853,12 +853,12 @@ void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_helper_changes_pkt_data(void *func); -static inline bool bpf_dump_raw_ok(void) +static inline bool bpf_dump_raw_ok(const struct cred *cred) { /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. */ - return kallsyms_show_value(current_cred()); + return kallsyms_show_value(cred); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, -- cgit v1.2.3 From 9b7fd81cf9b6ca322cffc6aff4be0f2d54e9689a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 3 Jul 2020 22:26:43 +0200 Subject: sched: consistently handle layer3 header accesses in the presence of VLANs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d7bf2ebebc2bd61ab95e2a8e33541ef282f303d4 ] There are a couple of places in net/sched/ that check skb->protocol and act on the value there. However, in the presence of VLAN tags, the value stored in skb->protocol can be inconsistent based on whether VLAN acceleration is enabled. The commit quoted in the Fixes tag below fixed the users of skb->protocol to use a helper that will always see the VLAN ethertype. However, most of the callers don't actually handle the VLAN ethertype, but expect to find the IP header type in the protocol field. This means that things like changing the ECN field, or parsing diffserv values, stops working if there's a VLAN tag, or if there are multiple nested VLAN tags (QinQ). To fix this, change the helper to take an argument that indicates whether the caller wants to skip the VLAN tags or not. When skipping VLAN tags, we make sure to skip all of them, so behaviour is consistent even in QinQ mode. To make the helper usable from the ECN code, move it to if_vlan.h instead of pkt_sched.h. v3: - Remove empty lines - Move vlan variable definitions inside loop in skb_protocol() - Also use skb_protocol() helper in IP{,6}_ECN_decapsulate() and bpf_skb_ecn_set_ce() v2: - Use eth_type_vlan() helper in skb_protocol() - Also fix code that reads skb->protocol directly - Change a couple of 'if/else if' statements to switch constructs to avoid calling the helper twice Reported-by: Ilya Ponetayev Fixes: d8b9605d2697 ("net: sched: fix skb->protocol use in case of accelerated vlan path") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/if_vlan.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index b05e855f1ddd..427a5b8597c2 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -308,6 +308,34 @@ static inline bool eth_type_vlan(__be16 ethertype) } } +/* A getter for the SKB protocol field which will handle VLAN tags consistently + * whether VLAN acceleration is enabled or not. + */ +static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) +{ + unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); + __be16 proto = skb->protocol; + + if (!skip_vlan) + /* VLAN acceleration strips the VLAN header from the skb and + * moves it to skb->vlan_proto + */ + return skb_vlan_tag_present(skb) ? skb->vlan_proto : proto; + + while (eth_type_vlan(proto)) { + struct vlan_hdr vhdr, *vh; + + vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); + if (!vh) + break; + + proto = vh->h_vlan_encapsulated_proto; + offset += sizeof(vhdr); + } + + return proto; +} + static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { -- cgit v1.2.3 From 30d015f5ecd9ce5706ad18a4a0649f364e3e6f7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 7 Jul 2020 13:03:25 +0200 Subject: vlan: consolidate VLAN parsing code and limit max parsing depth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 469aceddfa3ed16e17ee30533fae45e90f62efd8 ] Toshiaki pointed out that we now have two very similar functions to extract the L3 protocol number in the presence of VLAN tags. And Daniel pointed out that the unbounded parsing loop makes it possible for maliciously crafted packets to loop through potentially hundreds of tags. Fix both of these issues by consolidating the two parsing functions and limiting the VLAN tag parsing to a max depth of 8 tags. As part of this, switch over __vlan_get_protocol() to use skb_header_pointer() instead of pskb_may_pull(), to avoid the possible side effects of the latter and keep the skb pointer 'const' through all the parsing functions. v2: - Use limit of 8 tags instead of 32 (matching XMIT_RECURSION_LIMIT) Reported-by: Toshiaki Makita Reported-by: Daniel Borkmann Fixes: d7bf2ebebc2b ("sched: consistently handle layer3 header accesses in the presence of VLANs") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/if_vlan.h | 57 +++++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 427a5b8597c2..41a518336673 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -25,6 +25,8 @@ #define VLAN_ETH_DATA_LEN 1500 /* Max. octets in payload */ #define VLAN_ETH_FRAME_LEN 1518 /* Max. octets in frame sans FCS */ +#define VLAN_MAX_DEPTH 8 /* Max. number of nested VLAN tags parsed */ + /* * struct vlan_hdr - vlan header * @h_vlan_TCI: priority and VLAN ID @@ -308,34 +310,6 @@ static inline bool eth_type_vlan(__be16 ethertype) } } -/* A getter for the SKB protocol field which will handle VLAN tags consistently - * whether VLAN acceleration is enabled or not. - */ -static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) -{ - unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); - __be16 proto = skb->protocol; - - if (!skip_vlan) - /* VLAN acceleration strips the VLAN header from the skb and - * moves it to skb->vlan_proto - */ - return skb_vlan_tag_present(skb) ? skb->vlan_proto : proto; - - while (eth_type_vlan(proto)) { - struct vlan_hdr vhdr, *vh; - - vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); - if (!vh) - break; - - proto = vh->h_vlan_encapsulated_proto; - offset += sizeof(vhdr); - } - - return proto; -} - static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { @@ -605,10 +579,10 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, int *depth) { - unsigned int vlan_depth = skb->mac_len; + unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; /* if type is 802.1Q/AD then the header should already be * present at mac_len - VLAN_HLEN (if mac_len > 0), or at @@ -623,13 +597,12 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, vlan_depth = ETH_HLEN; } do { - struct vlan_hdr *vh; + struct vlan_hdr vhdr, *vh; - if (unlikely(!pskb_may_pull(skb, - vlan_depth + VLAN_HLEN))) + vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + if (unlikely(!vh || !--parse_depth)) return 0; - vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } while (eth_type_vlan(type)); @@ -648,11 +621,25 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 vlan_get_protocol(struct sk_buff *skb) +static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { return __vlan_get_protocol(skb, skb->protocol, NULL); } +/* A getter for the SKB protocol field which will handle VLAN tags consistently + * whether VLAN acceleration is enabled or not. + */ +static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) +{ + if (!skip_vlan) + /* VLAN acceleration strips the VLAN header from the skb and + * moves it to skb->vlan_proto + */ + return skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; + + return vlan_get_protocol(skb); +} + static inline void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) { -- cgit v1.2.3 From 94886c86e833dbc8995202b6c6aaff592b7abd24 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 2 Jul 2020 11:52:56 -0700 Subject: cgroup: fix cgroup_sk_alloc() for sk_clone_lock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ad0f75e5f57ccbceec13274e1e242f2b5a6397ed ] When we clone a socket in sk_clone_lock(), its sk_cgrp_data is copied, so the cgroup refcnt must be taken too. And, unlike the sk_alloc() path, sock_update_netprioidx() is not called here. Therefore, it is safe and necessary to grab the cgroup refcnt even when cgroup_sk_alloc is disabled. sk_clone_lock() is in BH context anyway, the in_interrupt() would terminate this function if called there. And for sk_alloc() skcd->val is always zero. So it's safe to factor out the code to make it more readable. The global variable 'cgroup_sk_alloc_disabled' is used to determine whether to take these reference counts. It is impossible to make the reference counting correct unless we save this bit of information in skcd->val. So, add a new bit there to record whether the socket has already taken the reference counts. This obviously relies on kmalloc() to align cgroup pointers to at least 4 bytes, ARCH_KMALLOC_MINALIGN is certainly larger than that. This bug seems to be introduced since the beginning, commit d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") tried to fix it but not compeletely. It seems not easy to trigger until the recent commit 090e28b229af ("netprio_cgroup: Fix unlimited memory leak of v2 cgroups") was merged. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Reported-by: Cameron Berkenpas Reported-by: Peter Geis Reported-by: Lu Fengqi Reported-by: Daniël Sonck Reported-by: Zhang Qiang Tested-by: Cameron Berkenpas Tested-by: Peter Geis Tested-by: Thomas Lamprecht Cc: Daniel Borkmann Cc: Zefan Li Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/cgroup-defs.h | 6 ++++-- include/linux/cgroup.h | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 430e219e3aba..2ebacb7ef501 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -797,7 +797,8 @@ struct sock_cgroup_data { union { #ifdef __LITTLE_ENDIAN struct { - u8 is_data; + u8 is_data : 1; + u8 no_refcnt : 1; u8 padding; u16 prioidx; u32 classid; @@ -807,7 +808,8 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; - u8 is_data; + u8 no_refcnt : 1; + u8 is_data : 1; } __packed; #endif u64 val; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 57577075d204..202852383ae9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -822,6 +822,7 @@ extern spinlock_t cgroup_sk_update_lock; void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); +void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) @@ -835,7 +836,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) */ v = READ_ONCE(skcd->val); - if (v & 1) + if (v & 3) return &cgrp_dfl_root.cgrp; return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; @@ -847,6 +848,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} +static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ -- cgit v1.2.3 From 38b122c0af04e8ab77dbbc387e126de7a8738aa4 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 9 Jul 2020 16:28:44 -0700 Subject: cgroup: Fix sock_cgroup_data on big-endian. [ Upstream commit 14b032b8f8fce03a546dcf365454bec8c4a58d7d ] In order for no_refcnt and is_data to be the lowest order two bits in the 'val' we have to pad out the bitfield of the u8. Fixes: ad0f75e5f57c ("cgroup: fix cgroup_sk_alloc() for sk_clone_lock()") Reported-by: Guenter Roeck Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/cgroup-defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 2ebacb7ef501..1ccfa3779e18 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -799,6 +799,7 @@ struct sock_cgroup_data { struct { u8 is_data : 1; u8 no_refcnt : 1; + u8 unused : 6; u8 padding; u16 prioidx; u32 classid; @@ -808,6 +809,7 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; + u8 unused : 6; u8 no_refcnt : 1; u8 is_data : 1; } __packed; -- cgit v1.2.3 From c3adbd37c05451b68137876cee8831c715902f8b Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 28 Apr 2020 09:54:56 +0800 Subject: blk-mq-debugfs: update blk_queue_flag_name[] accordingly for new flags [ Upstream commit bfe373f608cf81b7626dfeb904001b0e867c5110 ] Else there may be magic numbers in /sys/kernel/debug/block/*/state. Signed-off-by: Hou Tao Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bff1def62eed..d5338b9ee550 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -592,6 +592,7 @@ struct request_queue { u64 write_hints[BLK_MAX_WRITE_HINTS]; }; +/* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ -- cgit v1.2.3 From e7e98dd42aaec713e1208b853bfaed35a73bb0a1 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 24 Feb 2020 12:58:03 -0800 Subject: bus: ti-sysc: Handle module unlock quirk needed for some RTC [ Upstream commit e8639e1c986a8a9d0f94549170f6db579376c3ae ] The RTC modules on am3 and am4 need quirk handling to unlock and lock them for reset so let's add the quirk handling based on what we already have for legacy platform data. In later patches we will simply drop the RTC related platform data and the old quirk handling. Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin --- include/linux/platform_data/ti-sysc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 2cbde6542849..5fcc9bc9e751 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -49,6 +49,7 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSC_MODULE_QUIRK_RTC_UNLOCK BIT(22) #define SYSC_QUIRK_CLKDM_NOAUTO BIT(21) #define SYSC_QUIRK_FORCE_MSTANDBY BIT(20) #define SYSC_MODULE_QUIRK_AESS BIT(19) -- cgit v1.2.3 From 97f1aecb80e9716c7b0f9163e5953d7976331fc3 Mon Sep 17 00:00:00 2001 From: Dave Wang Date: Wed, 8 Jul 2020 22:25:03 -0700 Subject: Input: elan_i2c - add more hardware ID for Lenovo laptops commit a50ca29523b18baea548bdf5df9b4b923c2bb4f6 upstream. This adds more hardware IDs for Elan touchpads found in various Lenovo laptops. Signed-off-by: Dave Wang Link: https://lore.kernel.org/r/000201d5a8bd$9fead3f0$dfc07bd0$@emc.com.tw Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/input/elan-i2c-ids.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/input/elan-i2c-ids.h b/include/linux/input/elan-i2c-ids.h index 1ecb6b45812c..520858d12680 100644 --- a/include/linux/input/elan-i2c-ids.h +++ b/include/linux/input/elan-i2c-ids.h @@ -67,8 +67,15 @@ static const struct acpi_device_id elan_acpi_id[] = { { "ELAN062B", 0 }, { "ELAN062C", 0 }, { "ELAN062D", 0 }, + { "ELAN062E", 0 }, /* Lenovo V340 Whiskey Lake U */ + { "ELAN062F", 0 }, /* Lenovo V340 Comet Lake U */ { "ELAN0631", 0 }, { "ELAN0632", 0 }, + { "ELAN0633", 0 }, /* Lenovo S145 */ + { "ELAN0634", 0 }, /* Lenovo V340 Ice lake */ + { "ELAN0635", 0 }, /* Lenovo V1415-IIL */ + { "ELAN0636", 0 }, /* Lenovo V1415-Dali */ + { "ELAN0637", 0 }, /* Lenovo V1415-IGLR */ { "ELAN1000", 0 }, { } }; -- cgit v1.2.3 From 722c6e954c90eca93ac89980e7b79577cd141777 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Fri, 19 Jun 2020 17:27:19 +0530 Subject: dmabuf: use spinlock to access dmabuf->name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6348dd291e3653534a9e28e6917569bc9967b35b ] There exists a sleep-while-atomic bug while accessing the dmabuf->name under mutex in the dmabuffs_dname(). This is caused from the SELinux permissions checks on a process where it tries to validate the inherited files from fork() by traversing them through iterate_fd() (which traverse files under spin_lock) and call match_file(security/selinux/hooks.c) where the permission checks happen. This audit information is logged using dump_common_audit_data() where it calls d_path() to get the file path name. If the file check happen on the dmabuf's fd, then it ends up in ->dmabuffs_dname() and use mutex to access dmabuf->name. The flow will be like below: flush_unauthorized_files() iterate_fd() spin_lock() --> Start of the atomic section. match_file() file_has_perm() avc_has_perm() avc_audit() slow_avc_audit() common_lsm_audit() dump_common_audit_data() audit_log_d_path() d_path() dmabuffs_dname() mutex_lock()--> Sleep while atomic. Call trace captured (on 4.19 kernels) is below: ___might_sleep+0x204/0x208 __might_sleep+0x50/0x88 __mutex_lock_common+0x5c/0x1068 __mutex_lock_common+0x5c/0x1068 mutex_lock_nested+0x40/0x50 dmabuffs_dname+0xa0/0x170 d_path+0x84/0x290 audit_log_d_path+0x74/0x130 common_lsm_audit+0x334/0x6e8 slow_avc_audit+0xb8/0xf8 avc_has_perm+0x154/0x218 file_has_perm+0x70/0x180 match_file+0x60/0x78 iterate_fd+0x128/0x168 selinux_bprm_committing_creds+0x178/0x248 security_bprm_committing_creds+0x30/0x48 install_exec_creds+0x1c/0x68 load_elf_binary+0x3a4/0x14e0 search_binary_handler+0xb0/0x1e0 So, use spinlock to access dmabuf->name to avoid sleep-while-atomic. Cc: [5.3+] Signed-off-by: Charan Teja Kalla Reviewed-by: Michael J. Ruhl Acked-by: Christian König [sumits: added comment to spinlock_t definition to avoid warning] Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/a83e7f0d-4e54-9848-4b58-e1acdbe06735@codeaurora.org Signed-off-by: Sasha Levin --- include/linux/dma-buf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index ec212cb27fdc..12eac4293af6 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -303,6 +303,7 @@ struct dma_buf { void *vmap_ptr; const char *exp_name; const char *name; + spinlock_t name_lock; /* spinlock to protect name access */ struct module *owner; struct list_head list_node; void *priv; -- cgit v1.2.3 From 80fed4024c39d8d7cea91b9ebba2398c9976459f Mon Sep 17 00:00:00 2001 From: Merlijn Wajer Date: Tue, 30 Jun 2020 11:47:04 -0700 Subject: Input: add `SW_MACHINE_COVER` [ Upstream commit c463bb2a8f8d7d97aa414bf7714fc77e9d3b10df ] This event code represents the state of a removable cover of a device. Value 0 means that the cover is open or removed, value 1 means that the cover is closed. Reviewed-by: Sebastian Reichel Acked-by: Tony Lindgren Signed-off-by: Merlijn Wajer Link: https://lore.kernel.org/r/20200612125402.18393-2-merlijn@wizzup.org Signed-off-by: Dmitry Torokhov Signed-off-by: Sasha Levin --- include/linux/mod_devicetable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 953d7ca01eb6..4c56404e53a7 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -318,7 +318,7 @@ struct pcmcia_device_id { #define INPUT_DEVICE_ID_LED_MAX 0x0f #define INPUT_DEVICE_ID_SND_MAX 0x07 #define INPUT_DEVICE_ID_FF_MAX 0x7f -#define INPUT_DEVICE_ID_SW_MAX 0x0f +#define INPUT_DEVICE_ID_SW_MAX 0x10 #define INPUT_DEVICE_ID_PROP_MAX 0x1f #define INPUT_DEVICE_ID_MATCH_BUS 1 -- cgit v1.2.3 From 615f44e047929035f168b165efa6a855ac8deaf9 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Thu, 23 Jul 2020 21:15:46 -0700 Subject: io-mapping: indicate mapping failure commit e0b3e0b1a04367fc15c07f44e78361545b55357c upstream. The !ATOMIC_IOMAP version of io_maping_init_wc will always return success, even when the ioremap fails. Since the ATOMIC_IOMAP version returns NULL when the init fails, and callers check for a NULL return on error this is unexpected. During a device probe, where the ioremap failed, a crash can look like this: BUG: unable to handle page fault for address: 0000000000210000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page Oops: 0002 [#1] PREEMPT SMP CPU: 0 PID: 177 Comm: RIP: 0010:fill_page_dma [i915] gen8_ppgtt_create [i915] i915_ppgtt_create [i915] intel_gt_init [i915] i915_gem_init [i915] i915_driver_probe [i915] pci_device_probe really_probe driver_probe_device The remap failure occurred much earlier in the probe. If it had been propagated, the driver would have exited with an error. Return NULL on ioremap failure. [akpm@linux-foundation.org: detect ioremap_wc() errors earlier] Fixes: cafaf14a5d8f ("io-mapping: Always create a struct to hold metadata about the io-mapping") Signed-off-by: Michael J. Ruhl Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Mike Rapoport Cc: Andy Shevchenko Cc: Chris Wilson Cc: Daniel Vetter Cc: Link: http://lkml.kernel.org/r/20200721171936.81563-1-michael.j.ruhl@intel.com Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/io-mapping.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 6e125e9b4187..b9c91d321240 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -108,9 +108,12 @@ io_mapping_init_wc(struct io_mapping *iomap, resource_size_t base, unsigned long size) { + iomap->iomem = ioremap_wc(base, size); + if (!iomap->iomem) + return NULL; + iomap->base = base; iomap->size = size; - iomap->iomem = ioremap_wc(base, size); #if defined(pgprot_noncached_wc) /* archs can't agree on a name ... */ iomap->prot = pgprot_noncached_wc(PAGE_KERNEL); #elif defined(pgprot_writecombine) -- cgit v1.2.3 From 6d4448ca54bc8831f983e51aed8a11f6c71aea35 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 23 Jul 2020 10:42:09 -0400 Subject: dm integrity: fix integrity recalculation that is improperly skipped commit 5df96f2b9f58a5d2dc1f30fe7de75e197f2c25f2 upstream. Commit adc0daad366b62ca1bce3e2958a40b0b71a8b8b3 ("dm: report suspended device during destroy") broke integrity recalculation. The problem is dm_suspended() returns true not only during suspend, but also during resume. So this race condition could occur: 1. dm_integrity_resume calls queue_work(ic->recalc_wq, &ic->recalc_work) 2. integrity_recalc (&ic->recalc_work) preempts the current thread 3. integrity_recalc calls if (unlikely(dm_suspended(ic->ti))) goto unlock_ret; 4. integrity_recalc exits and no recalculating is done. To fix this race condition, add a function dm_post_suspending that is only true during the postsuspend phase and use it instead of dm_suspended(). Signed-off-by: Mikulas Patocka Fixes: adc0daad366b ("dm: report suspended device during destroy") Cc: stable vger kernel org # v4.18+ Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- include/linux/device-mapper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 399ad8632356..e4e1f5c1f492 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -420,6 +420,7 @@ const char *dm_device_name(struct mapped_device *md); int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid); struct gendisk *dm_disk(struct mapped_device *md); int dm_suspended(struct dm_target *ti); +int dm_post_suspending(struct dm_target *ti); int dm_noflush_suspending(struct dm_target *ti); void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors); void dm_remap_zone_report(struct dm_target *ti, sector_t start, -- cgit v1.2.3 From 182ffc66456b20530def3b2d4f6b9a07545ac475 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 23 Jul 2020 12:00:06 -0700 Subject: tcp: allow at most one TLP probe per flight [ Upstream commit 76be93fc0702322179bb0ea87295d820ee46ad14 ] Previously TLP may send multiple probes of new data in one flight. This happens when the sender is cwnd limited. After the initial TLP containing new data is sent, the sender receives another ACK that acks partial inflight. It may re-arm another TLP timer to send more, if no further ACK returns before the next TLP timeout (PTO) expires. The sender may send in theory a large amount of TLP until send queue is depleted. This only happens if the sender sees such irregular uncommon ACK pattern. But it is generally undesirable behavior during congestion especially. The original TLP design restrict only one TLP probe per inflight as published in "Reducing Web Latency: the Virtue of Gentle Aggression", SIGCOMM 2013. This patch changes TLP to send at most one probe per inflight. Note that if the sender is app-limited, TLP retransmits old data and did not have this issue. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/tcp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 668e25a76d69..358deb4ff830 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -216,6 +216,8 @@ struct tcp_sock { } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; + u8 tlp_retrans:1, /* TLP is a retransmission */ + unused_1:7; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ @@ -238,7 +240,7 @@ struct tcp_sock { save_syn:1, /* Save headers of SYN packet */ is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ syn_smc:1; /* SYN includes SMC */ - u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ + u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ -- cgit v1.2.3 From 475cbcef491ace969e1ba7d30f4743f9840a5937 Mon Sep 17 00:00:00 2001 From: Ron Diskin Date: Sun, 5 Apr 2020 13:58:40 +0300 Subject: net/mlx5e: Modify uplink state on interface up/down [ Upstream commit 7d0314b11cdd92bca8b89684c06953bf114605fc ] When setting the PF interface up/down, notify the firmware to update uplink state via MODIFY_VPORT_STATE, when E-Switch is enabled. This behavior will prevent sending traffic out on uplink port when PF is down, such as sending traffic from a VF interface which is still up. Currently when calling mlx5e_open/close(), the driver only sends PAOS command to notify the firmware to set the physical port state to up/down, however, it is not sufficient. When VF is in "auto" state, it follows the uplink state, which was not updated on mlx5e_open/close() before this patch. When switchdev mode is enabled and uplink representor is first enabled, set the uplink port state value back to its FW default "AUTO". Fixes: 63bfd399de55 ("net/mlx5e: Send PAOS command on interface up/down") Signed-off-by: Ron Diskin Reviewed-by: Roi Dayan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Sasha Levin --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index acd859ea09d4..aba56077cfda 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4177,6 +4177,7 @@ struct mlx5_ifc_query_vport_state_out_bits { enum { MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT = 0x0, MLX5_VPORT_STATE_OP_MOD_ESW_VPORT = 0x1, + MLX5_VPORT_STATE_OP_MOD_UPLINK = 0x2, }; struct mlx5_ifc_arm_monitor_counter_in_bits { -- cgit v1.2.3 From c3883876d3f154d4eb3c9a4406c0489996806e0d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 24 Jul 2020 20:12:53 +1000 Subject: rhashtable: Fix unprotected RCU dereference in __rht_ptr [ Upstream commit 1748f6a2cbc4694523f16da1c892b59861045b9d ] The rcu_dereference call in rht_ptr_rcu is completely bogus because we've already dereferenced the value in __rht_ptr and operated on it. This causes potential double readings which could be fatal. The RCU dereference must occur prior to the comparison in __rht_ptr. This patch changes the order of RCU dereference so that it is done first and the result is then fed to __rht_ptr. The RCU marking changes have been minimised using casts which will be removed in a follow-up patch. Fixes: ba6306e3f648 ("rhashtable: Remove RCU marking from...") Reported-by: "Gong, Sishuai" Signed-off-by: Herbert Xu Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/rhashtable.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index beb9a9da1699..c5bf21261bb1 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -349,11 +349,11 @@ static inline void rht_unlock(struct bucket_table *tbl, local_bh_enable(); } -static inline struct rhash_head __rcu *__rht_ptr( - struct rhash_lock_head *const *bkt) +static inline struct rhash_head *__rht_ptr( + struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) { - return (struct rhash_head __rcu *) - ((unsigned long)*bkt & ~BIT(0) ?: + return (struct rhash_head *) + ((unsigned long)p & ~BIT(0) ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } @@ -365,25 +365,26 @@ static inline struct rhash_head __rcu *__rht_ptr( * access is guaranteed, such as when destroying the table. */ static inline struct rhash_head *rht_ptr_rcu( - struct rhash_lock_head *const *bkt) + struct rhash_lock_head *const *p) { - struct rhash_head __rcu *p = __rht_ptr(bkt); - - return rcu_dereference(p); + struct rhash_lock_head __rcu *const *bkt = (void *)p; + return __rht_ptr(rcu_dereference(*bkt), bkt); } static inline struct rhash_head *rht_ptr( - struct rhash_lock_head *const *bkt, + struct rhash_lock_head *const *p, struct bucket_table *tbl, unsigned int hash) { - return rht_dereference_bucket(__rht_ptr(bkt), tbl, hash); + struct rhash_lock_head __rcu *const *bkt = (void *)p; + return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); } static inline struct rhash_head *rht_ptr_exclusive( - struct rhash_lock_head *const *bkt) + struct rhash_lock_head *const *p) { - return rcu_dereference_protected(__rht_ptr(bkt), 1); + struct rhash_lock_head __rcu *const *bkt = (void *)p; + return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); } static inline void rht_assign_locked(struct rhash_lock_head **bkt, -- cgit v1.2.3 From c15a77bdda2c4f8acaa3e436128630a81f904ae7 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Fri, 10 Jul 2020 15:23:19 +0200 Subject: random32: update the net random state on interrupt and activity commit f227e3ec3b5cad859ad15666874405e8c1bbc1d4 upstream. This modifies the first 32 bits out of the 128 bits of a random CPU's net_rand_state on interrupt or CPU activity to complicate remote observations that could lead to guessing the network RNG's internal state. Note that depending on some network devices' interrupt rate moderation or binding, this re-seeding might happen on every packet or even almost never. In addition, with NOHZ some CPUs might not even get timer interrupts, leaving their local state rarely updated, while they are running networked processes making use of the random state. For this reason, we also perform this update in update_process_times() in order to at least update the state when there is user or system activity, since it's the only case we care about. Reported-by: Amit Klein Suggested-by: Linus Torvalds Cc: Eric Dumazet Cc: "Jason A. Donenfeld" Cc: Andy Lutomirski Cc: Kees Cook Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Signed-off-by: Willy Tarreau Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/random.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index f189c927fdea..4d080e5ef6eb 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -9,6 +9,7 @@ #include #include +#include #include @@ -117,6 +118,8 @@ struct rnd_state { __u32 s1, s2, s3, s4; }; +DECLARE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; + u32 prandom_u32_state(struct rnd_state *state); void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes); void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state); -- cgit v1.2.3 From 7471f3228e7ad9641a8b2330960d27eb00af0d0f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 30 Jul 2020 07:59:24 +0200 Subject: random: fix circular include dependency on arm64 after addition of percpu.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1c9df907da83812e4f33b59d3d142c864d9da57f upstream. Daniel Díaz and Kees Cook independently reported that commit f227e3ec3b5c ("random32: update the net random state on interrupt and activity") broke arm64 due to a circular dependency on include files since the addition of percpu.h in random.h. The correct fix would definitely be to move all the prandom32 stuff out of random.h but for backporting, a smaller solution is preferred. This one replaces linux/percpu.h with asm/percpu.h, and this fixes the problem on x86_64, arm64, arm, and mips. Note that moving percpu.h around didn't change anything and that removing it entirely broke differently. When backporting, such options might still be considered if this patch fails to help. [ It turns out that an alternate fix seems to be to just remove the troublesome remove from the arm64 that causes the circular dependency. But we might as well do the whole belt-and-suspenders thing, and minimize inclusion in too. Either will fix the problem, and both are good changes. - Linus ] Reported-by: Daniel Díaz Reported-by: Kees Cook Tested-by: Marc Zyngier Fixes: f227e3ec3b5c Cc: Stephen Rothwell Signed-off-by: Willy Tarreau Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/random.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index 4d080e5ef6eb..e5d63f875c02 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -9,7 +9,7 @@ #include #include -#include +#include #include -- cgit v1.2.3 From c131009987f2501642e3b3fa328ef6c65cbbc72a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 29 Jul 2020 19:11:00 -0700 Subject: random32: remove net_rand_state from the latent entropy gcc plugin commit 83bdc7275e6206f560d247be856bceba3e1ed8f2 upstream. It turns out that the plugin right now ends up being really unhappy about the change from 'static' to 'extern' storage that happened in commit f227e3ec3b5c ("random32: update the net random state on interrupt and activity"). This is probably a trivial fix for the latent_entropy plugin, but for now, just remove net_rand_state from the list of things the plugin worries about. Reported-by: Stephen Rothwell Cc: Emese Revfy Cc: Kees Cook Cc: Willy Tarreau Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/random.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/random.h b/include/linux/random.h index e5d63f875c02..2625c0deda47 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -118,7 +118,7 @@ struct rnd_state { __u32 s1, s2, s3, s4; }; -DECLARE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; +DECLARE_PER_CPU(struct rnd_state, net_rand_state); u32 prandom_u32_state(struct rnd_state *state); void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes); -- cgit v1.2.3 From f06d60ff794adeca67795dd12a8085cc921c4279 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 31 Jul 2020 07:51:14 +0200 Subject: random32: move the pseudo-random 32-bit definitions to prandom.h commit c0842fbc1b18c7a044e6ff3e8fa78bfa822c7d1a upstream. The addition of percpu.h to the list of includes in random.h revealed some circular dependencies on arm64 and possibly other platforms. This include was added solely for the pseudo-random definitions, which have nothing to do with the rest of the definitions in this file but are still there for legacy reasons. This patch moves the pseudo-random parts to linux/prandom.h and the percpu.h include with it, which is now guarded by _LINUX_PRANDOM_H and protected against recursive inclusion. A further cleanup step would be to remove this from entirely, and make people who use the prandom infrastructure include just the new header file. That's a bit of a churn patch, but grepping for "prandom_" and "next_pseudo_random32" "struct rnd_state" should catch most users. But it turns out that that nice cleanup step is fairly painful, because a _lot_ of code currently seems to depend on the implicit include of , which can currently come in a lot of ways, including such fairly core headfers as . So the "nice cleanup" part may or may never happen. Fixes: 1c9df907da83 ("random: fix circular include dependency on arm64 after addition of percpu.h") Tested-by: Guenter Roeck Acked-by: Willy Tarreau Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/prandom.h | 78 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/random.h | 66 +++-------------------------------------- 2 files changed, 82 insertions(+), 62 deletions(-) create mode 100644 include/linux/prandom.h (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h new file mode 100644 index 000000000000..aa16e6468f91 --- /dev/null +++ b/include/linux/prandom.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/prandom.h + * + * Include file for the fast pseudo-random 32-bit + * generation. + */ +#ifndef _LINUX_PRANDOM_H +#define _LINUX_PRANDOM_H + +#include +#include + +u32 prandom_u32(void); +void prandom_bytes(void *buf, size_t nbytes); +void prandom_seed(u32 seed); +void prandom_reseed_late(void); + +struct rnd_state { + __u32 s1, s2, s3, s4; +}; + +DECLARE_PER_CPU(struct rnd_state, net_rand_state); + +u32 prandom_u32_state(struct rnd_state *state); +void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes); +void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state); + +#define prandom_init_once(pcpu_state) \ + DO_ONCE(prandom_seed_full_state, (pcpu_state)) + +/** + * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro) + * @ep_ro: right open interval endpoint + * + * Returns a pseudo-random number that is in interval [0, ep_ro). Note + * that the result depends on PRNG being well distributed in [0, ~0U] + * u32 space. Here we use maximally equidistributed combined Tausworthe + * generator, that is, prandom_u32(). This is useful when requesting a + * random index of an array containing ep_ro elements, for example. + * + * Returns: pseudo-random number in interval [0, ep_ro) + */ +static inline u32 prandom_u32_max(u32 ep_ro) +{ + return (u32)(((u64) prandom_u32() * ep_ro) >> 32); +} + +/* + * Handle minimum values for seeds + */ +static inline u32 __seed(u32 x, u32 m) +{ + return (x < m) ? x + m : x; +} + +/** + * prandom_seed_state - set seed for prandom_u32_state(). + * @state: pointer to state structure to receive the seed. + * @seed: arbitrary 64-bit value to use as a seed. + */ +static inline void prandom_seed_state(struct rnd_state *state, u64 seed) +{ + u32 i = (seed >> 32) ^ (seed << 10) ^ seed; + + state->s1 = __seed(i, 2U); + state->s2 = __seed(i, 8U); + state->s3 = __seed(i, 16U); + state->s4 = __seed(i, 128U); +} + +/* Pseudo random number generator from numerical recipes. */ +static inline u32 next_pseudo_random32(u32 seed) +{ + return seed * 1664525 + 1013904223; +} + +#endif diff --git a/include/linux/random.h b/include/linux/random.h index 2625c0deda47..5b3ec7d2791f 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -9,7 +9,6 @@ #include #include -#include #include @@ -109,63 +108,12 @@ declare_get_random_var_wait(long) unsigned long randomize_page(unsigned long start, unsigned long range); -u32 prandom_u32(void); -void prandom_bytes(void *buf, size_t nbytes); -void prandom_seed(u32 seed); -void prandom_reseed_late(void); - -struct rnd_state { - __u32 s1, s2, s3, s4; -}; - -DECLARE_PER_CPU(struct rnd_state, net_rand_state); - -u32 prandom_u32_state(struct rnd_state *state); -void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes); -void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state); - -#define prandom_init_once(pcpu_state) \ - DO_ONCE(prandom_seed_full_state, (pcpu_state)) - -/** - * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro) - * @ep_ro: right open interval endpoint - * - * Returns a pseudo-random number that is in interval [0, ep_ro). Note - * that the result depends on PRNG being well distributed in [0, ~0U] - * u32 space. Here we use maximally equidistributed combined Tausworthe - * generator, that is, prandom_u32(). This is useful when requesting a - * random index of an array containing ep_ro elements, for example. - * - * Returns: pseudo-random number in interval [0, ep_ro) - */ -static inline u32 prandom_u32_max(u32 ep_ro) -{ - return (u32)(((u64) prandom_u32() * ep_ro) >> 32); -} - /* - * Handle minimum values for seeds - */ -static inline u32 __seed(u32 x, u32 m) -{ - return (x < m) ? x + m : x; -} - -/** - * prandom_seed_state - set seed for prandom_u32_state(). - * @state: pointer to state structure to receive the seed. - * @seed: arbitrary 64-bit value to use as a seed. + * This is designed to be standalone for just prandom + * users, but for now we include it from + * for legacy reasons. */ -static inline void prandom_seed_state(struct rnd_state *state, u64 seed) -{ - u32 i = (seed >> 32) ^ (seed << 10) ^ seed; - - state->s1 = __seed(i, 2U); - state->s2 = __seed(i, 8U); - state->s3 = __seed(i, 16U); - state->s4 = __seed(i, 128U); -} +#include #ifdef CONFIG_ARCH_RANDOM # include @@ -196,10 +144,4 @@ static inline bool arch_has_random_seed(void) } #endif -/* Pseudo random number generator from numerical recipes. */ -static inline u32 next_pseudo_random32(u32 seed) -{ - return seed * 1664525 + 1013904223; -} - #endif /* _LINUX_RANDOM_H */ -- cgit v1.2.3 From ca7ace8fd26d9ae4be3cf69f474ddcfb0e8506ce Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:28 +0100 Subject: bpf: sockmap: Require attach_bpf_fd when detaching a program commit bb0de3131f4c60a9bf976681e0fe4d1e55c7a821 upstream. The sockmap code currently ignores the value of attach_bpf_fd when detaching a program. This is contrary to the usual behaviour of checking that attach_bpf_fd represents the currently attached program. Ensure that attach_bpf_fd is indeed the currently attached program. It turns out that all sockmap selftests already do this, which indicates that this is unlikely to cause breakage. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-5-lmb@cloudflare.com Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf.h | 13 +++++++++++-- include/linux/skmsg.h | 13 +++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3bf3835d0e86..7aa0d8b5aaf0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -956,11 +956,14 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_BPF_STREAM_PARSER) -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which); +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); #else static inline int sock_map_prog_update(struct bpf_map *map, - struct bpf_prog *prog, u32 which) + struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { return -EOPNOTSUPP; } @@ -970,6 +973,12 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr, { return -EINVAL; } + +static inline int sock_map_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) +{ + return -EOPNOTSUPP; +} #endif #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 4bdb5e4bbd6a..20f3550b0b11 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -450,6 +450,19 @@ static inline void psock_set_prog(struct bpf_prog **pprog, bpf_prog_put(prog); } +static inline int psock_replace_prog(struct bpf_prog **pprog, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + if (cmpxchg(pprog, old, prog) != old) + return -ENOENT; + + if (old) + bpf_prog_put(old); + + return 0; +} + static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); -- cgit v1.2.3 From 6059000e145feb0c0399717f34a70b8d74fe030d Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Sun, 19 Jan 2020 15:29:22 -0800 Subject: Drivers: hv: vmbus: Ignore CHANNELMSG_TL_CONNECT_RESULT(23) [ Upstream commit ddc9d357b991838c2d975e8d7e4e9db26f37a7ff ] When a Linux hv_sock app tries to connect to a Service GUID on which no host app is listening, a recent host (RS3+) sends a CHANNELMSG_TL_CONNECT_RESULT (23) message to Linux and this triggers such a warning: unknown msgtype=23 WARNING: CPU: 2 PID: 0 at drivers/hv/vmbus_drv.c:1031 vmbus_on_msg_dpc Actually Linux can safely ignore the message because the Linux app's connect() will time out in 2 seconds: see VSOCK_DEFAULT_CONNECT_TIMEOUT and vsock_stream_connect(). We don't bother to make use of the message because: 1) it's only supported on recent hosts; 2) a non-trivial effort is required to use the message in Linux, but the benefit is small. So, let's not see the warning by silently ignoring the message. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- include/linux/hyperv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b4a017093b69..67d9b5a37460 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -423,6 +423,8 @@ enum vmbus_channel_message_type { CHANNELMSG_19 = 19, CHANNELMSG_20 = 20, CHANNELMSG_TL_CONNECT_REQUEST = 21, + CHANNELMSG_22 = 22, + CHANNELMSG_TL_CONNECT_RESULT = 23, CHANNELMSG_COUNT }; -- cgit v1.2.3 From 11e64146dc698f6543706d2865fb01cfea7e7b42 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:18 +0000 Subject: xattr: break delegations in {set,remove}xattr commit 08b5d5014a27e717826999ad20e394a8811aae92 upstream. set/removexattr on an exported filesystem should break NFS delegations. This is true in general, but also for the upcoming support for RFC 8726 (NFSv4 extended attribute support). Make sure that they do. Additionally, they need to grow a _locked variant, since callers might call this with i_rwsem held (like the NFS server code). Cc: stable@vger.kernel.org # v4.9+ Cc: linux-fsdevel@vger.kernel.org Cc: Al Viro Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- include/linux/xattr.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 6dad031be3c2..3a71ad716da5 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -51,8 +51,10 @@ ssize_t vfs_getxattr(struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); int __vfs_setxattr(struct dentry *, struct inode *, const char *, const void *, size_t, int); int __vfs_setxattr_noperm(struct dentry *, const char *, const void *, size_t, int); +int __vfs_setxattr_locked(struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct dentry *, const char *); +int __vfs_removexattr_locked(struct dentry *, const char *, struct inode **); int vfs_removexattr(struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); -- cgit v1.2.3 From 512570b17807ec4f7d0e2f0bd349445e33c051db Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 Mar 2020 14:45:53 -0500 Subject: nfsd: Fix NFSv4 READ on RDMA when using readv commit 412055398b9e67e07347a936fc4a6adddabe9cf4 upstream. svcrdma expects that the payload falls precisely into the xdr_buf page vector. This does not seem to be the case for nfsd4_encode_readv(). This code is called only when fops->splice_read is missing or when RQ_SPLICE_OK is clear, so it's not a noticeable problem in many common cases. Add new transport method: ->xpo_read_payload so that when a READ payload does not fit exactly in rq_res's page vector, the XDR encoder can inform the RPC transport exactly where that payload is, without the payload's XDR pad. That way, when a Write chunk is present, the transport knows what byte range in the Reply message is supposed to be matched with the chunk. Note that the Linux NFS server implementation of NFS/RDMA can currently handle only one Write chunk per RPC-over-RDMA message. This simplifies the implementation of this fix. Fixes: b04209806384 ("nfsd4: allow exotic read compounds") Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=198053 Signed-off-by: Chuck Lever Cc: Timo Rothenpieler Signed-off-by: Greg Kroah-Hartman --- include/linux/sunrpc/svc.h | 3 +++ include/linux/sunrpc/svc_rdma.h | 8 +++++++- include/linux/sunrpc/svc_xprt.h | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 1afe38eb33f7..82665ff360fd 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -517,6 +517,9 @@ void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); char * svc_print_addr(struct svc_rqst *, char *, size_t); +int svc_encode_read_payload(struct svc_rqst *rqstp, + unsigned int offset, + unsigned int length); unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages, struct kvec *first, size_t total); diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index fddad9f5b390..26f282e5e082 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -137,6 +137,8 @@ struct svc_rdma_recv_ctxt { unsigned int rc_page_count; unsigned int rc_hdr_count; u32 rc_inv_rkey; + unsigned int rc_read_payload_offset; + unsigned int rc_read_payload_length; struct page *rc_pages[RPCSVC_MAXPAGES]; }; @@ -171,7 +173,9 @@ extern int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head, __be32 *p); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - __be32 *wr_ch, struct xdr_buf *xdr); + __be32 *wr_ch, struct xdr_buf *xdr, + unsigned int offset, + unsigned long length); extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, bool writelist, struct xdr_buf *xdr); @@ -190,6 +194,8 @@ extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt, struct xdr_buf *xdr, __be32 *wr_lst); extern int svc_rdma_sendto(struct svc_rqst *); +extern int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length); /* svc_rdma_transport.c */ extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index ea6f46be9cb7..9e1e046de176 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -21,6 +21,8 @@ struct svc_xprt_ops { int (*xpo_has_wspace)(struct svc_xprt *); int (*xpo_recvfrom)(struct svc_rqst *); int (*xpo_sendto)(struct svc_rqst *); + int (*xpo_read_payload)(struct svc_rqst *, unsigned int, + unsigned int); void (*xpo_release_rqst)(struct svc_rqst *); void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); -- cgit v1.2.3 From 16d2fb138f98eb365f48f19fd3dce914be012b21 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 30 Jul 2020 15:45:54 -0700 Subject: tracepoint: Mark __tracepoint_string's __used commit f3751ad0116fb6881f2c3c957d66a9327f69cefb upstream. __tracepoint_string's have their string data stored in .rodata, and an address to that data stored in the "__tracepoint_str" section. Functions that refer to those strings refer to the symbol of the address. Compiler optimization can replace those address references with references directly to the string data. If the address doesn't appear to have other uses, then it appears dead to the compiler and is removed. This can break the /tracing/printk_formats sysfs node which iterates the addresses stored in the "__tracepoint_str" section. Like other strings stored in custom sections in this header, mark these __used to inform the compiler that there are other non-obvious users of the address, so they should still be emitted. Link: https://lkml.kernel.org/r/20200730224555.2142154-2-ndesaulniers@google.com Cc: Ingo Molnar Cc: Miguel Ojeda Cc: stable@vger.kernel.org Fixes: 102c9323c35a8 ("tracing: Add __tracepoint_string() to export string pointers") Reported-by: Tim Murray Reported-by: Simon MacMullen Suggested-by: Greg Hackmann Signed-off-by: Nick Desaulniers Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 1fb11daa5c53..57ce5af258a3 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -362,7 +362,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static const char *___tp_str __tracepoint_string = str; \ ___tp_str; \ }) -#define __tracepoint_string __attribute__((section("__tracepoint_str"))) +#define __tracepoint_string __attribute__((section("__tracepoint_str"), used)) #else /* * tracepoint_string() is used to save the string address for userspace -- cgit v1.2.3 From 3a17c7bfe705eb64d6acfc388882e2db2d373bed Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 10 Jul 2020 14:29:55 -0500 Subject: tpm: Require that all digests are present in TCG_PCR_EVENT2 structures [ Upstream commit 7f3d176f5f7e3f0477bf82df0f600fcddcdcc4e4 ] Require that the TCG_PCR_EVENT2.digests.count value strictly matches the value of TCG_EfiSpecIdEvent.numberOfAlgorithms in the event field of the TCG_PCClientPCREvent event log header. Also require that TCG_EfiSpecIdEvent.numberOfAlgorithms is non-zero. The TCG PC Client Platform Firmware Profile Specification section 9.1 (Family "2.0", Level 00 Revision 1.04) states: For each Hash algorithm enumerated in the TCG_PCClientPCREvent entry, there SHALL be a corresponding digest in all TCG_PCR_EVENT2 structures. Note: This includes EV_NO_ACTION events which do not extend the PCR. Section 9.4.5.1 provides this description of TCG_EfiSpecIdEvent.numberOfAlgorithms: The number of Hash algorithms in the digestSizes field. This field MUST be set to a value of 0x01 or greater. Enforce these restrictions, as required by the above specification, in order to better identify and ignore invalid sequences of bytes at the end of an otherwise valid TPM2 event log. Firmware doesn't always have the means necessary to inform the kernel of the actual event log size so the kernel's event log parsing code should be stringent when parsing the event log for resiliency against firmware bugs. This is true, for example, when firmware passes the event log to the kernel via a reserved memory region described in device tree. POWER and some ARM systems use the "linux,sml-base" and "linux,sml-size" device tree properties to describe the memory region used to pass the event log from firmware to the kernel. Unfortunately, the "linux,sml-size" property describes the size of the entire reserved memory region rather than the size of the event long within the memory region and the event log format does not include information describing the size of the event log. tpm_read_log_of(), in drivers/char/tpm/eventlog/of.c, is where the "linux,sml-size" property is used. At the end of that function, log->bios_event_log_end is pointing at the end of the reserved memory region. That's typically 0x10000 bytes offset from "linux,sml-base", depending on what's defined in the device tree source. The firmware event log only fills a portion of those 0x10000 bytes and the rest of the memory region should be zeroed out by firmware. Even in the case of a properly zeroed bytes in the remainder of the memory region, the only thing allowing the kernel's event log parser to detect the end of the event log is the following conditional in __calc_tpm2_event_size(): if (event_type == 0 && event_field->event_size == 0) size = 0; If that wasn't there, __calc_tpm2_event_size() would think that a 16 byte sequence of zeroes, following an otherwise valid event log, was a valid event. However, problems can occur if a single bit is set in the offset corresponding to either the TCG_PCR_EVENT2.eventType or TCG_PCR_EVENT2.eventSize fields, after the last valid event log entry. This could confuse the parser into thinking that an additional entry is present in the event log and exposing this invalid entry to userspace in the /sys/kernel/security/tpm0/binary_bios_measurements file. Such problems have been seen if firmware does not fully zero the memory region upon a warm reboot. This patch significantly raises the bar on how difficult it is for stale/invalid memory to confuse the kernel's event log parser but there's still, ultimately, a reliance on firmware to properly initialize the remainder of the memory region reserved for the event log as the parser cannot be expected to detect a stale but otherwise properly formatted firmware event log entry. Fixes: fd5c78694f3f ("tpm: fix handling of the TPM 2.0 event logs") Signed-off-by: Tyler Hicks Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: Sasha Levin --- include/linux/tpm_eventlog.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index eccfd3a4e4c8..f3caeeb7a0d0 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -211,9 +211,16 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, efispecid = (struct tcg_efi_specid_event_head *)event_header->event; - /* Check if event is malformed. */ + /* + * Perform validation of the event in order to identify malformed + * events. This function may be asked to parse arbitrary byte sequences + * immediately following a valid event log. The caller expects this + * function to recognize that the byte sequence is not a valid event + * and to return an event size of 0. + */ if (memcmp(efispecid->signature, TCG_SPECID_SIG, - sizeof(TCG_SPECID_SIG)) || count > efispecid->num_algs) { + sizeof(TCG_SPECID_SIG)) || + !efispecid->num_algs || count != efispecid->num_algs) { size = 0; goto out; } -- cgit v1.2.3 From 1a2e558c8b3084a292f461fb9adca5bb78792ee5 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Fri, 3 Jul 2020 01:55:59 +0300 Subject: tpm: Unify the mismatching TPM space buffer sizes commit 6c4e79d99e6f42b79040f1a33cd4018f5425030b upstream. The size of the buffers for storing context's and sessions can vary from arch to arch as PAGE_SIZE can be anything between 4 kB and 256 kB (the maximum for PPC64). Define a fixed buffer size set to 16 kB. This should be enough for most use with three handles (that is how many we allow at the moment). Parametrize the buffer size while doing this, so that it is easier to revisit this later on if required. Cc: stable@vger.kernel.org Reported-by: Stefan Berger Fixes: 745b361e989a ("tpm: infrastructure for TPM spaces") Reviewed-by: Jerry Snitselaar Tested-by: Stefan Berger Signed-off-by: Jarkko Sakkinen Signed-off-by: Greg Kroah-Hartman --- include/linux/tpm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 53c0ea9ec9df..77fdc988c610 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -93,6 +93,7 @@ struct tpm_space { u8 *context_buf; u32 session_tbl[3]; u8 *session_buf; + u32 buf_size; }; struct tpm_bios_log { -- cgit v1.2.3 From 5be9072b8121b578b1429a35d5b158d55583e043 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 10 Aug 2020 11:21:11 -0700 Subject: bitfield.h: don't compile-time validate _val in FIELD_FIT commit 444da3f52407d74c9aa12187ac6b01f76ee47d62 upstream. When ur_load_imm_any() is inlined into jeq_imm(), it's possible for the compiler to deduce a case where _val can only have the value of -1 at compile time. Specifically, /* struct bpf_insn: _s32 imm */ u64 imm = insn->imm; /* sign extend */ if (imm >> 32) { /* non-zero only if insn->imm is negative */ /* inlined from ur_load_imm_any */ u32 __imm = imm >> 32; /* therefore, always 0xffffffff */ if (__builtin_constant_p(__imm) && __imm > 255) compiletime_assert_XXX() This can result in tripping a BUILD_BUG_ON() in __BF_FIELD_CHECK() that checks that a given value is representable in one byte (interpreted as unsigned). FIELD_FIT() should return true or false at runtime for whether a value can fit for not. Don't break the build over a value that's too large for the mask. We'd prefer to keep the inlining and compiler optimizations though we know this case will always return false. Cc: stable@vger.kernel.org Fixes: 1697599ee301a ("bitfield.h: add FIELD_FIT() helper") Link: https://lore.kernel.org/kernel-hardening/CAK7LNASvb0UDJ0U5wkYYRzTAdnEs64HjXpEUL7d=V0CXiAXcNw@mail.gmail.com/ Reported-by: Masahiro Yamada Debugged-by: Sami Tolvanen Signed-off-by: Jakub Kicinski Signed-off-by: Nick Desaulniers Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/bitfield.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 4bbb5f1c8b5b..4c0224ff0a14 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -64,7 +64,7 @@ */ #define FIELD_FIT(_mask, _val) \ ({ \ - __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_FIT: "); \ + __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_FIT: "); \ !((((typeof(_mask))_val) << __bf_shf(_mask)) & ~(_mask)); \ }) -- cgit v1.2.3 From a11f42496ac80813d94b30d51868d4610bb5e5d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Jul 2020 22:44:41 +0200 Subject: genirq/affinity: Make affinity setting if activated opt-in commit f0c7baca180046824e07fc5f1326e83a8fd150c7 upstream. John reported that on a RK3288 system the perf per CPU interrupts are all affine to CPU0 and provided the analysis: "It looks like what happens is that because the interrupts are not per-CPU in the hardware, armpmu_request_irq() calls irq_force_affinity() while the interrupt is deactivated and then request_irq() with IRQF_PERCPU | IRQF_NOBALANCING. Now when irq_startup() runs with IRQ_STARTUP_NORMAL, it calls irq_setup_affinity() which returns early because IRQF_PERCPU and IRQF_NOBALANCING are set, leaving the interrupt on its original CPU." This was broken by the recent commit which blocked interrupt affinity setting in hardware before activation of the interrupt. While this works in general, it does not work for this particular case. As contrary to the initial analysis not all interrupt chip drivers implement an activate callback, the safe cure is to make the deferred interrupt affinity setting at activation time opt-in. Implement the necessary core logic and make the two irqchip implementations for which this is required opt-in. In hindsight this would have been the right thing to do, but ... Fixes: baedb87d1b53 ("genirq/affinity: Handle affinity setting on inactive interrupts correctly") Reported-by: John Keeping Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Acked-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87blk4tzgm.fsf@nanos.tec.linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/irq.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index f8755e5fcd74..e9e69c511ea9 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -211,6 +211,8 @@ struct irq_data { * IRQD_CAN_RESERVE - Can use reservation mode * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change * required + * IRQD_AFFINITY_ON_ACTIVATE - Affinity is set on activation. Don't call + * irq_chip::irq_set_affinity() when deactivated. */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -234,6 +236,7 @@ enum { IRQD_DEFAULT_TRIGGER_SET = (1 << 25), IRQD_CAN_RESERVE = (1 << 26), IRQD_MSI_NOMASK_QUIRK = (1 << 27), + IRQD_AFFINITY_ON_ACTIVATE = (1 << 29), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -408,6 +411,16 @@ static inline bool irqd_msi_nomask_quirk(struct irq_data *d) return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK; } +static inline void irqd_set_affinity_on_activate(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE; +} + +static inline bool irqd_affinity_on_activate(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_AFFINITY_ON_ACTIVATE; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) -- cgit v1.2.3 From 043bc80399a80625a66e0eae52f4e6c819200637 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Fri, 24 Jul 2020 09:49:14 +0800 Subject: iommu/vt-d: Enforce PASID devTLB field mask [ Upstream commit 5f77d6ca5ca74e4b4a5e2e010f7ff50c45dea326 ] Set proper masks to avoid invalid input spillover to reserved bits. Signed-off-by: Liu Yi L Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200724014925.15523-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Sasha Levin --- include/linux/intel-iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 1e5dad8b8e59..ed870da78326 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -359,8 +359,8 @@ enum { #define QI_DEV_EIOTLB_ADDR(a) ((u64)(a) & VTD_PAGE_MASK) #define QI_DEV_EIOTLB_SIZE (((u64)1) << 11) -#define QI_DEV_EIOTLB_GLOB(g) ((u64)g) -#define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32) +#define QI_DEV_EIOTLB_GLOB(g) ((u64)(g) & 0x1) +#define QI_DEV_EIOTLB_PASID(p) ((u64)((p) & 0xfffff) << 32) #define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16) #define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4) #define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \ -- cgit v1.2.3 From 12a9bec2bd4ebb59fc7422cff369e665d2499f11 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 6 Aug 2020 23:25:01 -0700 Subject: efi: provide empty efi_enter_virtual_mode implementation [ Upstream commit 2c547f9da0539ad1f7ef7f08c8c82036d61b011a ] When CONFIG_EFI is not enabled, we might get an undefined reference to efi_enter_virtual_mode() error, if this efi_enabled() call isn't inlined into start_kernel(). This happens in particular, if start_kernel() is annodated with __no_sanitize_address. Reported-by: kernel test robot Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Acked-by: Ard Biesheuvel Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Elena Petrova Cc: Marco Elver Cc: Vincenzo Frascino Cc: Walter Wu Link: http://lkml.kernel.org/r/6514652d3a32d3ed33d6eb5c91d0af63bf0d1a0c.1596544734.git.andreyknvl@google.com Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/efi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index d87acf62958e..13ed2c6b13f8 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1039,7 +1039,11 @@ extern void *efi_get_pal_addr (void); extern void efi_map_pal_code (void); extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg); extern void efi_gettimeofday (struct timespec64 *ts); +#ifdef CONFIG_EFI extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if possible */ +#else +static inline void efi_enter_virtual_mode (void) {} +#endif #ifdef CONFIG_X86 extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size, -- cgit v1.2.3 From 3803312a3c55244620a2073f288eebe54454f473 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 10 Aug 2020 13:52:15 +0200 Subject: netfilter: avoid ipv6 -> nf_defrag_ipv6 module dependency [ Upstream commit 2404b73c3f1a5f15726c6ecd226b56f6f992767f ] nf_ct_frag6_gather is part of nf_defrag_ipv6.ko, not ipv6 core. The current use of the netfilter ipv6 stub indirections causes a module dependency between ipv6 and nf_defrag_ipv6. This prevents nf_defrag_ipv6 module from being removed because ipv6 can't be unloaded. Remove the indirection and always use a direct call. This creates a depency from nf_conntrack_bridge to nf_defrag_ipv6 instead: modinfo nf_conntrack depends: nf_conntrack,nf_defrag_ipv6,bridge .. and nf_conntrack already depends on nf_defrag_ipv6 anyway. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/linux/netfilter_ipv6.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index aac42c28fe62..9b67394471e1 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -58,7 +58,6 @@ struct nf_ipv6_ops { int (*output)(struct net *, struct sock *, struct sk_buff *)); int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry); #if IS_MODULE(CONFIG_IPV6) - int (*br_defrag)(struct net *net, struct sk_buff *skb, u32 user); int (*br_fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, struct nf_bridge_frag_data *data, @@ -117,23 +116,6 @@ static inline int nf_ip6_route(struct net *net, struct dst_entry **dst, #include -static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb, - u32 user) -{ -#if IS_MODULE(CONFIG_IPV6) - const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); - - if (!v6_ops) - return 1; - - return v6_ops->br_defrag(net, skb, user); -#elif IS_BUILTIN(CONFIG_IPV6) - return nf_ct_frag6_gather(net, skb, user); -#else - return 1; -#endif -} - int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct nf_bridge_frag_data *data, int (*output)(struct net *, struct sock *sk, -- cgit v1.2.3 From cb0c74450072f9e5fb6e8b0bd2833225c7cfb0ce Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 29 May 2020 15:05:22 +0200 Subject: writeback: Avoid skipping inode writeback commit 5afced3bf28100d81fb2fe7e98918632a08feaf5 upstream. Inode's i_io_list list head is used to attach inode to several different lists - wb->{b_dirty, b_dirty_time, b_io, b_more_io}. When flush worker prepares a list of inodes to writeback e.g. for sync(2), it moves inodes to b_io list. Thus it is critical for sync(2) data integrity guarantees that inode is not requeued to any other writeback list when inode is queued for processing by flush worker. That's the reason why writeback_single_inode() does not touch i_io_list (unless the inode is completely clean) and why __mark_inode_dirty() does not touch i_io_list if I_SYNC flag is set. However there are two flaws in the current logic: 1) When inode has only I_DIRTY_TIME set but it is already queued in b_io list due to sync(2), concurrent __mark_inode_dirty(inode, I_DIRTY_SYNC) can still move inode back to b_dirty list resulting in skipping writeback of inode time stamps during sync(2). 2) When inode is on b_dirty_time list and writeback_single_inode() races with __mark_inode_dirty() like: writeback_single_inode() __mark_inode_dirty(inode, I_DIRTY_PAGES) inode->i_state |= I_SYNC __writeback_single_inode() inode->i_state |= I_DIRTY_PAGES; if (inode->i_state & I_SYNC) bail if (!(inode->i_state & I_DIRTY_ALL)) - not true so nothing done We end up with I_DIRTY_PAGES inode on b_dirty_time list and thus standard background writeback will not writeback this inode leading to possible dirty throttling stalls etc. (thanks to Martijn Coenen for this analysis). Fix these problems by tracking whether inode is queued in b_io or b_more_io lists in a new I_SYNC_QUEUED flag. When this flag is set, we know flush worker has queued inode and we should not touch i_io_list. On the other hand we also know that once flush worker is done with the inode it will requeue the inode to appropriate dirty list. When I_SYNC_QUEUED is not set, __mark_inode_dirty() can (and must) move inode to appropriate dirty list. Reported-by: Martijn Coenen Reviewed-by: Martijn Coenen Tested-by: Martijn Coenen Reviewed-by: Christoph Hellwig Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- include/linux/fs.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 5bd384dbdca5..4c82683e034a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2140,6 +2140,10 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) * * I_CREATING New object's inode in the middle of setting up. * + * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. + * Used to detect that mark_inode_dirty() should not move + * inode between dirty lists. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -2157,11 +2161,11 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) -#define __I_DIRTY_TIME_EXPIRED 12 -#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) +#define I_DIRTY_TIME_EXPIRED (1 << 12) #define I_WB_SWITCH (1 << 13) #define I_OVL_INUSE (1 << 14) #define I_CREATING (1 << 15) +#define I_SYNC_QUEUED (1 << 17) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) -- cgit v1.2.3 From 376810e5e9e148648c8b6e4aa8dc6306e9364f49 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 30 Jul 2020 19:47:14 +0900 Subject: fbmem: pull fbcon_update_vcs() out of fb_set_var() [ Upstream commit d88ca7e1a27eb2df056bbf37ddef62e1c73d37ea ] syzbot is reporting OOB read bug in vc_do_resize() [1] caused by memcpy() based on outdated old_{rows,row_size} values, for resize_screen() can recurse into vc_do_resize() which changes vc->vc_{cols,rows} that outdates old_{rows,row_size} values which were saved before calling resize_screen(). Daniel Vetter explained that resize_screen() should not recurse into fbcon_update_vcs() path due to FBINFO_MISC_USEREVENT being still set when calling resize_screen(). Instead of masking FBINFO_MISC_USEREVENT before calling fbcon_update_vcs(), we can remove FBINFO_MISC_USEREVENT by calling fbcon_update_vcs() only if fb_set_var() returned 0. This change assumes that it is harmless to call fbcon_update_vcs() when fb_set_var() returned 0 without reaching fb_notifier_call_chain(). [1] https://syzkaller.appspot.com/bug?id=c70c88cfd16dcf6e1d3c7f0ab8648b3144b5b25e Reported-and-tested-by: syzbot Suggested-by: Daniel Vetter Signed-off-by: Tetsuo Handa Reported-by: kernel test robot for missing #include Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/075b7e37-3278-cd7d-31ab-c5073cfa8e92@i-love.sakura.ne.jp Signed-off-by: Sasha Levin --- include/linux/fb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 756706b666a1..8221838fefd9 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -400,8 +400,6 @@ struct fb_tile_ops { #define FBINFO_HWACCEL_YPAN 0x2000 /* optional */ #define FBINFO_HWACCEL_YWRAP 0x4000 /* optional */ -#define FBINFO_MISC_USEREVENT 0x10000 /* event request - from userspace */ #define FBINFO_MISC_TILEBLITTING 0x20000 /* use tile blitting */ /* A driver may set this flag to indicate that it does want a set_par to be -- cgit v1.2.3 From 4bae1afed43212ee3ec64f2bdc9e39e800974e7e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Sep 2020 10:52:33 +0100 Subject: HID: core: Sanitize event code and type when mapping input commit 35556bed836f8dc07ac55f69c8d17dce3e7f0e25 upstream. When calling into hid_map_usage(), the passed event code is blindly stored as is, even if it doesn't fit in the associated bitmap. This event code can come from a variety of sources, including devices masquerading as input devices, only a bit more "programmable". Instead of taking the event code at face value, check that it actually fits the corresponding bitmap, and if it doesn't: - spit out a warning so that we know which device is acting up - NULLify the bitmap pointer so that we catch unexpected uses Code paths that can make use of untrusted inputs can now check that the mapping was indeed correct and bail out if not. Cc: stable@vger.kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Benjamin Tissoires Signed-off-by: Greg Kroah-Hartman --- include/linux/hid.h | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 875f71132b14..c7044a14200e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -959,34 +959,49 @@ static inline void hid_device_io_stop(struct hid_device *hid) { * @max: maximal valid usage->code to consider later (out parameter) * @type: input event type (EV_KEY, EV_REL, ...) * @c: code which corresponds to this usage and type + * + * The value pointed to by @bit will be set to NULL if either @type is + * an unhandled event type, or if @c is out of range for @type. This + * can be used as an error condition. */ static inline void hid_map_usage(struct hid_input *hidinput, struct hid_usage *usage, unsigned long **bit, int *max, - __u8 type, __u16 c) + __u8 type, unsigned int c) { struct input_dev *input = hidinput->input; - - usage->type = type; - usage->code = c; + unsigned long *bmap = NULL; + unsigned int limit = 0; switch (type) { case EV_ABS: - *bit = input->absbit; - *max = ABS_MAX; + bmap = input->absbit; + limit = ABS_MAX; break; case EV_REL: - *bit = input->relbit; - *max = REL_MAX; + bmap = input->relbit; + limit = REL_MAX; break; case EV_KEY: - *bit = input->keybit; - *max = KEY_MAX; + bmap = input->keybit; + limit = KEY_MAX; break; case EV_LED: - *bit = input->ledbit; - *max = LED_MAX; + bmap = input->ledbit; + limit = LED_MAX; break; } + + if (unlikely(c > limit || !bmap)) { + pr_warn_ratelimited("%s: Invalid code %d type %d\n", + input->name, c, type); + *bit = NULL; + return; + } + + usage->type = type; + usage->code = c; + *max = limit; + *bit = bmap; } /** @@ -1000,7 +1015,8 @@ static inline void hid_map_usage_clear(struct hid_input *hidinput, __u8 type, __u16 c) { hid_map_usage(hidinput, usage, bit, max, type, c); - clear_bit(c, *bit); + if (*bit) + clear_bit(usage->code, *bit); } /** -- cgit v1.2.3 From 9ff9f74ed4621e1aa28606fc625646cb0ddde545 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 23 Aug 2020 13:55:36 +0200 Subject: netfilter: nfnetlink: nfnetlink_unicast() reports EAGAIN instead of ENOBUFS [ Upstream commit ee921183557af39c1a0475f982d43b0fcac25e2e ] Frontend callback reports EAGAIN to nfnetlink to retry a command, this is used to signal that module autoloading is required. Unfortunately, nlmsg_unicast() reports EAGAIN in case the receiver socket buffer gets full, so it enters a busy-loop. This patch updates nfnetlink_unicast() to turn EAGAIN into ENOBUFS and to use nlmsg_unicast(). Remove the flags field in nfnetlink_unicast() since this is always MSG_DONTWAIT in the existing code which is exactly what nlmsg_unicast() passes to netlink_unicast() as parameter. Fixes: 96518518cc41 ("netfilter: add nftables") Reported-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/linux/netfilter/nfnetlink.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 851425c3178f..89016d08f6a2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -43,8 +43,7 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, - int flags); +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid); static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) { -- cgit v1.2.3 From 5629bb21ce202e8aa0151a28b1dc2e794753b708 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 4 Sep 2020 16:36:19 -0700 Subject: include/linux/log2.h: add missing () around n in roundup_pow_of_two() [ Upstream commit 428fc0aff4e59399ec719ffcc1f7a5d29a4ee476 ] Otherwise gcc generates warnings if the expression is complicated. Fixes: 312a0c170945 ("[PATCH] LOG2: Alter roundup_pow_of_two() so that it can use a ilog2() on a constant") Signed-off-by: Jason Gunthorpe Signed-off-by: Andrew Morton Link: https://lkml.kernel.org/r/0-v1-8a2697e3c003+41165-log_brackets_jgg@nvidia.com Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/log2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/log2.h b/include/linux/log2.h index 83a4a3ca3e8a..c619ec6eff4a 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -173,7 +173,7 @@ unsigned long __rounddown_pow_of_two(unsigned long n) #define roundup_pow_of_two(n) \ ( \ __builtin_constant_p(n) ? ( \ - (n == 1) ? 1 : \ + ((n) == 1) ? 1 : \ (1UL << (ilog2((n) - 1) + 1)) \ ) : \ __roundup_pow_of_two(n) \ -- cgit v1.2.3 From 84c041c12442d233c9b3c593cbe9eb8a77875578 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 17 Aug 2020 18:00:55 +0800 Subject: block: allow for_each_bvec to support zero len bvec commit 7e24969022cbd61ddc586f14824fc205661bb124 upstream. Block layer usually doesn't support or allow zero-length bvec. Since commit 1bdc76aea115 ("iov_iter: use bvec iterator to implement iterate_bvec()"), iterate_bvec() switches to bvec iterator. However, Al mentioned that 'Zero-length segments are not disallowed' in iov_iter. Fixes for_each_bvec() so that it can move on after seeing one zero length bvec. Fixes: 1bdc76aea115 ("iov_iter: use bvec iterator to implement iterate_bvec()") Reported-by: syzbot Signed-off-by: Ming Lei Tested-by: Tetsuo Handa Cc: Al Viro Cc: Matthew Wilcox Cc: Link: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg2262077.html Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- include/linux/bvec.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index a032f01e928c..d7a628e066ee 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -110,11 +110,18 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, return true; } +static inline void bvec_iter_skip_zero_bvec(struct bvec_iter *iter) +{ + iter->bi_bvec_done = 0; + iter->bi_idx++; +} + #define for_each_bvec(bvl, bio_vec, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ - bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) + (bvl).bv_len ? (void)bvec_iter_advance((bio_vec), &(iter), \ + (bvl).bv_len) : bvec_iter_skip_zero_bvec(&(iter))) /* for iterating one bio from start to end */ #define BVEC_ITER_ALL_INIT (struct bvec_iter) \ -- cgit v1.2.3 From f7880745e91b77633da7f4428edf0ea3e50c4f4b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Sep 2020 12:32:45 -0400 Subject: libata: implement ATA_HORKAGE_MAX_TRIM_128M and apply to Sandisks commit 3b5455636fe26ea21b4189d135a424a6da016418 upstream. All three generations of Sandisk SSDs lock up hard intermittently. Experiments showed that disabling NCQ lowered the failure rate significantly and the kernel has been disabling NCQ for some models of SD7's and 8's, which is obviously undesirable. Karthik worked with Sandisk to root cause the hard lockups to trim commands larger than 128M. This patch implements ATA_HORKAGE_MAX_TRIM_128M which limits max trim size to 128M and applies it to all three generations of Sandisk SSDs. Signed-off-by: Tejun Heo Cc: Karthik Shivaram Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index b9970f5bab67..e752368ea351 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -422,6 +422,7 @@ enum { ATA_HORKAGE_NO_DMA_LOG = (1 << 23), /* don't use DMA for log read */ ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ + ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 59d2b1e5cb058122d53ab140831d69740034f951 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Aug 2020 16:15:58 +0200 Subject: netfilter: conntrack: allow sctp hearbeat after connection re-use [ Upstream commit cc5453a5b7e90c39f713091a7ebc53c1f87d1700 ] If an sctp connection gets re-used, heartbeats are flagged as invalid because their vtag doesn't match. Handle this in a similar way as TCP conntrack when it suspects that the endpoints and conntrack are out-of-sync. When a HEARTBEAT request fails its vtag validation, flag this in the conntrack state and accept the packet. When a HEARTBEAT_ACK is received with an invalid vtag in the reverse direction after we allowed such a HEARTBEAT through, assume we are out-of-sync and re-set the vtag info. v2: remove left-over snippet from an older incarnation that moved new_state/old_state assignments, thats not needed so keep that as-is. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/linux/netfilter/nf_conntrack_sctp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h index 9a33f171aa82..625f491b95de 100644 --- a/include/linux/netfilter/nf_conntrack_sctp.h +++ b/include/linux/netfilter/nf_conntrack_sctp.h @@ -9,6 +9,8 @@ struct ip_ct_sctp { enum sctp_conntrack state; __be32 vtag[IP_CT_DIR_MAX]; + u8 last_dir; + u8 flags; }; #endif /* _NF_CONNTRACK_SCTP_H */ -- cgit v1.2.3 From 9f09e86200fdeef8b68696b5e1cb25ebd8ebfa5c Mon Sep 17 00:00:00 2001 From: Evan Nimmo Date: Wed, 9 Sep 2020 08:32:47 +1200 Subject: i2c: algo: pca: Reapply i2c bus settings after reset [ Upstream commit 0a355aeb24081e4538d4d424cd189f16c0bbd983 ] If something goes wrong (such as the SCL being stuck low) then we need to reset the PCA chip. The issue with this is that on reset we lose all config settings and the chip ends up in a disabled state which results in a lock up/high CPU usage. We need to re-apply any configuration that had previously been set and re-enable the chip. Signed-off-by: Evan Nimmo Reviewed-by: Chris Packham Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang Signed-off-by: Sasha Levin --- include/linux/i2c-algo-pca.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c-algo-pca.h b/include/linux/i2c-algo-pca.h index d03071732db4..7c522fdd9ea7 100644 --- a/include/linux/i2c-algo-pca.h +++ b/include/linux/i2c-algo-pca.h @@ -53,6 +53,20 @@ #define I2C_PCA_CON_SI 0x08 /* Serial Interrupt */ #define I2C_PCA_CON_CR 0x07 /* Clock Rate (MASK) */ +/** + * struct pca_i2c_bus_settings - The configured PCA i2c bus settings + * @mode: Configured i2c bus mode + * @tlow: Configured SCL LOW period + * @thi: Configured SCL HIGH period + * @clock_freq: The configured clock frequency + */ +struct pca_i2c_bus_settings { + int mode; + int tlow; + int thi; + int clock_freq; +}; + struct i2c_algo_pca_data { void *data; /* private low level data */ void (*write_byte) (void *data, int reg, int val); @@ -64,6 +78,7 @@ struct i2c_algo_pca_data { * For PCA9665, use the frequency you want here. */ unsigned int i2c_clock; unsigned int chip; + struct pca_i2c_bus_settings bus_settings; }; int i2c_pca_add_bus(struct i2c_adapter *); -- cgit v1.2.3 From d6712eefc77e58a8625dc3f4598369085c824d5c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 20 Sep 2020 08:54:42 -0700 Subject: dm: Call proper helper to determine dax support commit e2ec5128254518cae320d5dc631b71b94160f663 upstream. DM was calling generic_fsdax_supported() to determine whether a device referenced in the DM table supports DAX. However this is a helper for "leaf" device drivers so that they don't have to duplicate common generic checks. High level code should call dax_supported() helper which that calls into appropriate helper for the particular device. This problem manifested itself as kernel messages: dm-3: error: dax access failed (-95) when lvm2-testsuite run in cases where a DM device was stacked on top of another DM device. Fixes: 7bf7eac8d648 ("dax: Arrange for dax_supported check to span multiple devices") Cc: Tested-by: Adrian Huang Signed-off-by: Jan Kara Acked-by: Mike Snitzer Reported-by: kernel test robot Link: https://lore.kernel.org/r/160061715195.13131.5503173247632041975.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman --- include/linux/dax.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 9bd8528bd305..1c8e589cd74f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -128,6 +128,8 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev, return __generic_fsdax_supported(dax_dev, bdev, blocksize, start, sectors); } +bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, + int blocksize, sector_t start, sector_t len); static inline struct dax_device *fs_dax_get_by_host(const char *host) { @@ -165,6 +167,13 @@ static inline struct dax_device *fs_dax_get_by_host(const char *host) return NULL; } +static inline bool dax_supported(struct dax_device *dax_dev, + struct block_device *bdev, int blocksize, sector_t start, + sector_t len) +{ + return false; +} + static inline void fs_put_dax(struct dax_device *dax_dev) { } @@ -197,14 +206,23 @@ static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) } #endif +#if IS_ENABLED(CONFIG_DAX) int dax_read_lock(void); void dax_read_unlock(int id); +#else +static inline int dax_read_lock(void) +{ + return 0; +} + +static inline void dax_read_unlock(int id) +{ +} +#endif /* CONFIG_DAX */ bool dax_alive(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, - int blocksize, sector_t start, sector_t len); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, -- cgit v1.2.3 From ef6458fdbb5c1fd6cb7fa94f6d4754495ef5af2e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 21 Sep 2020 11:33:23 +0200 Subject: dax: Fix compilation for CONFIG_DAX && !CONFIG_FS_DAX commit 88b67edd7247466bc47f01e1dc539b0d0d4b931e upstream. dax_supported() is defined whenever CONFIG_DAX is enabled. So dummy implementation should be defined only in !CONFIG_DAX case, not in !CONFIG_FS_DAX case. Fixes: e2ec51282545 ("dm: Call proper helper to determine dax support") Cc: Reported-by: Geert Uytterhoeven Reported-by: Naresh Kamboju Reported-by: kernel test robot Signed-off-by: Jan Kara Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman --- include/linux/dax.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 1c8e589cd74f..72a7f03a59f4 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -56,6 +56,8 @@ static inline void set_dax_synchronous(struct dax_device *dax_dev) { __set_dax_synchronous(dax_dev); } +bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, + int blocksize, sector_t start, sector_t len); /* * Check if given mapping is supported by the file / underlying device. */ @@ -102,6 +104,12 @@ static inline bool dax_synchronous(struct dax_device *dax_dev) static inline void set_dax_synchronous(struct dax_device *dax_dev) { } +static inline bool dax_supported(struct dax_device *dax_dev, + struct block_device *bdev, int blocksize, sector_t start, + sector_t len) +{ + return false; +} static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, struct dax_device *dax_dev) { @@ -128,8 +136,6 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev, return __generic_fsdax_supported(dax_dev, bdev, blocksize, start, sectors); } -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, - int blocksize, sector_t start, sector_t len); static inline struct dax_device *fs_dax_get_by_host(const char *host) { @@ -167,13 +173,6 @@ static inline struct dax_device *fs_dax_get_by_host(const char *host) return NULL; } -static inline bool dax_supported(struct dax_device *dax_dev, - struct block_device *bdev, int blocksize, sector_t start, - sector_t len) -{ - return false; -} - static inline void fs_put_dax(struct dax_device *dax_dev) { } -- cgit v1.2.3 From 7f23aa7cabdd017e169ee18681a42182d0ae7259 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Sep 2020 01:27:40 -0700 Subject: net: add __must_check to skb_put_padto() [ Upstream commit 4a009cb04aeca0de60b73f37b102573354214b52 ] skb_put_padto() and __skb_put_padto() callers must check return values or risk use-after-free. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/skbuff.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 955e1370f033..a62889c8bed7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3185,8 +3185,9 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len) * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error if @free_on_error is true. */ -static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len, - bool free_on_error) +static inline int __must_check __skb_put_padto(struct sk_buff *skb, + unsigned int len, + bool free_on_error) { unsigned int size = skb->len; @@ -3209,7 +3210,7 @@ static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len, * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. */ -static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) +static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len) { return __skb_put_padto(skb, len, true); } -- cgit v1.2.3 From a21260928bd30bab09c7406b72e79415b2f34fcc Mon Sep 17 00:00:00 2001 From: Bradley Bolen Date: Sat, 16 Nov 2019 20:00:45 -0500 Subject: mmc: core: Fix size overflow for mmc partitions [ Upstream commit f3d7c2292d104519195fdb11192daec13229c219 ] With large eMMC cards, it is possible to create general purpose partitions that are bigger than 4GB. The size member of the mmc_part struct is only an unsigned int which overflows for gp partitions larger than 4GB. Change this to a u64 to handle the overflow. Signed-off-by: Bradley Bolen Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- include/linux/mmc/card.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index e459b38ef33c..cf3780a6ccc4 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -226,7 +226,7 @@ struct mmc_queue_req; * MMC Physical partitions */ struct mmc_part { - unsigned int size; /* partition size (in bytes) */ + u64 size; /* partition size (in bytes) */ unsigned int part_cfg; /* partition type */ char name[MAX_MMC_PART_NAME_LEN]; bool force_ro; /* to make boot parts RO by default */ -- cgit v1.2.3 From 80f7667422447a146f55b26e7b453354cccdc642 Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Thu, 21 Nov 2019 19:20:21 +0900 Subject: debugfs: Fix !DEBUG_FS debugfs_create_automount [ Upstream commit 4250b047039d324e0ff65267c8beb5bad5052a86 ] If DEBUG_FS=n, compile fails with the following error: kernel/trace/trace.c: In function 'tracing_init_dentry': kernel/trace/trace.c:8658:9: error: passing argument 3 of 'debugfs_create_automount' from incompatible pointer type [-Werror=incompatible-pointer-types] 8658 | trace_automount, NULL); | ^~~~~~~~~~~~~~~ | | | struct vfsmount * (*)(struct dentry *, void *) In file included from kernel/trace/trace.c:24: ./include/linux/debugfs.h:206:25: note: expected 'struct vfsmount * (*)(void *)' but argument is of type 'struct vfsmount * (*)(struct dentry *, void *)' 206 | struct vfsmount *(*f)(void *), | ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~ Signed-off-by: Kusanagi Kouichi Link: https://lore.kernel.org/r/20191121102021787.MLMY.25002.ppp.dion.ne.jp@dmta0003.auone-net.jp Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- include/linux/debugfs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 58424eb3b329..798f0b9b43ae 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -54,6 +54,8 @@ static const struct file_operations __fops = { \ .llseek = no_llseek, \ } +typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); + #if defined(CONFIG_DEBUG_FS) struct dentry *debugfs_lookup(const char *name, struct dentry *parent); @@ -75,7 +77,6 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent); struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *dest); -typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, debugfs_automount_t f, @@ -203,7 +204,7 @@ static inline struct dentry *debugfs_create_symlink(const char *name, static inline struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, - struct vfsmount *(*f)(void *), + debugfs_automount_t f, void *data) { return ERR_PTR(-ENODEV); -- cgit v1.2.3 From 06a90303633ffcd18bece6e41bb4a7787b9f34c2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Nov 2019 19:03:00 +0100 Subject: seqlock: Require WRITE_ONCE surrounding raw_seqcount_barrier [ Upstream commit bf07132f96d426bcbf2098227fb680915cf44498 ] This patch proposes to require marked atomic accesses surrounding raw_write_seqcount_barrier. We reason that otherwise there is no way to guarantee propagation nor atomicity of writes before/after the barrier [1]. For example, consider the compiler tears stores either before or after the barrier; in this case, readers may observe a partial value, and because readers are unaware that writes are going on (writes are not in a seq-writer critical section), will complete the seq-reader critical section while having observed some partial state. [1] https://lwn.net/Articles/793253/ This came up when designing and implementing KCSAN, because KCSAN would flag these accesses as data-races. After careful analysis, our reasoning as above led us to conclude that the best thing to do is to propose an amendment to the raw_seqcount_barrier usage. Signed-off-by: Marco Elver Acked-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Signed-off-by: Sasha Levin --- include/linux/seqlock.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index bcf4cf26b8c8..a42a29952889 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -243,6 +243,13 @@ static inline void raw_write_seqcount_end(seqcount_t *s) * usual consistency guarantee. It is one wmb cheaper, because we can * collapse the two back-to-back wmb()s. * + * Note that, writes surrounding the barrier should be declared atomic (e.g. + * via WRITE_ONCE): a) to ensure the writes become visible to other threads + * atomically, avoiding compiler optimizations; b) to document which writes are + * meant to propagate to the reader critical section. This is necessary because + * neither writes before and after the barrier are enclosed in a seq-writer + * critical section that would ensure readers are aware of ongoing writes. + * * seqcount_t seq; * bool X = true, Y = false; * @@ -262,11 +269,11 @@ static inline void raw_write_seqcount_end(seqcount_t *s) * * void write(void) * { - * Y = true; + * WRITE_ONCE(Y, true); * * raw_write_seqcount_barrier(seq); * - * X = false; + * WRITE_ONCE(X, false); * } */ static inline void raw_write_seqcount_barrier(seqcount_t *s) -- cgit v1.2.3 From 980040c7ae697bbb15735fe6dad6d63dd459a135 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 4 Feb 2020 13:40:29 -0500 Subject: skbuff: fix a data race in skb_queue_len() [ Upstream commit 86b18aaa2b5b5bb48e609cd591b3d2d0fdbe0442 ] sk_buff.qlen can be accessed concurrently as noticed by KCSAN, BUG: KCSAN: data-race in __skb_try_recv_from_queue / unix_dgram_sendmsg read to 0xffff8a1b1d8a81c0 of 4 bytes by task 5371 on cpu 96: unix_dgram_sendmsg+0x9a9/0xb70 include/linux/skbuff.h:1821 net/unix/af_unix.c:1761 ____sys_sendmsg+0x33e/0x370 ___sys_sendmsg+0xa6/0xf0 __sys_sendmsg+0x69/0xf0 __x64_sys_sendmsg+0x51/0x70 do_syscall_64+0x91/0xb47 entry_SYSCALL_64_after_hwframe+0x49/0xbe write to 0xffff8a1b1d8a81c0 of 4 bytes by task 1 on cpu 99: __skb_try_recv_from_queue+0x327/0x410 include/linux/skbuff.h:2029 __skb_try_recv_datagram+0xbe/0x220 unix_dgram_recvmsg+0xee/0x850 ____sys_recvmsg+0x1fb/0x210 ___sys_recvmsg+0xa2/0xf0 __sys_recvmsg+0x66/0xf0 __x64_sys_recvmsg+0x51/0x70 do_syscall_64+0x91/0xb47 entry_SYSCALL_64_after_hwframe+0x49/0xbe Since only the read is operating as lockless, it could introduce a logic bug in unix_recvq_full() due to the load tearing. Fix it by adding a lockless variant of skb_queue_len() and unix_recvq_full() where READ_ONCE() is on the read while WRITE_ONCE() is on the write similar to the commit d7d16a89350a ("net: add skb_queue_empty_lockless()"). Signed-off-by: Qian Cai Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/skbuff.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a62889c8bed7..68139cc2f3ca 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1816,6 +1816,18 @@ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) return list_->qlen; } +/** + * skb_queue_len_lockless - get queue length + * @list_: list to measure + * + * Return the length of an &sk_buff queue. + * This variant can be used in lockless contexts. + */ +static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_) +{ + return READ_ONCE(list_->qlen); +} + /** * __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head * @list: queue to initialize @@ -2021,7 +2033,7 @@ static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; - list->qlen--; + WRITE_ONCE(list->qlen, list->qlen - 1); next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; -- cgit v1.2.3 From b796d94921ce2adffcb30d25381bcb88a9d77f60 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:03:36 -0500 Subject: exec: Add exec_update_mutex to replace cred_guard_mutex [ Upstream commit eea9673250db4e854e9998ef9da6d4584857f0ea ] The cred_guard_mutex is problematic as it is held over possibly indefinite waits for userspace. The possible indefinite waits for userspace that I have identified are: The cred_guard_mutex is held in PTRACE_EVENT_EXIT waiting for the tracer. The cred_guard_mutex is held over "put_user(0, tsk->clear_child_tid)" in exit_mm(). The cred_guard_mutex is held over "get_user(futex_offset, ...") in exit_robust_list. The cred_guard_mutex held over copy_strings. The functions get_user and put_user can trigger a page fault which can potentially wait indefinitely in the case of userfaultfd or if userspace implements part of the page fault path. In any of those cases the userspace process that the kernel is waiting for might make a different system call that winds up taking the cred_guard_mutex and result in deadlock. Holding a mutex over any of those possibly indefinite waits for userspace does not appear necessary. Add exec_update_mutex that will just cover updating the process during exec where the permissions and the objects pointed to by the task struct may be out of sync. The plan is to switch the users of cred_guard_mutex to exec_update_mutex one by one. This lets us move forward while still being careful and not introducing any regressions. Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/ Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/ Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/ Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/ Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/ Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.") Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2") Reviewed-by: Kirill Tkhai Signed-off-by: "Eric W. Biederman" Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman Signed-off-by: Sasha Levin --- include/linux/binfmts.h | 8 +++++++- include/linux/sched/signal.h | 9 ++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index b40fc633f3be..a345d9fed3d8 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -44,7 +44,13 @@ struct linux_binprm { * exec has happened. Used to sanitize execution environment * and to set AT_SECURE auxv for glibc. */ - secureexec:1; + secureexec:1, + /* + * Set by flush_old_exec, when exec_mmap has been called. + * This is past the point of no return, when the + * exec_update_mutex has been taken. + */ + called_exec_mmap:1; #ifdef __alpha__ unsigned int taso:1; #endif diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 88050259c466..a29df79540ce 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -224,7 +224,14 @@ struct signal_struct { struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations - * (notably. ptrace) */ + * (notably. ptrace) + * Deprecated do not use in new code. + * Use exec_update_mutex instead. + */ + struct mutex exec_update_mutex; /* Held while task_struct is being + * updated during exec, and may have + * inconsistent permissions. + */ } __randomize_layout; /* -- cgit v1.2.3 From 535943c46dfcf32018d94da0b8c9ba470a9a8e71 Mon Sep 17 00:00:00 2001 From: Mikel Rychliski Date: Wed, 18 Mar 2020 22:16:23 -0400 Subject: PCI: Use ioremap(), not phys_to_virt() for platform ROM [ Upstream commit 72e0ef0e5f067fd991f702f0b2635d911d0cf208 ] On some EFI systems, the video BIOS is provided by the EFI firmware. The boot stub code stores the physical address of the ROM image in pdev->rom. Currently we attempt to access this pointer using phys_to_virt(), which doesn't work with CONFIG_HIGHMEM. On these systems, attempting to load the radeon module on a x86_32 kernel can result in the following: BUG: unable to handle page fault for address: 3e8ed03c #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page *pde = 00000000 Oops: 0000 [#1] PREEMPT SMP CPU: 0 PID: 317 Comm: systemd-udevd Not tainted 5.6.0-rc3-next-20200228 #2 Hardware name: Apple Computer, Inc. MacPro1,1/Mac-F4208DC8, BIOS MP11.88Z.005C.B08.0707021221 07/02/07 EIP: radeon_get_bios+0x5ed/0xe50 [radeon] Code: 00 00 84 c0 0f 85 12 fd ff ff c7 87 64 01 00 00 00 00 00 00 8b 47 08 8b 55 b0 e8 1e 83 e1 d6 85 c0 74 1a 8b 55 c0 85 d2 74 13 <80> 38 55 75 0e 80 78 01 aa 0f 84 a4 03 00 00 8d 74 26 00 68 dc 06 EAX: 3e8ed03c EBX: 00000000 ECX: 3e8ed03c EDX: 00010000 ESI: 00040000 EDI: eec04000 EBP: eef3fc60 ESP: eef3fbe0 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010206 CR0: 80050033 CR2: 3e8ed03c CR3: 2ec77000 CR4: 000006d0 Call Trace: r520_init+0x26/0x240 [radeon] radeon_device_init+0x533/0xa50 [radeon] radeon_driver_load_kms+0x80/0x220 [radeon] drm_dev_register+0xa7/0x180 [drm] radeon_pci_probe+0x10f/0x1a0 [radeon] pci_device_probe+0xd4/0x140 Fix the issue by updating all drivers which can access a platform provided ROM. Instead of calling the helper function pci_platform_rom() which uses phys_to_virt(), call ioremap() directly on the pdev->rom. radeon_read_platform_bios() previously directly accessed an __iomem pointer. Avoid this by calling memcpy_fromio() instead of kmemdup(). pci_platform_rom() now has no remaining callers, so remove it. Link: https://lore.kernel.org/r/20200319021623.5426-1-mikel@mikelr.com Signed-off-by: Mikel Rychliski Signed-off-by: Bjorn Helgaas Acked-by: Alex Deucher Signed-off-by: Sasha Levin --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index f39f22f9ee47..e92bd9b32f36 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1216,7 +1216,6 @@ int pci_enable_rom(struct pci_dev *pdev); void pci_disable_rom(struct pci_dev *pdev); void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size); void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom); -void __iomem __must_check *pci_platform_rom(struct pci_dev *pdev, size_t *size); /* Power management related routines */ int pci_save_state(struct pci_dev *dev); -- cgit v1.2.3 From 24c56b6fe0ba0e745f62a4c5b1037195012147cd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Apr 2020 13:04:49 -0400 Subject: NFS: Fix races nfs_page_group_destroy() vs nfs_destroy_unlinked_subrequests() [ Upstream commit 08ca8b21f760c0ed5034a5c122092eec22ccf8f4 ] When a subrequest is being detached from the subgroup, we want to ensure that it is not holding the group lock, or in the process of waiting for the group lock. Fixes: 5b2b5187fa85 ("NFS: Fix nfs_page_group_destroy() and nfs_lock_and_join_requests() race cases") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- include/linux/nfs_page.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 0bbd587fac6a..7e9419d74b86 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -142,6 +142,8 @@ extern void nfs_unlock_and_release_request(struct nfs_page *); extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); +extern int nfs_page_set_headlock(struct nfs_page *req); +extern void nfs_page_clear_headlock(struct nfs_page *req); extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); /* -- cgit v1.2.3 From 55c3e7fac92eb82c72b2713a565be09ac235a8a5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 20 Mar 2020 17:32:41 -0400 Subject: svcrdma: Fix backchannel return code [ Upstream commit ea740bd5f58e2912e74f401fd01a9d6aa985ca05 ] Way back when I was writing the RPC/RDMA server-side backchannel code, I misread the TCP backchannel reply handler logic. When svc_tcp_recvfrom() successfully receives a backchannel reply, it does not return -EAGAIN. It sets XPT_DATA and returns zero. Update svc_rdma_recvfrom() to return zero. Here, XPT_DATA doesn't need to be set again: it is set whenever a new message is received, behind a spin lock in a single threaded context. Also, if handling the cb reply is not successful, the message is simply dropped. There's no special message framing to deal with as there is in the TCP case. Now that the handle_bc_reply() return value is ignored, I've removed the dprintk call sites in the error exit of handle_bc_reply() in favor of trace points in other areas that already report the error cases. Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- include/linux/sunrpc/svc_rdma.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 26f282e5e082..77589ed787f5 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -154,9 +154,8 @@ struct svc_rdma_send_ctxt { }; /* svc_rdma_backchannel.c */ -extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, - __be32 *rdma_resp, - struct xdr_buf *rcvbuf); +extern void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *rctxt); /* svc_rdma_recvfrom.c */ extern void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma); -- cgit v1.2.3 From 82dfe9c32dea40fd1b823fbec515b27834327c9e Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Wed, 9 Sep 2020 20:43:08 +0300 Subject: net: qed: Disable aRFS for NPAR and 100G [ Upstream commit 2d2fe8433796603091ac8ea235b9165ac5a85f9a ] In CMT and NPAR the PF is unknown when the GFS block processes the packet. Therefore cannot use searcher as it has a per PF database, and thus ARFS must be disabled. Fixes: d51e4af5c209 ("qed: aRFS infrastructure support") Signed-off-by: Manish Chopra Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/qed/qed_if.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index b5db1ee96d78..65a7355ed07b 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -637,6 +637,7 @@ struct qed_dev_info { #define QED_MFW_VERSION_3_OFFSET 24 u32 flash_size; + bool b_arfs_capable; bool b_inter_pf_switch; bool tx_switching; bool rdma_supported; -- cgit v1.2.3 From c4ab0a8370932db7d9d85fd8fea059449027307e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 17:55:05 +0900 Subject: kprobes: tracing/kprobes: Fix to kill kprobes on initmem after boot commit 82d083ab60c3693201c6f5c7a5f23a6ed422098d upstream. Since kprobe_event= cmdline option allows user to put kprobes on the functions in initmem, kprobe has to make such probes gone after boot. Currently the probes on the init functions in modules will be handled by module callback, but the kernel init text isn't handled. Without this, kprobes may access non-exist text area to disable or remove it. Link: https://lkml.kernel.org/r/159972810544.428528.1839307531600646955.stgit@devnote2 Fixes: 970988e19eb0 ("tracing/kprobe: Add kprobe_event= boot parameter") Cc: Jonathan Corbet Cc: Shuah Khan Cc: Randy Dunlap Cc: Ingo Molnar Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- include/linux/kprobes.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 645fd401c856..a60488867dd0 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -369,6 +369,8 @@ void unregister_kretprobes(struct kretprobe **rps, int num); void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); +void kprobe_free_init_mem(void); + int disable_kprobe(struct kprobe *kp); int enable_kprobe(struct kprobe *kp); @@ -426,6 +428,9 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num) static inline void kprobe_flush_task(struct task_struct *tk) { } +static inline void kprobe_free_init_mem(void) +{ +} static inline int disable_kprobe(struct kprobe *kp) { return -ENOSYS; -- cgit v1.2.3 From b6e503c0a0ffd67de01fb5b3f01883c740745460 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 31 Oct 2019 10:59:44 +0100 Subject: ata: define AC_ERR_OK commit 25937580a5065d6fbd92d9c8ebd47145ad80052e upstream. Since we will return enum ata_completion_errors from qc_prep in the next patch, let's define AC_ERR_OK to mark the OK status. Signed-off-by: Jiri Slaby Cc: Jens Axboe Cc: linux-ide@vger.kernel.org Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index e752368ea351..6346272432ff 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -486,6 +486,7 @@ enum hsm_task_states { }; enum ata_completion_errors { + AC_ERR_OK = 0, /* no error */ AC_ERR_DEV = (1 << 0), /* device reported error */ AC_ERR_HSM = (1 << 1), /* host state machine violation */ AC_ERR_TIMEOUT = (1 << 2), /* timeout */ -- cgit v1.2.3 From e11c83520cd04b813cd1748ee2a8f2c620e5f7e3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 31 Oct 2019 10:59:45 +0100 Subject: ata: make qc_prep return ata_completion_errors commit 95364f36701e62dd50eee91e1303187fd1a9f567 upstream. In case a driver wants to return an error from qc_prep, return enum ata_completion_errors. sata_mv is one of those drivers -- see the next patch. Other drivers return the newly defined AC_ERR_OK. [v2] use enum ata_completion_errors and AC_ERR_OK. Signed-off-by: Jiri Slaby Cc: Jens Axboe Cc: linux-ide@vger.kernel.org Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- include/linux/libata.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 6346272432ff..3c3d8d6b1618 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -896,9 +896,9 @@ struct ata_port_operations { /* * Command execution */ - int (*qc_defer)(struct ata_queued_cmd *qc); - int (*check_atapi_dma)(struct ata_queued_cmd *qc); - void (*qc_prep)(struct ata_queued_cmd *qc); + int (*qc_defer)(struct ata_queued_cmd *qc); + int (*check_atapi_dma)(struct ata_queued_cmd *qc); + enum ata_completion_errors (*qc_prep)(struct ata_queued_cmd *qc); unsigned int (*qc_issue)(struct ata_queued_cmd *qc); bool (*qc_fill_rtf)(struct ata_queued_cmd *qc); @@ -1166,7 +1166,7 @@ extern int ata_xfer_mode2shift(unsigned long xfer_mode); extern const char *ata_mode_string(unsigned long xfer_mask); extern unsigned long ata_id_xfermask(const u16 *id); extern int ata_std_qc_defer(struct ata_queued_cmd *qc); -extern void ata_noop_qc_prep(struct ata_queued_cmd *qc); +extern enum ata_completion_errors ata_noop_qc_prep(struct ata_queued_cmd *qc); extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg, unsigned int n_elem); extern unsigned int ata_dev_classify(const struct ata_taskfile *tf); @@ -1900,9 +1900,9 @@ extern const struct ata_port_operations ata_bmdma_port_ops; .sg_tablesize = LIBATA_MAX_PRD, \ .dma_boundary = ATA_DMA_BOUNDARY -extern void ata_bmdma_qc_prep(struct ata_queued_cmd *qc); +extern enum ata_completion_errors ata_bmdma_qc_prep(struct ata_queued_cmd *qc); extern unsigned int ata_bmdma_qc_issue(struct ata_queued_cmd *qc); -extern void ata_bmdma_dumb_qc_prep(struct ata_queued_cmd *qc); +extern enum ata_completion_errors ata_bmdma_dumb_qc_prep(struct ata_queued_cmd *qc); extern unsigned int ata_bmdma_port_intr(struct ata_port *ap, struct ata_queued_cmd *qc); extern irqreturn_t ata_bmdma_interrupt(int irq, void *dev_instance); -- cgit v1.2.3 From 215459ff36664a8f15ae3ea0efe4d3e76f354657 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 14 Nov 2019 10:57:40 +0100 Subject: vsock/virtio: add transport parameter to the virtio_transport_reset_no_sock() [ Upstream commit 4c7246dc45e2706770d5233f7ce1597a07e069ba ] We are going to add 'struct vsock_sock *' parameter to virtio_transport_get_ops(). In some cases, like in the virtio_transport_reset_no_sock(), we don't have any socket assigned to the packet received, so we can't use the virtio_transport_get_ops(). In order to allow virtio_transport_reset_no_sock() to use the '.send_pkt' callback from the 'vhost_transport' or 'virtio_transport', we add the 'struct virtio_transport *' to it and to its caller: virtio_transport_recv_pkt(). We moved the 'vhost_transport' and 'virtio_transport' definition, to pass their address to the virtio_transport_recv_pkt(). Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/virtio_vsock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 07875ccc7bb5..b139f76060a6 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -150,7 +150,8 @@ virtio_transport_dgram_enqueue(struct vsock_sock *vsk, void virtio_transport_destruct(struct vsock_sock *vsk); -void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt); +void virtio_transport_recv_pkt(struct virtio_transport *t, + struct virtio_vsock_pkt *pkt); void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt); u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted); -- cgit v1.2.3 From de21eb7f8cb0045aedcfe8083fbebedf1209a3ae Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 25 Sep 2020 16:49:51 +0800 Subject: memstick: Skip allocating card when removing host commit 62c59a8786e6bb75569cee91dab66e9da3ff4b68 upstream. After commit 6827ca573c03 ("memstick: rtsx_usb_ms: Support runtime power management"), removing module rtsx_usb_ms will be stuck. The deadlock is caused by powering on and powering off at the same time, the former one is when memstick_check() is flushed, and the later is called by memstick_remove_host(). Soe let's skip allocating card to prevent this issue. Fixes: 6827ca573c03 ("memstick: rtsx_usb_ms: Support runtime power management") Signed-off-by: Kai-Heng Feng Link: https://lore.kernel.org/r/20200925084952.13220-1-kai.heng.feng@canonical.com Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- include/linux/memstick.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memstick.h b/include/linux/memstick.h index 216a713bef7f..1198ea3d4012 100644 --- a/include/linux/memstick.h +++ b/include/linux/memstick.h @@ -281,6 +281,7 @@ struct memstick_host { struct memstick_dev *card; unsigned int retries; + bool removing; /* Notify the host that some requests are pending. */ void (*request)(struct memstick_host *host); -- cgit v1.2.3 From 2334b2d5a2bd1a4e9877623ef532b114d83bdf94 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Mar 2020 16:07:04 +0300 Subject: block/diskstats: more accurate approximation of io_ticks for slow disks commit 2b8bd423614c595540eaadcfbc702afe8e155e50 upstream. Currently io_ticks is approximated by adding one at each start and end of requests if jiffies counter has changed. This works perfectly for requests shorter than a jiffy or if one of requests starts/ends at each jiffy. If disk executes just one request at a time and they are longer than two jiffies then only first and last jiffies will be accounted. Fix is simple: at the end of request add up into io_ticks jiffies passed since last update rather than just one jiffy. Example: common HDD executes random read 4k requests around 12ms. fio --name=test --filename=/dev/sdb --rw=randread --direct=1 --runtime=30 & iostat -x 10 sdb Note changes of iostat's "%util" 8,43% -> 99,99% before/after patch: Before: Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0,00 0,00 82,60 0,00 330,40 0,00 8,00 0,96 12,09 12,09 0,00 1,02 8,43 After: Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0,00 0,00 82,50 0,00 330,00 0,00 8,00 1,00 12,10 12,10 0,00 12,12 99,99 Now io_ticks does not loose time between start and end of requests, but for queue-depth > 1 some I/O time between adjacent starts might be lost. For load estimation "%util" is not as useful as average queue length, but it clearly shows how often disk queue is completely empty. Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting") Signed-off-by: Konstantin Khlebnikov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe From: "Banerjee, Debabrata" Signed-off-by: Greg Kroah-Hartman --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 62a2ec9f17df..c1bf9956256f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -419,7 +419,7 @@ static inline void free_part_info(struct hd_struct *part) kfree(part->info); } -void update_io_ticks(struct hd_struct *part, unsigned long now); +void update_io_ticks(struct hd_struct *part, unsigned long now, bool end); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk, -- cgit v1.2.3 From 42b7153dd6a64b61141771f4ae0b1b1131321720 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 25 Sep 2020 21:19:28 -0700 Subject: mm: replace memmap_context by meminit_context commit c1d0da83358a2316d9be7f229f26126dbaa07468 upstream. Patch series "mm: fix memory to node bad links in sysfs", v3. Sometimes, firmware may expose interleaved memory layout like this: Early memory node ranges node 1: [mem 0x0000000000000000-0x000000011fffffff] node 2: [mem 0x0000000120000000-0x000000014fffffff] node 1: [mem 0x0000000150000000-0x00000001ffffffff] node 0: [mem 0x0000000200000000-0x000000048fffffff] node 2: [mem 0x0000000490000000-0x00000007ffffffff] In that case, we can see memory blocks assigned to multiple nodes in sysfs: $ ls -l /sys/devices/system/memory/memory21 total 0 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node1 -> ../../node/node1 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node2 -> ../../node/node2 -rw-r--r-- 1 root root 65536 Aug 24 05:27 online -r--r--r-- 1 root root 65536 Aug 24 05:27 phys_device -r--r--r-- 1 root root 65536 Aug 24 05:27 phys_index drwxr-xr-x 2 root root 0 Aug 24 05:27 power -r--r--r-- 1 root root 65536 Aug 24 05:27 removable -rw-r--r-- 1 root root 65536 Aug 24 05:27 state lrwxrwxrwx 1 root root 0 Aug 24 05:25 subsystem -> ../../../../bus/memory -rw-r--r-- 1 root root 65536 Aug 24 05:25 uevent -r--r--r-- 1 root root 65536 Aug 24 05:27 valid_zones The same applies in the node's directory with a memory21 link in both the node1 and node2's directory. This is wrong but doesn't prevent the system to run. However when later, one of these memory blocks is hot-unplugged and then hot-plugged, the system is detecting an inconsistency in the sysfs layout and a BUG_ON() is raised: kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084! LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4 CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25 Call Trace: add_memory_resource+0x23c/0x340 (unreliable) __add_memory+0x5c/0xf0 dlpar_add_lmb+0x1b4/0x500 dlpar_memory+0x1f8/0xb80 handle_dlpar_errorlog+0xc0/0x190 dlpar_store+0x198/0x4a0 kobj_attr_store+0x30/0x50 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1b0/0x290 vfs_write+0xe8/0x290 ksys_write+0xdc/0x130 system_call_exception+0x160/0x270 system_call_common+0xf0/0x27c This has been seen on PowerPC LPAR. The root cause of this issue is that when node's memory is registered, the range used can overlap another node's range, thus the memory block is registered to multiple nodes in sysfs. There are two issues here: (a) The sysfs memory and node's layouts are broken due to these multiple links (b) The link errors in link_mem_sections() should not lead to a system panic. To address (a) register_mem_sect_under_node should not rely on the system state to detect whether the link operation is triggered by a hot plug operation or not. This is addressed by the patches 1 and 2 of this series. Issue (b) will be addressed separately. This patch (of 2): The memmap_context enum is used to detect whether a memory operation is due to a hot-add operation or happening at boot time. Make it general to the hotplug operation and rename it as meminit_context. There is no functional change introduced by this patch Suggested-by: David Hildenbrand Signed-off-by: Laurent Dufour Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J . Wysocki" Cc: Nathan Lynch Cc: Scott Cheloha Cc: Tony Luck Cc: Fenghua Yu Cc: Link: https://lkml.kernel.org/r/20200915094143.79181-1-ldufour@linux.ibm.com Link: https://lkml.kernel.org/r/20200915132624.9723-1-ldufour@linux.ibm.com Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mm.h | 2 +- include/linux/mmzone.h | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3285dae06c03..34119f393a80 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2208,7 +2208,7 @@ static inline void zero_resv_unavail(void) {} extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum memmap_context, struct vmem_altmap *); + enum meminit_context, struct vmem_altmap *); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 85804ba62215..a90aba3d6afb 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -822,10 +822,15 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx); -enum memmap_context { - MEMMAP_EARLY, - MEMMAP_HOTPLUG, +/* + * Memory initialization context, use to differentiate memory added by + * the platform statically or via memory hotplug interface. + */ +enum meminit_context { + MEMINIT_EARLY, + MEMINIT_HOTPLUG, }; + extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); -- cgit v1.2.3 From 9626c1a63703c08e6c17adceb0d4cd468fc7ce14 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 25 Sep 2020 21:19:31 -0700 Subject: mm: don't rely on system state to detect hot-plug operations commit f85086f95fa36194eb0db5cd5c12e56801b98523 upstream. In register_mem_sect_under_node() the system_state's value is checked to detect whether the call is made during boot time or during an hot-plug operation. Unfortunately, that check against SYSTEM_BOOTING is wrong because regular memory is registered at SYSTEM_SCHEDULING state. In addition, memory hot-plug operation can be triggered at this system state by the ACPI [1]. So checking against the system state is not enough. The consequence is that on system with interleaved node's ranges like this: Early memory node ranges node 1: [mem 0x0000000000000000-0x000000011fffffff] node 2: [mem 0x0000000120000000-0x000000014fffffff] node 1: [mem 0x0000000150000000-0x00000001ffffffff] node 0: [mem 0x0000000200000000-0x000000048fffffff] node 2: [mem 0x0000000490000000-0x00000007ffffffff] This can be seen on PowerPC LPAR after multiple memory hot-plug and hot-unplug operations are done. At the next reboot the node's memory ranges can be interleaved and since the call to link_mem_sections() is made in topology_init() while the system is in the SYSTEM_SCHEDULING state, the node's id is not checked, and the sections registered to multiple nodes: $ ls -l /sys/devices/system/memory/memory21/node* total 0 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node1 -> ../../node/node1 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node2 -> ../../node/node2 In that case, the system is able to boot but if later one of theses memory blocks is hot-unplugged and then hot-plugged, the sysfs inconsistency is detected and this is triggering a BUG_ON(): kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084! Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4 CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25 Call Trace: add_memory_resource+0x23c/0x340 (unreliable) __add_memory+0x5c/0xf0 dlpar_add_lmb+0x1b4/0x500 dlpar_memory+0x1f8/0xb80 handle_dlpar_errorlog+0xc0/0x190 dlpar_store+0x198/0x4a0 kobj_attr_store+0x30/0x50 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1b0/0x290 vfs_write+0xe8/0x290 ksys_write+0xdc/0x130 system_call_exception+0x160/0x270 system_call_common+0xf0/0x27c This patch addresses the root cause by not relying on the system_state value to detect whether the call is due to a hot-plug operation. An extra parameter is added to link_mem_sections() detailing whether the operation is due to a hot-plug operation. [1] According to Oscar Salvador, using this qemu command line, ACPI memory hotplug operations are raised at SYSTEM_SCHEDULING state: $QEMU -enable-kvm -machine pc -smp 4,sockets=4,cores=1,threads=1 -cpu host -monitor pty \ -m size=$MEM,slots=255,maxmem=4294967296k \ -numa node,nodeid=0,cpus=0-3,mem=512 -numa node,nodeid=1,mem=512 \ -object memory-backend-ram,id=memdimm0,size=134217728 -device pc-dimm,node=0,memdev=memdimm0,id=dimm0,slot=0 \ -object memory-backend-ram,id=memdimm1,size=134217728 -device pc-dimm,node=0,memdev=memdimm1,id=dimm1,slot=1 \ -object memory-backend-ram,id=memdimm2,size=134217728 -device pc-dimm,node=0,memdev=memdimm2,id=dimm2,slot=2 \ -object memory-backend-ram,id=memdimm3,size=134217728 -device pc-dimm,node=0,memdev=memdimm3,id=dimm3,slot=3 \ -object memory-backend-ram,id=memdimm4,size=134217728 -device pc-dimm,node=1,memdev=memdimm4,id=dimm4,slot=4 \ -object memory-backend-ram,id=memdimm5,size=134217728 -device pc-dimm,node=1,memdev=memdimm5,id=dimm5,slot=5 \ -object memory-backend-ram,id=memdimm6,size=134217728 -device pc-dimm,node=1,memdev=memdimm6,id=dimm6,slot=6 \ Fixes: 4fbce633910e ("mm/memory_hotplug.c: make register_mem_sect_under_node() a callback of walk_memory_range()") Signed-off-by: Laurent Dufour Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Fenghua Yu Cc: Nathan Lynch Cc: Scott Cheloha Cc: Tony Luck Cc: Link: https://lkml.kernel.org/r/20200915094143.79181-3-ldufour@linux.ibm.com Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/node.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/node.h b/include/linux/node.h index 4866f32a02d8..014ba3ab2efd 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -99,11 +99,13 @@ extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) -extern int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn); +int link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context); #else static inline int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn) + unsigned long end_pfn, + enum meminit_context context) { return 0; } @@ -128,7 +130,8 @@ static inline int register_one_node(int nid) if (error) return error; /* link memory sections under this node */ - error = link_mem_sections(nid, start_pfn, end_pfn); + error = link_mem_sections(nid, start_pfn, end_pfn, + MEMINIT_EARLY); } return error; -- cgit v1.2.3