From af3ff8045bbf3e32f1a448542e73abb4c8ceb6f1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 28 Nov 2017 18:01:38 -0800 Subject: crypto: hmac - require that the underlying hash algorithm is unkeyed Because the HMAC template didn't check that its underlying hash algorithm is unkeyed, trying to use "hmac(hmac(sha3-512-generic))" through AF_ALG or through KEYCTL_DH_COMPUTE resulted in the inner HMAC being used without having been keyed, resulting in sha3_update() being called without sha3_init(), causing a stack buffer overflow. This is a very old bug, but it seems to have only started causing real problems when SHA-3 support was added (requires CONFIG_CRYPTO_SHA3) because the innermost hash's state is ->import()ed from a zeroed buffer, and it just so happens that other hash algorithms are fine with that, but SHA-3 is not. However, there could be arch or hardware-dependent hash algorithms also affected; I couldn't test everything. Fix the bug by introducing a function crypto_shash_alg_has_setkey() which tests whether a shash algorithm is keyed. Then update the HMAC template to require that its underlying hash algorithm is unkeyed. Here is a reproducer: #include #include int main() { int algfd; struct sockaddr_alg addr = { .salg_type = "hash", .salg_name = "hmac(hmac(sha3-512-generic))", }; char key[4096] = { 0 }; algfd = socket(AF_ALG, SOCK_SEQPACKET, 0); bind(algfd, (const struct sockaddr *)&addr, sizeof(addr)); setsockopt(algfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key)); } Here was the KASAN report from syzbot: BUG: KASAN: stack-out-of-bounds in memcpy include/linux/string.h:341 [inline] BUG: KASAN: stack-out-of-bounds in sha3_update+0xdf/0x2e0 crypto/sha3_generic.c:161 Write of size 4096 at addr ffff8801cca07c40 by task syzkaller076574/3044 CPU: 1 PID: 3044 Comm: syzkaller076574 Not tainted 4.14.0-mm1+ #25 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x137/0x190 mm/kasan/kasan.c:267 memcpy+0x37/0x50 mm/kasan/kasan.c:303 memcpy include/linux/string.h:341 [inline] sha3_update+0xdf/0x2e0 crypto/sha3_generic.c:161 crypto_shash_update+0xcb/0x220 crypto/shash.c:109 shash_finup_unaligned+0x2a/0x60 crypto/shash.c:151 crypto_shash_finup+0xc4/0x120 crypto/shash.c:165 hmac_finup+0x182/0x330 crypto/hmac.c:152 crypto_shash_finup+0xc4/0x120 crypto/shash.c:165 shash_digest_unaligned+0x9e/0xd0 crypto/shash.c:172 crypto_shash_digest+0xc4/0x120 crypto/shash.c:186 hmac_setkey+0x36a/0x690 crypto/hmac.c:66 crypto_shash_setkey+0xad/0x190 crypto/shash.c:64 shash_async_setkey+0x47/0x60 crypto/shash.c:207 crypto_ahash_setkey+0xaf/0x180 crypto/ahash.c:200 hash_setkey+0x40/0x90 crypto/algif_hash.c:446 alg_setkey crypto/af_alg.c:221 [inline] alg_setsockopt+0x2a1/0x350 crypto/af_alg.c:254 SYSC_setsockopt net/socket.c:1851 [inline] SyS_setsockopt+0x189/0x360 net/socket.c:1830 entry_SYSCALL_64_fastpath+0x1f/0x96 Reported-by: syzbot Cc: Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- include/crypto/internal/hash.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index f0b44c16e88f..c2bae8da642c 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -82,6 +82,14 @@ int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst); void ahash_free_instance(struct crypto_instance *inst); +int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int keylen); + +static inline bool crypto_shash_alg_has_setkey(struct shash_alg *alg) +{ + return alg->setkey != shash_no_setkey; +} + int crypto_init_ahash_spawn(struct crypto_ahash_spawn *spawn, struct hash_alg_common *alg, struct crypto_instance *inst); -- cgit v1.2.3 From 8d26fdfcb45dc420115b267ac9d6b3ac13457f1b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Nov 2017 14:35:08 +0100 Subject: spi: Fix double "when" Signed-off-by: Geert Uytterhoeven Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 7b2170bfd6e7..bc6bb325d1bf 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -126,7 +126,7 @@ void spi_statistics_add_transfer_stats(struct spi_statistics *stats, * for that name. This appears in the sysfs "modalias" attribute * for driver coldplugging, and in uevents used for hotplugging * @cs_gpio: gpio number of the chipselect line (optional, -ENOENT when - * when not using a GPIO line) + * not using a GPIO line) * * @statistics: statistics for the spi_device * -- cgit v1.2.3 From 250d0c7754aa37c6443f07f1f5f591e2806295d8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 10:32:13 +0200 Subject: tracing: always define trace_{irq,preempt}_{enable_disable} We get a build error in the irqsoff tracer in some configurations: kernel/trace/trace_irqsoff.c: In function 'trace_preempt_on': kernel/trace/trace_irqsoff.c:855:2: error: implicit declaration of function 'trace_preempt_enable_rcuidle'; did you mean 'trace_irq_enable_rcuidle'? [-Werror=implicit-function-declaration] trace_preempt_enable_rcuidle(a0, a1); The problem is that trace_preempt_enable_rcuidle() has different definition based on multiple Kconfig symbols, but not all combinations have a valid definition. This changes the conditions so that we always get exactly one definition of each of the four tracing macros. I have not tried to verify that these definitions are sensible, but now we can build all randconfig combinations again. Link: http://lkml.kernel.org/r/20171019083230.2450779-1-arnd@arndb.de Fixes: d59158162e03 ("tracing: Add support for preempt and irq enable/disable events") Acked-by: Joel Fernandes Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- include/trace/events/preemptirq.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h index f5024c560d8f..9c4eb33c5a1d 100644 --- a/include/trace/events/preemptirq.h +++ b/include/trace/events/preemptirq.h @@ -56,15 +56,18 @@ DEFINE_EVENT(preemptirq_template, preempt_enable, #include -#else /* !CONFIG_PREEMPTIRQ_EVENTS */ +#endif /* !CONFIG_PREEMPTIRQ_EVENTS */ +#if !defined(CONFIG_PREEMPTIRQ_EVENTS) || defined(CONFIG_PROVE_LOCKING) #define trace_irq_enable(...) #define trace_irq_disable(...) -#define trace_preempt_enable(...) -#define trace_preempt_disable(...) #define trace_irq_enable_rcuidle(...) #define trace_irq_disable_rcuidle(...) +#endif + +#if !defined(CONFIG_PREEMPTIRQ_EVENTS) || !defined(CONFIG_DEBUG_PREEMPT) +#define trace_preempt_enable(...) +#define trace_preempt_disable(...) #define trace_preempt_enable_rcuidle(...) #define trace_preempt_disable_rcuidle(...) - #endif -- cgit v1.2.3 From a773d419275bf54854ca6cfda8f2594ed2790faa Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Fri, 2 Jun 2017 13:20:25 +0300 Subject: tracing: Pass export pointer as argument to ->write() By passing an export descriptor to the write function, users don't need to keep a global static pointer and can rely on container_of() to fetch their own structure. Link: http://lkml.kernel.org/r/20170602102025.5140-1-felipe.balbi@linux.intel.com Acked-by: Steven Rostedt (VMware) Reviewed-by: Chunyan Zhang Signed-off-by: Felipe Balbi Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/trace.h b/include/linux/trace.h index d24991c1fef3..b95ffb2188ab 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -18,7 +18,7 @@ */ struct trace_export { struct trace_export __rcu *next; - void (*write)(const void *, unsigned int); + void (*write)(struct trace_export *, const void *, unsigned int); }; int register_ftrace_export(struct trace_export *export); -- cgit v1.2.3 From 975b820b6836b6b6c42fb84cd2e772e2b41bca67 Mon Sep 17 00:00:00 2001 From: Cai Li Date: Tue, 21 Nov 2017 17:24:38 +0800 Subject: clk: fix a panic error caused by accessing NULL pointer In some cases the clock parent would be set NULL when doing re-parent, it will cause a NULL pointer accessing if clk_set trace event is enabled. This patch sets the parent as "none" if the input parameter is NULL. Fixes: dfc202ead312 (clk: Add tracepoints for hardware operations) Signed-off-by: Cai Li Signed-off-by: Chunyan Zhang Signed-off-by: Stephen Boyd --- include/trace/events/clk.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/clk.h b/include/trace/events/clk.h index 758607226bfd..2cd449328aee 100644 --- a/include/trace/events/clk.h +++ b/include/trace/events/clk.h @@ -134,12 +134,12 @@ DECLARE_EVENT_CLASS(clk_parent, TP_STRUCT__entry( __string( name, core->name ) - __string( pname, parent->name ) + __string( pname, parent ? parent->name : "none" ) ), TP_fast_assign( __assign_str(name, core->name); - __assign_str(pname, parent->name); + __assign_str(pname, parent ? parent->name : "none"); ), TP_printk("%s %s", __get_str(name), __get_str(pname)) -- cgit v1.2.3 From 7912af5c835bd86f2b0347a480e0f40e2fab30d0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 6 Dec 2017 14:55:05 -0600 Subject: PCI: Add pci_get_domain_bus_and_slot() stub The coretemp driver build fails when CONFIG_PCI is not enabled because it uses a function that does not have a stub for that config case, so add the function stub. ../drivers/hwmon/coretemp.c: In function 'adjust_tjmax': ../drivers/hwmon/coretemp.c:250:9: error: implicit declaration of function 'pci_get_domain_bus_and_slot' [-Werror=implicit-function-declaration] struct pci_dev *host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn); ../drivers/hwmon/coretemp.c:250:32: warning: initialization makes pointer from integer without a cast [enabled by default] struct pci_dev *host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn); Signed-off-by: Randy Dunlap [bhelgaas: identical patch also by Arnd Bergmann ] Signed-off-by: Bjorn Helgaas Acked-by: Guenter Roeck --- include/linux/pci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0403894147a3..c170c9250c8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1674,6 +1674,9 @@ static inline struct pci_dev *pci_get_slot(struct pci_bus *bus, static inline struct pci_dev *pci_get_bus_and_slot(unsigned int bus, unsigned int devfn) { return NULL; } +static inline struct pci_dev *pci_get_domain_bus_and_slot(int domain, + unsigned int bus, unsigned int devfn) +{ return NULL; } static inline int pci_domain_nr(struct pci_bus *bus) { return 0; } static inline struct pci_dev *pci_dev_get(struct pci_dev *dev) { return NULL; } -- cgit v1.2.3 From b860b419d970f286294fbfb2b21a4028fd8ee442 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 6 Dec 2017 12:21:35 +0100 Subject: mfd: Fix RTS5227 (and others) powermanagement Commit 8275b77a1513 ("mfd: rts5249: Add support for RTS5250S power saving") adds powersaving support for device-ids 5249 524a and 525a. But as a side effect it breaks ASPM support for all the other device-ids, causing e.g. the Haswell CPU on a Lenovo T440s to not go into a higher c-state then PC3, while previously it would go to PC7, causing the machine to idle at 7.4W instead of 6.6W! The problem here is the new option.dev_aspm_mode field, which only gets explicitly initialized in the new code for the device-ids 5249 524a and 525a. Leaving the dev_aspm_mode 0 for the other device-ids. The default dev_aspm_mode 0 is mapped to DEV_ASPM_DISABLE, but the old behavior of calling rtsx_pci_enable_aspm() when idle and rtsx_pci_disable_aspm() when busy happens when dev_aspm_mode == DEV_ASPM_DYNAMIC. This commit changes the enum so that 0 = DEV_ASPM_DYNAMIC matching the old default behavior, fixing the pm regression with the other device-ids. Fixes: 8275b77a1513 ("mfd: rts5249: Add support for RTS5250S power saving") Signed-off-by: Hans de Goede Acked-by: Rui Feng Signed-off-by: Lee Jones --- include/linux/mfd/rtsx_pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mfd/rtsx_pci.h b/include/linux/mfd/rtsx_pci.h index a2a1318a3d0c..c3d3f04d8cc6 100644 --- a/include/linux/mfd/rtsx_pci.h +++ b/include/linux/mfd/rtsx_pci.h @@ -915,10 +915,10 @@ enum PDEV_STAT {PDEV_STAT_IDLE, PDEV_STAT_RUN}; #define LTR_L1SS_PWR_GATE_CHECK_CARD_EN BIT(6) enum dev_aspm_mode { - DEV_ASPM_DISABLE = 0, DEV_ASPM_DYNAMIC, DEV_ASPM_BACKDOOR, DEV_ASPM_STATIC, + DEV_ASPM_DISABLE, }; /* -- cgit v1.2.3 From 9abffc6f2efe46c3564c04312e52e07622d40e51 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 30 Nov 2017 13:39:27 +0100 Subject: crypto: mcryptd - protect the per-CPU queue with a lock mcryptd_enqueue_request() grabs the per-CPU queue struct and protects access to it with disabled preemption. Then it schedules a worker on the same CPU. The worker in mcryptd_queue_worker() guards access to the same per-CPU variable with disabled preemption. If we take CPU-hotplug into account then it is possible that between queue_work_on() and the actual invocation of the worker the CPU goes down and the worker will be scheduled on _another_ CPU. And here the preempt_disable() protection does not work anymore. The easiest thing is to add a spin_lock() to guard access to the list. Another detail: mcryptd_queue_worker() is not processing more than MCRYPTD_BATCH invocation in a row. If there are still items left, then it will invoke queue_work() to proceed with more later. *I* would suggest to simply drop that check because it does not use a system workqueue and the workqueue is already marked as "CPU_INTENSIVE". And if preemption is required then the scheduler should do it. However if queue_work() is used then the work item is marked as CPU unbound. That means it will try to run on the local CPU but it may run on another CPU as well. Especially with CONFIG_DEBUG_WQ_FORCE_RR_CPU=y. Again, the preempt_disable() won't work here but lock which was introduced will help. In order to keep work-item on the local CPU (and avoid RR) I changed it to queue_work_on(). Cc: stable@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Herbert Xu --- include/crypto/mcryptd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/crypto/mcryptd.h b/include/crypto/mcryptd.h index cceafa01f907..b67404fc4b34 100644 --- a/include/crypto/mcryptd.h +++ b/include/crypto/mcryptd.h @@ -27,6 +27,7 @@ static inline struct mcryptd_ahash *__mcryptd_ahash_cast( struct mcryptd_cpu_queue { struct crypto_queue queue; + spinlock_t q_lock; struct work_struct work; }; -- cgit v1.2.3 From 3487972d7fa6c5143951436ada5933dcf0ec659d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Dec 2017 02:41:18 +0100 Subject: PM / sleep: Avoid excess pm_runtime_enable() calls in device_resume() Middle-layer code doing suspend-time optimizations for devices with the DPM_FLAG_SMART_SUSPEND flag set (currently, the PCI bus type and the ACPI PM domain) needs to make the core skip ->thaw_early and ->thaw callbacks for those devices in some cases and it sets the power.direct_complete flag for them for this purpose. However, it turns out that setting power.direct_complete outside of the PM core is a bad idea as it triggers an excess invocation of pm_runtime_enable() in device_resume(). For this reason, provide a helper to clear power.is_late_suspended and power.is_suspended to be invoked by the middle-layer code in question instead of setting power.direct_complete and make that code call the new helper. Fixes: c4b65157aeef (PCI / PM: Take SMART_SUSPEND driver flag into account) Fixes: 05087360fd7a (ACPI / PM: Take SMART_SUSPEND driver flag into account) Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Acked-by: Bjorn Helgaas --- include/linux/pm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pm.h b/include/linux/pm.h index 65d39115f06d..492ed473ba7e 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -765,6 +765,7 @@ extern int pm_generic_poweroff_late(struct device *dev); extern int pm_generic_poweroff(struct device *dev); extern void pm_generic_complete(struct device *dev); +extern void dev_pm_skip_next_resume_phases(struct device *dev); extern bool dev_pm_smart_suspend_and_suspended(struct device *dev); #else /* !CONFIG_PM_SLEEP */ -- cgit v1.2.3 From a8ceb5dbfde1092b466936bca0ff3be127ecf38e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 5 Dec 2017 21:29:37 +0200 Subject: ptr_ring: add barriers Users of ptr_ring expect that it's safe to give the data structure a pointer and have it be available to consumers, but that actually requires an smb_wmb or a stronger barrier. In absence of such barriers and on architectures that reorder writes, consumer might read an un=initialized value from an skb pointer stored in the skb array. This was observed causing crashes. To fix, add memory barriers. The barrier we use is a wmb, the assumption being that producers do not need to read the value so we do not need to order these reads. Reported-by: George Cherian Suggested-by: Jason Wang Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 37b4bb2545b3..6866df4f31b5 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r) /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). Callers must hold producer_lock. + * Callers are responsible for making sure pointer that is being queued + * points to a valid data. */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { if (unlikely(!r->size) || r->queue[r->producer]) return -ENOSPC; + /* Make sure the pointer we are storing points to a valid data. */ + /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */ + smp_wmb(); + r->queue[r->producer++] = ptr; if (unlikely(r->producer >= r->size)) r->producer = 0; @@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r) if (ptr) __ptr_ring_discard_one(r); + /* Make sure anyone accessing data through the pointer is up to date. */ + /* Pairs with smp_wmb in __ptr_ring_produce. */ + smp_read_barrier_depends(); return ptr; } -- cgit v1.2.3 From 200809716aed1cac586fcac4c0551a688439be1f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 10 Dec 2017 16:56:00 +0800 Subject: fou: fix some member types in guehdr guehdr struct is used to build or parse gue packets, which are always in big endian. It's better to define all guehdr members as __beXX types. Also, in validate_gue_flags it's not good to use a __be32 variable for both Standard flags(__be16) and Private flags (__be32), and pass it to other funcions. This patch could fix a bunch of sparse warnings from fou. Fixes: 5024c33ac354 ("gue: Add infrastructure for flags and options") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/gue.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/gue.h b/include/net/gue.h index 2fdb29ca74c2..fdad41469b65 100644 --- a/include/net/gue.h +++ b/include/net/gue.h @@ -44,10 +44,10 @@ struct guehdr { #else #error "Please fix " #endif - __u8 proto_ctype; - __u16 flags; + __u8 proto_ctype; + __be16 flags; }; - __u32 word; + __be32 word; }; }; @@ -84,11 +84,10 @@ static inline size_t guehdr_priv_flags_len(__be32 flags) * if there is an unknown standard or private flags, or the options length for * the flags exceeds the options length specific in hlen of the GUE header. */ -static inline int validate_gue_flags(struct guehdr *guehdr, - size_t optlen) +static inline int validate_gue_flags(struct guehdr *guehdr, size_t optlen) { + __be16 flags = guehdr->flags; size_t len; - __be32 flags = guehdr->flags; if (flags & ~GUE_FLAGS_ALL) return 1; @@ -101,12 +100,13 @@ static inline int validate_gue_flags(struct guehdr *guehdr, /* Private flags are last four bytes accounted in * guehdr_flags_len */ - flags = *(__be32 *)((void *)&guehdr[1] + len - GUE_LEN_PRIV); + __be32 pflags = *(__be32 *)((void *)&guehdr[1] + + len - GUE_LEN_PRIV); - if (flags & ~GUE_PFLAGS_ALL) + if (pflags & ~GUE_PFLAGS_ALL) return 1; - len += guehdr_priv_flags_len(flags); + len += guehdr_priv_flags_len(pflags); if (len > optlen) return 1; } -- cgit v1.2.3 From d89c70356acf11b7cf47ca5cfcafae5062a85451 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 28 Nov 2017 18:42:19 +0000 Subject: locking/core: Remove break_lock field when CONFIG_GENERIC_LOCKBREAK=y When CONFIG_GENERIC_LOCKBEAK=y, locking structures grow an extra int ->break_lock field which is used to implement raw_spin_is_contended() by setting the field to 1 when waiting on a lock and clearing it to zero when holding a lock. However, there are a few problems with this approach: - There is a write-write race between a CPU successfully taking the lock (and subsequently writing break_lock = 0) and a waiter waiting on the lock (and subsequently writing break_lock = 1). This could result in a contended lock being reported as uncontended and vice-versa. - On machines with store buffers, nothing guarantees that the writes to break_lock are visible to other CPUs at any particular time. - READ_ONCE/WRITE_ONCE are not used, so the field is potentially susceptible to harmful compiler optimisations, Consequently, the usefulness of this field is unclear and we'd be better off removing it and allowing architectures to implement raw_spin_is_contended() by providing a definition of arch_spin_is_contended(), as they can when CONFIG_GENERIC_LOCKBREAK=n. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Sebastian Ott Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1511894539-7988-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/rwlock_types.h | 3 --- include/linux/spinlock.h | 5 ----- include/linux/spinlock_types.h | 3 --- 3 files changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index cc0072e93e36..857a72ceb794 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -10,9 +10,6 @@ */ typedef struct { arch_rwlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned int magic, owner_cpu; void *owner; diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index a39186194cd6..3bf273538840 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -107,16 +107,11 @@ do { \ #define raw_spin_is_locked(lock) arch_spin_is_locked(&(lock)->raw_lock) -#ifdef CONFIG_GENERIC_LOCKBREAK -#define raw_spin_is_contended(lock) ((lock)->break_lock) -#else - #ifdef arch_spin_is_contended #define raw_spin_is_contended(lock) arch_spin_is_contended(&(lock)->raw_lock) #else #define raw_spin_is_contended(lock) (((void)(lock), 0)) #endif /*arch_spin_is_contended*/ -#endif /* * This barrier must provide two things: diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 73548eb13a5d..24b4e6f2c1a2 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -19,9 +19,6 @@ typedef struct raw_spinlock { arch_spinlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned int magic, owner_cpu; void *owner; -- cgit v1.2.3 From e966eaeeb623f09975ef362c2866fae6f86844f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 12 Dec 2017 12:31:16 +0100 Subject: locking/lockdep: Remove the cross-release locking checks This code (CONFIG_LOCKDEP_CROSSRELEASE=y and CONFIG_LOCKDEP_COMPLETIONS=y), while it found a number of old bugs initially, was also causing too many false positives that caused people to disable lockdep - which is arguably a worse overall outcome. If we disable cross-release by default but keep the code upstream then in practice the most likely outcome is that we'll allow the situation to degrade gradually, by allowing entropy to introduce more and more false positives, until it overwhelms maintenance capacity. Another bad side effect was that people were trying to work around the false positives by uglifying/complicating unrelated code. There's a marked difference between annotating locking operations and uglifying good code just due to bad lock debugging code ... This gradual decrease in quality happened to a number of debugging facilities in the kernel, and lockdep is pretty complex already, so we cannot risk this outcome. Either cross-release checking can be done right with no false positives, or it should not be included in the upstream kernel. ( Note that it might make sense to maintain it out of tree and go through the false positives every now and then and see whether new bugs were introduced. ) Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/completion.h | 45 ---------------- include/linux/lockdep.h | 125 --------------------------------------------- include/linux/sched.h | 11 ---- 3 files changed, 181 deletions(-) (limited to 'include') diff --git a/include/linux/completion.h b/include/linux/completion.h index 0662a417febe..94a59ba7d422 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -10,9 +10,6 @@ */ #include -#ifdef CONFIG_LOCKDEP_COMPLETIONS -#include -#endif /* * struct completion - structure used to maintain state for a "completion" @@ -29,58 +26,16 @@ struct completion { unsigned int done; wait_queue_head_t wait; -#ifdef CONFIG_LOCKDEP_COMPLETIONS - struct lockdep_map_cross map; -#endif }; -#ifdef CONFIG_LOCKDEP_COMPLETIONS -static inline void complete_acquire(struct completion *x) -{ - lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_); -} - -static inline void complete_release(struct completion *x) -{ - lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_); -} - -static inline void complete_release_commit(struct completion *x) -{ - lock_commit_crosslock((struct lockdep_map *)&x->map); -} - -#define init_completion_map(x, m) \ -do { \ - lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ - (m)->name, (m)->key, 0); \ - __init_completion(x); \ -} while (0) - -#define init_completion(x) \ -do { \ - static struct lock_class_key __key; \ - lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ - "(completion)" #x, \ - &__key, 0); \ - __init_completion(x); \ -} while (0) -#else #define init_completion_map(x, m) __init_completion(x) #define init_completion(x) __init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} static inline void complete_release_commit(struct completion *x) {} -#endif -#ifdef CONFIG_LOCKDEP_COMPLETIONS -#define COMPLETION_INITIALIZER(work) \ - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ - STATIC_CROSS_LOCKDEP_MAP_INIT("(completion)" #work, &(work)) } -#else #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } -#endif #define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \ (*({ init_completion_map(&(work), &(map)); &(work); })) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index a842551fe044..2e75dc34bff5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -158,12 +158,6 @@ struct lockdep_map { int cpu; unsigned long ip; #endif -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - /* - * Whether it's a crosslock. - */ - int cross; -#endif }; static inline void lockdep_copy_map(struct lockdep_map *to, @@ -267,95 +261,8 @@ struct held_lock { unsigned int hardirqs_off:1; unsigned int references:12; /* 32 bits */ unsigned int pin_count; -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - /* - * Generation id. - * - * A value of cross_gen_id will be stored when holding this, - * which is globally increased whenever each crosslock is held. - */ - unsigned int gen_id; -#endif -}; - -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#define MAX_XHLOCK_TRACE_ENTRIES 5 - -/* - * This is for keeping locks waiting for commit so that true dependencies - * can be added at commit step. - */ -struct hist_lock { - /* - * Id for each entry in the ring buffer. This is used to - * decide whether the ring buffer was overwritten or not. - * - * For example, - * - * |<----------- hist_lock ring buffer size ------->| - * pppppppppppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiii - * wrapped > iiiiiiiiiiiiiiiiiiiiiiiiiii....................... - * - * where 'p' represents an acquisition in process - * context, 'i' represents an acquisition in irq - * context. - * - * In this example, the ring buffer was overwritten by - * acquisitions in irq context, that should be detected on - * rollback or commit. - */ - unsigned int hist_id; - - /* - * Seperate stack_trace data. This will be used at commit step. - */ - struct stack_trace trace; - unsigned long trace_entries[MAX_XHLOCK_TRACE_ENTRIES]; - - /* - * Seperate hlock instance. This will be used at commit step. - * - * TODO: Use a smaller data structure containing only necessary - * data. However, we should make lockdep code able to handle the - * smaller one first. - */ - struct held_lock hlock; }; -/* - * To initialize a lock as crosslock, lockdep_init_map_crosslock() should - * be called instead of lockdep_init_map(). - */ -struct cross_lock { - /* - * When more than one acquisition of crosslocks are overlapped, - * we have to perform commit for them based on cross_gen_id of - * the first acquisition, which allows us to add more true - * dependencies. - * - * Moreover, when no acquisition of a crosslock is in progress, - * we should not perform commit because the lock might not exist - * any more, which might cause incorrect memory access. So we - * have to track the number of acquisitions of a crosslock. - */ - int nr_acquire; - - /* - * Seperate hlock instance. This will be used at commit step. - * - * TODO: Use a smaller data structure containing only necessary - * data. However, we should make lockdep code able to handle the - * smaller one first. - */ - struct held_lock hlock; -}; - -struct lockdep_map_cross { - struct lockdep_map map; - struct cross_lock xlock; -}; -#endif - /* * Initialization, self-test and debugging-output methods: */ @@ -560,37 +467,6 @@ enum xhlock_context_t { XHLOCK_CTX_NR, }; -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -extern void lockdep_init_map_crosslock(struct lockdep_map *lock, - const char *name, - struct lock_class_key *key, - int subclass); -extern void lock_commit_crosslock(struct lockdep_map *lock); - -/* - * What we essencially have to initialize is 'nr_acquire'. Other members - * will be initialized in add_xlock(). - */ -#define STATIC_CROSS_LOCK_INIT() \ - { .nr_acquire = 0,} - -#define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \ - { .map.name = (_name), .map.key = (void *)(_key), \ - .map.cross = 1, .xlock = STATIC_CROSS_LOCK_INIT(), } - -/* - * To initialize a lockdep_map statically use this macro. - * Note that _name must not be NULL. - */ -#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ - { .name = (_name), .key = (void *)(_key), .cross = 0, } - -extern void crossrelease_hist_start(enum xhlock_context_t c); -extern void crossrelease_hist_end(enum xhlock_context_t c); -extern void lockdep_invariant_state(bool force); -extern void lockdep_init_task(struct task_struct *task); -extern void lockdep_free_task(struct task_struct *task); -#else /* !CROSSRELEASE */ #define lockdep_init_map_crosslock(m, n, k, s) do {} while (0) /* * To initialize a lockdep_map statically use this macro. @@ -604,7 +480,6 @@ static inline void crossrelease_hist_end(enum xhlock_context_t c) {} static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} -#endif /* CROSSRELEASE */ #ifdef CONFIG_LOCK_STAT diff --git a/include/linux/sched.h b/include/linux/sched.h index 21991d668d35..9ce6c3001e9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -849,17 +849,6 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#define MAX_XHLOCKS_NR 64UL - struct hist_lock *xhlocks; /* Crossrelease history locks */ - unsigned int xhlock_idx; - /* For restoring at history boundaries */ - unsigned int xhlock_idx_hist[XHLOCK_CTX_NR]; - unsigned int hist_id; - /* For overwrite check at each context exit */ - unsigned int hist_id_save[XHLOCK_CTX_NR]; -#endif - #ifdef CONFIG_UBSAN unsigned int in_ubsan; #endif -- cgit v1.2.3 From b899a850431e2dd0943205a63a68573f3e312d0d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 27 Nov 2017 10:38:23 +0000 Subject: compiler.h: Remove ACCESS_ONCE() There are no longer any kernelspace uses of ACCESS_ONCE(), so we can remove the definition from . This patch removes the ACCESS_ONCE() definition, and updates comments which referred to it. At the same time, some inconsistent and redundant whitespace is removed from comments. Tested-by: Paul E. McKenney Signed-off-by: Mark Rutland Cc: Arnaldo Carvalho de Melo Cc: Joe Perches Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: apw@canonical.com Link: http://lkml.kernel.org/r/20171127103824.36526-4-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 47 +++++++++++------------------------------------ 1 file changed, 11 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 188ed9f65517..52e611ab9a6c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -220,21 +220,21 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of - * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the - * compiler is aware of some particular ordering. One way to make the - * compiler aware of ordering is to put the two invocations of READ_ONCE, - * WRITE_ONCE or ACCESS_ONCE() in different C statements. + * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some + * particular ordering. One way to make the compiler aware of ordering is to + * put the two invocations of READ_ONCE or WRITE_ONCE in different C + * statements. * - * In contrast to ACCESS_ONCE these two macros will also work on aggregate - * data types like structs or unions. If the size of the accessed data - * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) - * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at - * least two memcpy()s: one for the __builtin_memcpy() and then one for - * the macro doing the copy of variable - '__u' allocated on the stack. + * These two macros will also work on aggregate data types like structs or + * unions. If the size of the accessed data type exceeds the word size of + * the machine (e.g., 32 bits or 64 bits) READ_ONCE() and WRITE_ONCE() will + * fall back to memcpy(). There's at least two memcpy()s: one for the + * __builtin_memcpy() and then one for the macro doing the copy of variable + * - '__u' allocated on the stack. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, - * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise * mutilate accesses that either do not require ordering or that interact * with an explicit memory barrier or atomic instruction that provides the * required ordering. @@ -327,29 +327,4 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s compiletime_assert(__native_word(t), \ "Need native word sized stores/loads for atomicity.") -/* - * Prevent the compiler from merging or refetching accesses. The compiler - * is also forbidden from reordering successive instances of ACCESS_ONCE(), - * but only when the compiler is aware of some particular ordering. One way - * to make the compiler aware of ordering is to put the two invocations of - * ACCESS_ONCE() in different C statements. - * - * ACCESS_ONCE will only work on scalar types. For union types, ACCESS_ONCE - * on a union member will work as long as the size of the member matches the - * size of the union and the size is smaller than word size. - * - * The major use cases of ACCESS_ONCE used to be (1) Mediating communication - * between process-level code and irq/NMI handlers, all running on the same CPU, - * and (2) Ensuring that the compiler does not fold, spindle, or otherwise - * mutilate accesses that either do not require ordering or that interact - * with an explicit memory barrier or atomic instruction that provides the - * required ordering. - * - * If possible use READ_ONCE()/WRITE_ONCE() instead. - */ -#define __ACCESS_ONCE(x) ({ \ - __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \ - (volatile typeof(x) *)&(x); }) -#define ACCESS_ONCE(x) (*__ACCESS_ONCE(x)) - #endif /* __LINUX_COMPILER_H */ -- cgit v1.2.3 From 4b4df570b41dbb421f52605357d5d56c872df6d9 Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Wed, 13 Dec 2017 00:44:26 -0800 Subject: drm: Update edid-derived drm_display_info fields at edid property set [v2] There are a set of values in the drm_display_info structure for each connector which hold information derived from EDID. These are computed in drm_add_display_info. Before this patch, that was only called in drm_add_edid_modes. This meant that they were only set when EDID was present and never reset when EDID was not, as happened when the display was disconnected. One of these fields, non_desktop, is used from drm_mode_connector_update_edid_property, the function responsible for assigning the new edid value to the application-visible property. Various drivers call these two functions (drm_add_edid_modes and drm_mode_connector_update_edid_property) in different orders. This means that even when EDID is present, the drm_display_info fields may not have been computed at the time that drm_mode_connector_update_edid_property used the non_desktop value to set the non_desktop property. I've added a public function (drm_reset_display_info) that resets the drm_display_info field values to default values and then made the drm_add_display_info function public. These two functions are now called directly from drm_mode_connector_update_edid_property so that the drm_display_info fields are always computed from the current EDID information before being used in that function. This means that the drm_display_info values are often computed twice, once when the EDID property it set and a second time when EDID is used to compute modes for the device. The alternative would be to uniformly ensure that the values were computed once before being used, which would require that all drivers reliably invoke the two paths in the same order. The computation is inexpensive enough that it seems more maintainable in the long term to simply compute them in both paths. The API to drm_add_display_info has been changed so that it no longer takes the set of edid-based quirks as a parameter. Rather, it now computes those quirks itself and returns them for further use by drm_add_edid_modes. This patch also includes a number of 'const' additions caused by drm_mode_connector_update_edid_property taking a 'const struct edid *' parameter and wanting to pass that along to drm_add_display_info. v2: after review by Daniel Vetter Removed EXPORT_SYMBOL_GPL for drm_reset_display_info and drm_add_display_info. Added FIXME in drm_mode_connector_update_edid_property about potentially merging that with drm_add_edid_modes to avoid the need for two driver calls. Signed-off-by: Keith Packard Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20171213084427.31199-1-keithp@keithp.com (danvet: cherry picked from commit 12a889bf4bca ("drm: rework delayed connector cleanup in connector_iter") from drm-misc-next since functional conflict with changes in -next and we need to make sure both have the right version and nothing gets lost.) Signed-off-by: Daniel Vetter --- include/drm/drm_edid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h index 2ec41d032e56..efe6d5a8e834 100644 --- a/include/drm/drm_edid.h +++ b/include/drm/drm_edid.h @@ -465,6 +465,8 @@ struct edid *drm_get_edid(struct drm_connector *connector, struct edid *drm_get_edid_switcheroo(struct drm_connector *connector, struct i2c_adapter *adapter); struct edid *drm_edid_duplicate(const struct edid *edid); +void drm_reset_display_info(struct drm_connector *connector); +u32 drm_add_display_info(struct drm_connector *connector, const struct edid *edid); int drm_add_edid_modes(struct drm_connector *connector, struct edid *edid); u8 drm_match_cea_mode(const struct drm_display_mode *to_match); -- cgit v1.2.3 From b5476022bbada3764609368f03329ca287528dc8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Dec 2017 07:17:39 -0800 Subject: ipv4: igmp: guard against silly MTU values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IPv4 stack reacts to changes to small MTU, by disabling itself under RTNL. But there is a window where threads not using RTNL can see a wrong device mtu. This can lead to surprises, in igmp code where it is assumed the mtu is suitable. Fix this by reading device mtu once and checking IPv4 minimal MTU. This patch adds missing IPV4_MIN_MTU define, to not abuse ETH_MIN_MTU anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 9896f46cbbf1..af8addbaa3c1 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -34,6 +34,7 @@ #include #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ +#define IPV4_MIN_MTU 68 /* RFC 791 */ struct sock; -- cgit v1.2.3 From ea497bb92064875497554ee7cdf10df7fb7393fc Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 13 Dec 2017 13:49:36 +0100 Subject: drm: rework delayed connector cleanup in connector_iter PROBE_DEFER also uses system_wq to reprobe drivers, which means when that again fails, and we try to flush the overall system_wq (to get all the delayed connectore cleanup work_struct completed), we deadlock. Fix this by using just a single cleanup work, so that we can only flush that one and don't block on anything else. That means a free list plus locking, a standard pattern. v2: - Correctly free connectors only on last ref. Oops (Chris). - use llist_head/node (Chris). v3 - Add init_llist_head (Chris). Fixes: a703c55004e1 ("drm: safely free connectors from connector_iter") Fixes: 613051dac40d ("drm: locking&new iterators for connector_list") Cc: Ben Widawsky Cc: Dave Airlie Cc: Chris Wilson Cc: Sean Paul Cc: # v4.11+: 613051dac40d ("drm: locking&new iterators for connector_list" Cc: # v4.11+ Cc: Daniel Vetter Cc: Jani Nikula Cc: Gustavo Padovan Cc: David Airlie Cc: Javier Martinez Canillas Cc: Shuah Khan Cc: Guillaume Tucker Cc: Mark Brown Cc: Kevin Hilman Cc: Matt Hart Cc: Thierry Escande Cc: Tomeu Vizoso Cc: Enric Balletbo i Serra Tested-by: Marek Szyprowski Reviewed-by: Chris Wilson Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20171213124936.17914-1-daniel.vetter@ffwll.ch --- include/drm/drm_connector.h | 10 ++++++---- include/drm/drm_mode_config.h | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index a4649c56ca2f..5971577016a2 100644 --- a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -24,6 +24,7 @@ #define __DRM_CONNECTOR_H__ #include +#include #include #include #include @@ -918,12 +919,13 @@ struct drm_connector { uint16_t tile_h_size, tile_v_size; /** - * @free_work: + * @free_node: * - * Work used only by &drm_connector_iter to be able to clean up a - * connector from any context. + * List used only by &drm_connector_iter to be able to clean up a + * connector from any context, in conjunction with + * &drm_mode_config.connector_free_work. */ - struct work_struct free_work; + struct llist_node free_node; }; #define obj_to_connector(x) container_of(x, struct drm_connector, base) diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h index b21e827c5c78..b0ce26d71296 100644 --- a/include/drm/drm_mode_config.h +++ b/include/drm/drm_mode_config.h @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -393,7 +394,7 @@ struct drm_mode_config { /** * @connector_list_lock: Protects @num_connector and - * @connector_list. + * @connector_list and @connector_free_list. */ spinlock_t connector_list_lock; /** @@ -413,6 +414,21 @@ struct drm_mode_config { * &struct drm_connector_list_iter to walk this list. */ struct list_head connector_list; + /** + * @connector_free_list: + * + * List of connector objects linked with &drm_connector.free_head. + * Protected by @connector_list_lock. Used by + * drm_for_each_connector_iter() and + * &struct drm_connector_list_iter to savely free connectors using + * @connector_free_work. + */ + struct llist_head connector_free_list; + /** + * @connector_free_work: Work to clean up @connector_free_list. + */ + struct work_struct connector_free_work; + /** * @num_encoder: * -- cgit v1.2.3 From c47d7f56e914900410f65835933f9fc4374d0a2b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 14 Dec 2017 15:32:24 -0800 Subject: include/linux/idr.h: add #include The was removed from radix-tree.h by commit f5bba9d11a25 ("include/linux/radix-tree.h: remove unneeded #include "). Since that commit, tools/testing/radix-tree/ couldn't pass compilation due to tools/testing/radix-tree/idr.c:17: undefined reference to WARN_ON_ONCE. This patch adds the bug.h header to idr.h to solve the issue. Link: http://lkml.kernel.org/r/1511963726-34070-2-git-send-email-wei.w.wang@intel.com Fixes: f5bba9d11a2 ("include/linux/radix-tree.h: remove unneeded #include ") Signed-off-by: Wei Wang Cc: Matthew Wilcox Cc: Jan Kara Cc: Eric Biggers Cc: Tejun Heo Cc: Masahiro Yamada Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/idr.h b/include/linux/idr.h index 7c3a365f7e12..fa14f834e4ed 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -15,6 +15,7 @@ #include #include #include +#include struct idr { struct radix_tree_root idr_rt; -- cgit v1.2.3 From 338f1d9d1b829fec494d053f62820a2ee625b1ec Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 14 Dec 2017 15:32:28 -0800 Subject: lib/rbtree,drm/mm: add rbtree_replace_node_cached() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a variant of rbtree_replace_node() that maintains the leftmost cache of struct rbtree_root_cached when replacing nodes within the rbtree. As drm_mm is the only rb_replace_node() being used on an interval tree, the mistake looks fairly self-contained. Furthermore the only user of drm_mm_replace_node() is its testsuite... Testcase: igt/drm_mm/replace Link: http://lkml.kernel.org/r/20171122100729.3742-1-chris@chris-wilson.co.uk Link: https://patchwork.freedesktop.org/patch/msgid/20171109212435.9265-1-chris@chris-wilson.co.uk Fixes: f808c13fd373 ("lib/interval_tree: fast overlap detection") Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Acked-by: Davidlohr Bueso Cc: Jérôme Glisse Cc: Joonas Lahtinen Cc: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index d574361943ea..fcbeed4053ef 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -99,6 +99,8 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); +extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) -- cgit v1.2.3 From 146734b091430c80d80bb96b1139a96fb4bc830e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:34 -0800 Subject: string.h: workaround for increased stack usage The hardened strlen() function causes rather large stack usage in at least one file in the kernel, in particular when CONFIG_KASAN is enabled: drivers/media/usb/em28xx/em28xx-dvb.c: In function 'em28xx_dvb_init': drivers/media/usb/em28xx/em28xx-dvb.c:2062:1: error: the frame size of 3256 bytes is larger than 204 bytes [-Werror=frame-larger-than=] Analyzing this problem led to the discovery that gcc fails to merge the stack slots for the i2c_board_info[] structures after we strlcpy() into them, due to the 'noreturn' attribute on the source string length check. I reported this as a gcc bug, but it is unlikely to get fixed for gcc-8, since it is relatively easy to work around, and it gets triggered rarely. An earlier workaround I did added an empty inline assembly statement before the call to fortify_panic(), which works surprisingly well, but is really ugly and unintuitive. This is a new approach to the same problem, this time addressing it by not calling the 'extern __real_strnlen()' function for string constants where __builtin_strlen() is a compile-time constant and therefore known to be safe. We do this by checking if the last character in the string is a compile-time constant '\0'. If it is, we can assume that strlen() of the string is also constant. As a side-effect, this should also improve the object code output for any other call of strlen() on a string constant. [akpm@linux-foundation.org: add comment] Link: http://lkml.kernel.org/r/20171205215143.3085755-1-arnd@arndb.de Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365 Link: https://patchwork.kernel.org/patch/9980413/ Link: https://patchwork.kernel.org/patch/9974047/ Fixes: 6974f0c4555 ("include/linux/string.h: add the option of fortified string.h functions") Signed-off-by: Arnd Bergmann Cc: Kees Cook Cc: Mauro Carvalho Chehab Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Daniel Micay Cc: Greg Kroah-Hartman Cc: Martin Wilck Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index 410ecf17de3c..cfd83eb2f926 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -259,7 +259,10 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) { __kernel_size_t ret; size_t p_size = __builtin_object_size(p, 0); - if (p_size == (size_t)-1) + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || + (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) return __builtin_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) -- cgit v1.2.3 From 3756f6401c302617c5e091081ca4d26ab604bec5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:41 -0800 Subject: exec: avoid gcc-8 warning for get_task_comm gcc-8 warns about using strncpy() with the source size as the limit: fs/exec.c:1223:32: error: argument to 'sizeof' in 'strncpy' call is the same expression as the source; did you mean to use the size of the destination? [-Werror=sizeof-pointer-memaccess] This is indeed slightly suspicious, as it protects us from source arguments without NUL-termination, but does not guarantee that the destination is terminated. This keeps the strncpy() to ensure we have properly padded target buffer, but ensures that we use the correct length, by passing the actual length of the destination buffer as well as adding a build-time check to ensure it is exactly TASK_COMM_LEN. There are only 23 callsites which I all reviewed to ensure this is currently the case. We could get away with doing only the check or passing the right length, but it doesn't hurt to do both. Link: http://lkml.kernel.org/r/20171205151724.1764896-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Suggested-by: Kees Cook Acked-by: Kees Cook Acked-by: Ingo Molnar Cc: Alexander Viro Cc: Peter Zijlstra Cc: Serge Hallyn Cc: James Morris Cc: Aleksa Sarai Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 21991d668d35..5124ba709830 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1503,7 +1503,11 @@ static inline void set_task_comm(struct task_struct *tsk, const char *from) __set_task_comm(tsk, from, false); } -extern char *get_task_comm(char *to, struct task_struct *tsk); +extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +#define get_task_comm(buf, tsk) ({ \ + BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ + __get_task_comm(buf, sizeof(buf), tsk); \ +}) #ifdef CONFIG_SMP void scheduler_ipi(void); -- cgit v1.2.3 From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: kernel: make groups_sort calling a responsibility group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index 099058e1178b..631286535d0f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -83,6 +83,7 @@ extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); extern bool may_setgroups(void); +extern void groups_sort(struct group_info *); /* * The security context of a task -- cgit v1.2.3 From 4837fe37adff1d159904f0c013471b1ecbcb455e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 14 Dec 2017 15:33:15 -0800 Subject: mm, oom_reaper: fix memory corruption David Rientjes has reported the following memory corruption while the oom reaper tries to unmap the victims address space BUG: Bad page map in process oom_reaper pte:6353826300000000 pmd:00000000 addr:00007f50cab1d000 vm_flags:08100073 anon_vma:ffff9eea335603f0 mapping: (null) index:7f50cab1d file: (null) fault: (null) mmap: (null) readpage: (null) CPU: 2 PID: 1001 Comm: oom_reaper Call Trace: unmap_page_range+0x1068/0x1130 __oom_reap_task_mm+0xd5/0x16b oom_reaper+0xff/0x14c kthread+0xc1/0xe0 Tetsuo Handa has noticed that the synchronization inside exit_mmap is insufficient. We only synchronize with the oom reaper if tsk_is_oom_victim which is not true if the final __mmput is called from a different context than the oom victim exit path. This can trivially happen from context of any task which has grabbed mm reference (e.g. to read /proc// file which requires mm etc.). The race would look like this oom_reaper oom_victim task mmget_not_zero do_exit mmput __oom_reap_task_mm mmput __mmput exit_mmap remove_vma unmap_page_range Fix this issue by providing a new mm_is_oom_victim() helper which operates on the mm struct rather than a task. Any context which operates on a remote mm struct should use this helper in place of tsk_is_oom_victim. The flag is set in mark_oom_victim and never cleared so it is stable in the exit_mmap path. Debugged by Tetsuo Handa. Link: http://lkml.kernel.org/r/20171210095130.17110-1-mhocko@kernel.org Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") Signed-off-by: Michal Hocko Reported-by: David Rientjes Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Argangeli Cc: [4.14] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 9 +++++++++ include/linux/sched/coredump.h | 1 + 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/oom.h b/include/linux/oom.h index 01c91d874a57..5bad038ac012 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -66,6 +66,15 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } +/* + * Use this helper if tsk->mm != mm and the victim mm needs a special + * handling. This is guaranteed to stay true after once set. + */ +static inline bool mm_is_oom_victim(struct mm_struct *mm) +{ + return test_bit(MMF_OOM_VICTIM, &mm->flags); +} + /* * Checks whether a page fault on the given mm is still reliable. * This is no longer true if the oom reaper started to reap the diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 9c8847395b5e..ec912d01126f 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -70,6 +70,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ -- cgit v1.2.3 From 7a4fa29106d9a38ef005f5ab15d493c259f269c0 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 14 Dec 2017 15:54:29 +0200 Subject: net: sched: Add TCA_HW_OFFLOAD Qdiscs can be offloaded to HW, but current implementation isn't uniform. Instead, qdiscs either pass information about offload status via their TCA_OPTIONS or omit it altogether. Introduce a new attribute - TCA_HW_OFFLOAD that would form a uniform uAPI for the offloading status of qdiscs. Signed-off-by: Yuval Mintz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + include/uapi/linux/rtnetlink.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 65d0d25f2648..83a3e47d5845 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -71,6 +71,7 @@ struct Qdisc { * qdisc_tree_decrease_qlen() should stop. */ #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ +#define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index d8b5f80c2ea6..843e29aa3cac 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -557,6 +557,7 @@ enum { TCA_PAD, TCA_DUMP_INVISIBLE, TCA_CHAIN, + TCA_HW_OFFLOAD, __TCA_MAX }; -- cgit v1.2.3 From 4a98795bc8ea148b1ebbbf001283e06430cffe36 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 14 Dec 2017 15:54:31 +0200 Subject: pkt_sched: Remove TC_RED_OFFLOADED from uapi Following the previous patch, RED is now using the new uniform uapi for indicating it's offloaded. As a result, TC_RED_OFFLOADED is no longer utilized by kernel and can be removed [as it's still not part of any stable release]. Fixes: 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc") Signed-off-by: Yuval Mintz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index af3cc2f4e1ad..37b5096ae97b 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -256,7 +256,6 @@ struct tc_red_qopt { #define TC_RED_ECN 1 #define TC_RED_HARDDROP 2 #define TC_RED_ADAPTATIVE 4 -#define TC_RED_OFFLOADED 8 }; struct tc_red_xstats { -- cgit v1.2.3 From 1784f9144b143a1e8b19fe94083b040aa559182b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 5 Dec 2017 14:14:47 +0100 Subject: drivers/misc/intel/pti: Rename the header file to free up the namespace We'd like to use the 'PTI' acronym for 'Page Table Isolation' - free up the namespace by renaming the driver header to . (Also standardize the header guard name while at it.) Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: J Freyensee Cc: Greg Kroah-Hartman Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/intel-pti.h | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/pti.h | 43 ------------------------------------------- 2 files changed, 43 insertions(+), 43 deletions(-) create mode 100644 include/linux/intel-pti.h delete mode 100644 include/linux/pti.h (limited to 'include') diff --git a/include/linux/intel-pti.h b/include/linux/intel-pti.h new file mode 100644 index 000000000000..2710d72de3c9 --- /dev/null +++ b/include/linux/intel-pti.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) Intel 2011 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * The PTI (Parallel Trace Interface) driver directs trace data routed from + * various parts in the system out through the Intel Penwell PTI port and + * out of the mobile device for analysis with a debugging tool + * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7, + * compact JTAG, standard. + * + * This header file will allow other parts of the OS to use the + * interface to write out it's contents for debugging a mobile system. + */ + +#ifndef LINUX_INTEL_PTI_H_ +#define LINUX_INTEL_PTI_H_ + +/* offset for last dword of any PTI message. Part of MIPI P1149.7 */ +#define PTI_LASTDWORD_DTS 0x30 + +/* basic structure used as a write address to the PTI HW */ +struct pti_masterchannel { + u8 master; + u8 channel; +}; + +/* the following functions are defined in misc/pti.c */ +void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count); +struct pti_masterchannel *pti_request_masterchannel(u8 type, + const char *thread_name); +void pti_release_masterchannel(struct pti_masterchannel *mc); + +#endif /* LINUX_INTEL_PTI_H_ */ diff --git a/include/linux/pti.h b/include/linux/pti.h deleted file mode 100644 index b3ea01a3197e..000000000000 --- a/include/linux/pti.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) Intel 2011 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * The PTI (Parallel Trace Interface) driver directs trace data routed from - * various parts in the system out through the Intel Penwell PTI port and - * out of the mobile device for analysis with a debugging tool - * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7, - * compact JTAG, standard. - * - * This header file will allow other parts of the OS to use the - * interface to write out it's contents for debugging a mobile system. - */ - -#ifndef PTI_H_ -#define PTI_H_ - -/* offset for last dword of any PTI message. Part of MIPI P1149.7 */ -#define PTI_LASTDWORD_DTS 0x30 - -/* basic structure used as a write address to the PTI HW */ -struct pti_masterchannel { - u8 master; - u8 channel; -}; - -/* the following functions are defined in misc/pti.c */ -void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count); -struct pti_masterchannel *pti_request_masterchannel(u8 type, - const char *thread_name); -void pti_release_masterchannel(struct pti_masterchannel *mc); - -#endif /*PTI_H_*/ -- cgit v1.2.3 From c2bc66082e1048c7573d72e62f597bdc5ce13fea Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:47 +0100 Subject: locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE() [ Note, this is a Git cherry-pick of the following commit: 76ebbe78f739 ("locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE()") ... for easier x86 PTI code testing and back-porting. ] In preparation for the removal of lockless_dereference(), which is the same as READ_ONCE() on all architectures other than Alpha, add an implicit smp_read_barrier_depends() to READ_ONCE() so that it can be used to head dependency chains on all architectures. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 202710420d6d..712cd8bb00b4 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -341,6 +341,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __read_once_size(&(x), __u.__c, sizeof(x)); \ else \ __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ + smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ __u.__val; \ }) #define READ_ONCE(x) __READ_ONCE(x, 1) -- cgit v1.2.3 From 3382290ed2d5e275429cef510ab21889d3ccd164 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:48 +0100 Subject: locking/barriers: Convert users of lockless_dereference() to READ_ONCE() [ Note, this is a Git cherry-pick of the following commit: 506458efaf15 ("locking/barriers: Convert users of lockless_dereference() to READ_ONCE()") ... for easier x86 PTI code testing and back-porting. ] READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it can be used instead of lockless_dereference() without any change in semantics. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/rculist.h | 4 ++-- include/linux/rcupdate.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index c2cdd45a880a..127f534fec94 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -275,7 +275,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - container_of(lockless_dereference(ptr), type, member) + container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? @@ -368,7 +368,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * example is when items are added to the list, but never deleted. */ #define list_entry_lockless(ptr, type, member) \ - container_of((typeof(ptr))lockless_dereference(ptr), type, member) + container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1a9f70d44af9..a6ddc42f87a5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -346,7 +346,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define __rcu_dereference_check(p, c, space) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ + typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ @@ -360,7 +360,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_dereference_raw(p) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(p) ________p1 = lockless_dereference(p); \ + typeof(p) ________p1 = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) -- cgit v1.2.3 From f384dcfe4d918c1d80477d290c22ce0093823771 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 7 Dec 2017 11:46:15 +0000 Subject: KVM: arm/arm64: timer: Don't set irq as forwarded if no usable GIC If we don't have a usable GIC, do not try to set the vcpu affinity as this is guaranteed to fail. Reported-by: Andre Przywara Reviewed-by: Andre Przywara Tested-by: Andre Przywara Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- include/kvm/arm_arch_timer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 6e45608b2399..9da6ce22803f 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -62,7 +62,7 @@ struct arch_timer_cpu { bool enabled; }; -int kvm_timer_hyp_init(void); +int kvm_timer_hyp_init(bool); int kvm_timer_enable(struct kvm_vcpu *vcpu); int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From e39d200fa5bf5b94a0948db0dae44c1b73b84a56 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 14 Dec 2017 17:40:50 -0800 Subject: KVM: Fix stack-out-of-bounds read in write_mmio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by syzkaller: BUG: KASAN: stack-out-of-bounds in write_mmio+0x11e/0x270 [kvm] Read of size 8 at addr ffff8803259df7f8 by task syz-executor/32298 CPU: 6 PID: 32298 Comm: syz-executor Tainted: G OE 4.15.0-rc2+ #18 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016 Call Trace: dump_stack+0xab/0xe1 print_address_description+0x6b/0x290 kasan_report+0x28a/0x370 write_mmio+0x11e/0x270 [kvm] emulator_read_write_onepage+0x311/0x600 [kvm] emulator_read_write+0xef/0x240 [kvm] emulator_fix_hypercall+0x105/0x150 [kvm] em_hypercall+0x2b/0x80 [kvm] x86_emulate_insn+0x2b1/0x1640 [kvm] x86_emulate_instruction+0x39a/0xb90 [kvm] handle_exception+0x1b4/0x4d0 [kvm_intel] vcpu_enter_guest+0x15a0/0x2640 [kvm] kvm_arch_vcpu_ioctl_run+0x549/0x7d0 [kvm] kvm_vcpu_ioctl+0x479/0x880 [kvm] do_vfs_ioctl+0x142/0x9a0 SyS_ioctl+0x74/0x80 entry_SYSCALL_64_fastpath+0x23/0x9a The path of patched vmmcall will patch 3 bytes opcode 0F 01 C1(vmcall) to the guest memory, however, write_mmio tracepoint always prints 8 bytes through *(u64 *)val since kvm splits the mmio access into 8 bytes. This leaks 5 bytes from the kernel stack (CVE-2017-17741). This patch fixes it by just accessing the bytes which we operate on. Before patch: syz-executor-5567 [007] .... 51370.561696: kvm_mmio: mmio write len 3 gpa 0x10 val 0x1ffff10077c1010f After patch: syz-executor-13416 [002] .... 51302.299573: kvm_mmio: mmio write len 3 gpa 0x10 val 0xc1010f Reported-by: Dmitry Vyukov Reviewed-by: Darren Kenny Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Marc Zyngier Cc: Christoffer Dall Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- include/trace/events/kvm.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index e4b0b8e09932..2c735a3e6613 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -211,7 +211,7 @@ TRACE_EVENT(kvm_ack_irq, { KVM_TRACE_MMIO_WRITE, "write" } TRACE_EVENT(kvm_mmio, - TP_PROTO(int type, int len, u64 gpa, u64 val), + TP_PROTO(int type, int len, u64 gpa, void *val), TP_ARGS(type, len, gpa, val), TP_STRUCT__entry( @@ -225,7 +225,10 @@ TRACE_EVENT(kvm_mmio, __entry->type = type; __entry->len = len; __entry->gpa = gpa; - __entry->val = val; + __entry->val = 0; + if (val) + memcpy(&__entry->val, val, + min_t(u32, sizeof(__entry->val), len)); ), TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx", -- cgit v1.2.3 From 14cb0dc6479dc5ebc63b3a459a5d89a2f1b39fed Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 15:40:43 +0800 Subject: block: don't let passthrough IO go into .make_request_fn() Commit a8821f3f3("block: Improvements to bounce-buffer handling") tries to make sure that the bio to .make_request_fn won't exceed BIO_MAX_PAGES, but ignores that passthrough I/O can use blk_queue_bounce() too. Especially, passthrough IO may not be sector-aligned, and the check of 'sectors < bio_sectors(*bio_orig)' inside __blk_queue_bounce() may become true even though the max bvec number doesn't exceed BIO_MAX_PAGES, then cause the bio splitted, and the original passthrough bio is submited to generic_make_request(). This patch fixes this issue by checking if the bio is passthrough IO, and use bio_kmalloc() to allocate the cloned passthrough bio. Cc: NeilBrown Fixes: a8821f3f3("block: Improvements to bounce-buffer handling") Tested-by: Michele Ballabio Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8089ca17db9a..abd06f540863 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -241,14 +241,24 @@ struct request { struct request *next_rq; }; +static inline bool blk_op_is_scsi(unsigned int op) +{ + return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT; +} + +static inline bool blk_op_is_private(unsigned int op) +{ + return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; +} + static inline bool blk_rq_is_scsi(struct request *rq) { - return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT; + return blk_op_is_scsi(req_op(rq)); } static inline bool blk_rq_is_private(struct request *rq) { - return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT; + return blk_op_is_private(req_op(rq)); } static inline bool blk_rq_is_passthrough(struct request *rq) @@ -256,6 +266,13 @@ static inline bool blk_rq_is_passthrough(struct request *rq) return blk_rq_is_scsi(rq) || blk_rq_is_private(rq); } +static inline bool bio_is_passthrough(struct bio *bio) +{ + unsigned op = bio_op(bio); + + return blk_op_is_scsi(op) || blk_op_is_private(op); +} + static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; -- cgit v1.2.3 From 0abc2a10389f0c9070f76ca906c7382788036b93 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Dec 2017 15:40:44 +0800 Subject: block: fix blk_rq_append_bio Commit caa4b02476e3(blk-map: call blk_queue_bounce from blk_rq_append_bio) moves blk_queue_bounce() into blk_rq_append_bio(), but don't consider the fact that the bounced bio becomes invisible to caller since the parameter type is 'struct bio *'. Make it a pointer to a pointer to a bio, so the caller sees the right bio also after a bounce. Fixes: caa4b02476e3 ("blk-map: call blk_queue_bounce from blk_rq_append_bio") Cc: Christoph Hellwig Reported-by: Michele Ballabio (handling failure of blk_rq_append_bio(), only call bio_get() after blk_rq_append_bio() returns OK) Tested-by: Michele Ballabio Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index abd06f540863..100d0df38026 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -965,7 +965,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, extern void blk_rq_unprep_clone(struct request *rq); extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); -extern int blk_rq_append_bio(struct request *rq, struct bio *bio); +extern int blk_rq_append_bio(struct request *rq, struct bio **bio); extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_queue_split(struct request_queue *, struct bio **); extern void blk_recount_segments(struct request_queue *, struct bio *); -- cgit v1.2.3 From acf568ee859f098279eadf551612f103afdacb4e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 15 Dec 2017 16:40:44 +1100 Subject: xfrm: Reinject transport-mode packets through tasklet This is an old bugbear of mine: https://www.mail-archive.com/netdev@vger.kernel.org/msg03894.html By crafting special packets, it is possible to cause recursion in our kernel when processing transport-mode packets at levels that are only limited by packet size. The easiest one is with DNAT, but an even worse one is where UDP encapsulation is used in which case you just have to insert an UDP encapsulation header in between each level of recursion. This patch avoids this problem by reinjecting tranport-mode packets through a tasklet. Fixes: b05e106698d9 ("[IPV4/6]: Netfilter IPsec input hooks") Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index dc28a98ce97c..ae35991b5877 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1570,6 +1570,9 @@ int xfrm_init_state(struct xfrm_state *x); int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); +int xfrm_trans_queue(struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)); int xfrm_output_resume(struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb); -- cgit v1.2.3 From 958a1b5a5ed02a768eb27760268251af93090caf Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 11 Dec 2017 15:37:49 -0700 Subject: nl80211: Remove obsolete kerneldoc line Commit ca986ad9bcd3 (nl80211: allow multiple active scheduled scan requests) removed WIPHY_FLAG_SUPPORTS_SCHED_SCAN but left the kerneldoc description in place, leading to this docs-build warning: ./include/net/cfg80211.h:3278: warning: Excess enum value 'WIPHY_FLAG_SUPPORTS_SCHED_SCAN' description in 'wiphy_flags' Remove the line and gain a bit of peace. Signed-off-by: Jonathan Corbet Acked-by: Arend van Spriel Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8b8118a7fadb..cb4d92b79cd9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3226,7 +3226,6 @@ struct cfg80211_ops { * @WIPHY_FLAG_IBSS_RSN: The device supports IBSS RSN. * @WIPHY_FLAG_MESH_AUTH: The device supports mesh authentication by routing * auth frames to userspace. See @NL80211_MESH_SETUP_USERSPACE_AUTH. - * @WIPHY_FLAG_SUPPORTS_SCHED_SCAN: The device supports scheduled scans. * @WIPHY_FLAG_SUPPORTS_FW_ROAM: The device supports roaming feature in the * firmware. * @WIPHY_FLAG_AP_UAPSD: The device supports uapsd on AP. -- cgit v1.2.3 From 231243c82793428467524227ae02ca451e6a98e7 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 10 Nov 2017 15:59:52 +0900 Subject: Revert "mlx5: move affinity hints assignments to generic code" Before the offending commit, mlx5 core did the IRQ affinity itself, and it seems that the new generic code have some drawbacks and one of them is the lack for user ability to modify irq affinity after the initial affinity values got assigned. The issue is still being discussed and a solution in the new generic code is required, until then we need to revert this patch. This fixes the following issue: echo > /proc/irq//smp_affinity fails with -EIO This reverts commit a435393acafbf0ecff4deb3e3cb554b34f0d0664. Note: kept mlx5_get_vector_affinity in include/linux/mlx5/driver.h since it is used in mlx5_ib driver. Fixes: a435393acafb ("mlx5: move affinity hints assignments to generic code") Cc: Sagi Grimberg Cc: Thomas Gleixner Cc: Jes Sorensen Reported-by: Jes Sorensen Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a886b51511ab..40a6f33c4cde 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -556,6 +556,7 @@ struct mlx5_core_sriov { }; struct mlx5_irq_info { + cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; }; -- cgit v1.2.3 From 37e92a9d4fe38dc3e7308913575983a6a088c8d4 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 13 Nov 2017 10:11:27 +0200 Subject: net/mlx5: Fix rate limit packet pacing naming and struct In mlx5_ifc, struct size was not complete, and thus driver was sending garbage after the last defined field. Fixed it by adding reserved field to complete the struct size. In addition, rename all set_rate_limit to set_pp_rate_limit to be compliant with the Firmware <-> Driver definition. Fixes: 7486216b3a0b ("{net,IB}/mlx5: mlx5_ifc updates") Fixes: 1466cc5b23d1 ("net/mlx5: Rate limit tables support") Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 38a7577a9ce7..d44ec5f41d4a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -147,7 +147,7 @@ enum { MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773, - MLX5_CMD_OP_SET_RATE_LIMIT = 0x780, + MLX5_CMD_OP_SET_PP_RATE_LIMIT = 0x780, MLX5_CMD_OP_QUERY_RATE_LIMIT = 0x781, MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT = 0x782, MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT = 0x783, @@ -7239,7 +7239,7 @@ struct mlx5_ifc_add_vxlan_udp_dport_in_bits { u8 vxlan_udp_port[0x10]; }; -struct mlx5_ifc_set_rate_limit_out_bits { +struct mlx5_ifc_set_pp_rate_limit_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7248,7 +7248,7 @@ struct mlx5_ifc_set_rate_limit_out_bits { u8 reserved_at_40[0x40]; }; -struct mlx5_ifc_set_rate_limit_in_bits { +struct mlx5_ifc_set_pp_rate_limit_in_bits { u8 opcode[0x10]; u8 reserved_at_10[0x10]; @@ -7261,6 +7261,8 @@ struct mlx5_ifc_set_rate_limit_in_bits { u8 reserved_at_60[0x20]; u8 rate_limit[0x20]; + + u8 reserved_at_a0[0x160]; }; struct mlx5_ifc_access_register_out_bits { -- cgit v1.2.3 From d6b2785cd55ee72e9608762650b3ef299f801b1b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 21 Nov 2017 15:15:51 +0200 Subject: net/mlx5: Cleanup IRQs in case of unload failure When mlx5_stop_eqs fails to destroy any of the eqs it returns with an error. In such failure flow the function will return without releasing all EQs irqs and then pci_free_irq_vectors will fail. Fix by only warn on destroy EQ failure and continue to release other EQs and their irqs. It fixes the following kernel trace: kernel: kernel BUG at drivers/pci/msi.c:352! ... ... kernel: Call Trace: kernel: pci_disable_msix+0xd3/0x100 kernel: pci_free_irq_vectors+0xe/0x20 kernel: mlx5_load_one.isra.17+0x9f5/0xec0 [mlx5_core] Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 40a6f33c4cde..57b109c6e422 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1049,7 +1049,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, enum mlx5_eq_type type); int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq); int mlx5_start_eqs(struct mlx5_core_dev *dev); -int mlx5_stop_eqs(struct mlx5_core_dev *dev); +void mlx5_stop_eqs(struct mlx5_core_dev *dev); int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, unsigned int *irqn); int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); -- cgit v1.2.3 From 102740bd9436a3a6ba129af3a48271d794009fa5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Dec 2017 13:32:13 -0800 Subject: cls_bpf: fix offload assumptions after callback conversion cls_bpf used to take care of tracking what offload state a filter is in, i.e. it would track if offload request succeeded or not. This information would then be used to issue correct requests to the driver, e.g. requests for statistics only on offloaded filters, removing only filters which were offloaded, using add instead of replace if previous filter was not added etc. This tracking of offload state no longer functions with the new callback infrastructure. There could be multiple entities trying to offload the same filter. Throw out all the tracking and corresponding commands and simply pass to the drivers both old and new bpf program. Drivers will have to deal with offload state tracking by themselves. Fixes: 3f7889c4c79b ("net: sched: cls_bpf: call block callbacks for offload") Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 0105445cab83..8e08b6da72f3 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -694,9 +694,7 @@ struct tc_cls_matchall_offload { }; enum tc_clsbpf_command { - TC_CLSBPF_ADD, - TC_CLSBPF_REPLACE, - TC_CLSBPF_DESTROY, + TC_CLSBPF_OFFLOAD, TC_CLSBPF_STATS, }; @@ -705,6 +703,7 @@ struct tc_cls_bpf_offload { enum tc_clsbpf_command command; struct tcf_exts *exts; struct bpf_prog *prog; + struct bpf_prog *oldprog; const char *name; bool exts_integrated; u32 gen_flags; -- cgit v1.2.3 From 111be883981748acc9a56e855c8336404a8e787c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Dec 2017 11:10:17 -0700 Subject: block-throttle: avoid double charge If a bio is throttled and split after throttling, the bio could be resubmited and enters the throttling again. This will cause part of the bio to be charged multiple times. If the cgroup has an IO limit, the double charge will significantly harm the performance. The bio split becomes quite common after arbitrary bio size change. To fix this, we always set the BIO_THROTTLED flag if a bio is throttled. If the bio is cloned/split, we copy the flag to new bio too to avoid a double charge. However, cloned bio could be directed to a new disk, keeping the flag be a problem. The observation is we always set new disk for the bio in this case, so we can clear the flag in bio_set_dev(). This issue exists for a long time, arbitrary bio size change just makes it worse, so this should go into stable at least since v4.2. V1-> V2: Not add extra field in bio based on discussion with Tejun Cc: Vivek Goyal Cc: stable@vger.kernel.org Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 ++ include/linux/blk_types.h | 9 ++++----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 82f0c8fd7be8..23d29b39f71e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -492,6 +492,8 @@ extern unsigned int bvec_nr_vecs(unsigned short idx); #define bio_set_dev(bio, bdev) \ do { \ + if ((bio)->bi_disk != (bdev)->bd_disk) \ + bio_clear_flag(bio, BIO_THROTTLED);\ (bio)->bi_disk = (bdev)->bd_disk; \ (bio)->bi_partno = (bdev)->bd_partno; \ } while (0) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a1e628e032da..9e7d8bd776d2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -50,8 +50,6 @@ struct blk_issue_stat { struct bio { struct bio *bi_next; /* request queue link */ struct gendisk *bi_disk; - u8 bi_partno; - blk_status_t bi_status; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use * accessors. @@ -59,8 +57,8 @@ struct bio { unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; unsigned short bi_write_hint; - - struct bvec_iter bi_iter; + blk_status_t bi_status; + u8 bi_partno; /* Number of segments in this BIO after * physical address coalescing is performed. @@ -74,8 +72,9 @@ struct bio { unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; - atomic_t __bi_remaining; + struct bvec_iter bi_iter; + atomic_t __bi_remaining; bio_end_io_t *bi_end_io; void *bi_private; -- cgit v1.2.3 From b3cf8528bb21febb650a7ecbf080d0647be40b9f Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Tue, 12 Dec 2017 15:08:21 -0500 Subject: xen/balloon: Mark unallocated host memory as UNUSABLE Commit f5775e0b6116 ("x86/xen: discard RAM regions above the maximum reservation") left host memory not assigned to dom0 as available for memory hotplug. Unfortunately this also meant that those regions could be used by others. Specifically, commit fa564ad96366 ("x86/PCI: Enable a 64bit BAR on AMD Family 15h (Models 00-1f, 30-3f, 60-7f)") may try to map those addresses as MMIO. To prevent this mark unallocated host memory as E820_TYPE_UNUSABLE (thus effectively reverting f5775e0b6116) and keep track of that region as a hostmem resource that can be used for the hotplug. Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross --- include/xen/balloon.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/xen/balloon.h b/include/xen/balloon.h index 8906361bb50c..d0adfc78dcbd 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -43,3 +43,8 @@ static inline void xen_balloon_init(void) { } #endif + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +struct resource; +void arch_xen_balloon_init(struct resource *hostmem_resource); +#endif -- cgit v1.2.3 From 4ccafe032005e9b96acbef2e389a4de5b1254add Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Dec 2017 13:13:58 -0700 Subject: block: unalign call_single_data in struct request A previous change blindly added massive alignment to the call_single_data structure in struct request. This ballooned it in size from 296 to 320 bytes on my setup, for no valid reason at all. Use the unaligned struct __call_single_data variant instead. Fixes: 966a967116e69 ("smp: Avoid using two cache lines for struct call_single_data") Cc: stable@vger.kernel.org # v4.14 Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 100d0df38026..0ce8a372d506 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -135,7 +135,7 @@ typedef __u32 __bitwise req_flags_t; struct request { struct list_head queuelist; union { - call_single_data_t csd; + struct __call_single_data csd; u64 fifo_time; }; -- cgit v1.2.3 From bb7f0f989ca7de1153bd128a40a71709e339fa03 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 18 Dec 2017 20:12:00 -0800 Subject: bpf: fix integer overflows There were various issues related to the limited size of integers used in the verifier: - `off + size` overflow in __check_map_access() - `off + reg->off` overflow in check_mem_access() - `off + reg->var_off.value` overflow or 32-bit truncation of `reg->var_off.value` in check_mem_access() - 32-bit truncation in check_stack_boundary() Make sure that any integer math cannot overflow by not allowing pointer math with large values. Also reduce the scope of "scalar op scalar" tracking. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c561b986bab0..1632bb13ad8a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -15,11 +15,11 @@ * In practice this is far bigger than any realistic pointer offset; this limit * ensures that umax_value + (int)off + (int)size cannot overflow a u64. */ -#define BPF_MAX_VAR_OFF (1ULL << 31) +#define BPF_MAX_VAR_OFF (1 << 29) /* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO]. This ensures * that converting umax_value to int cannot overflow. */ -#define BPF_MAX_VAR_SIZ INT_MAX +#define BPF_MAX_VAR_SIZ (1 << 29) /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that -- cgit v1.2.3 From 513674b5a2c9c7a67501506419da5c3c77ac6f08 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Dec 2017 12:10:21 -0800 Subject: net: reevalulate autoflowlabel setting after sysctl setting sysctl.ip6.auto_flowlabels is default 1. In our hosts, we set it to 2. If sockopt doesn't set autoflowlabel, outcome packets from the hosts are supposed to not include flowlabel. This is true for normal packet, but not for reset packet. The reason is ipv6_pinfo.autoflowlabel is set in sock creation. Later if we change sysctl.ip6.auto_flowlabels, the ipv6_pinfo.autoflowlabel isn't changed, so the sock will keep the old behavior in terms of auto flowlabel. Reset packet is suffering from this problem, because reset packet is sent from a special control socket, which is created at boot time. Since sysctl.ipv6.auto_flowlabels is 1 by default, the control socket will always have its ipv6_pinfo.autoflowlabel set, even after user set sysctl.ipv6.auto_flowlabels to 1, so reset packset will always have flowlabel. Normal sock created before sysctl setting suffers from the same issue. We can't even turn off autoflowlabel unless we kill all socks in the hosts. To fix this, if IPV6_AUTOFLOWLABEL sockopt is used, we use the autoflowlabel setting from user, otherwise we always call ip6_default_np_autolabel() which has the new settings of sysctl. Note, this changes behavior a little bit. Before commit 42240901f7c4 (ipv6: Implement different admin modes for automatic flow labels), the autoflowlabel behavior of a sock isn't sticky, eg, if sysctl changes, existing connection will change autoflowlabel behavior. After that commit, autoflowlabel behavior is sticky in the whole life of the sock. With this patch, the behavior isn't sticky again. Cc: Martin KaFai Lau Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: Shaohua Li Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index cb18c6290ca8..8415bf1a9776 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -273,7 +273,8 @@ struct ipv6_pinfo { * 100: prefer care-of address */ dontfrag:1, - autoflowlabel:1; + autoflowlabel:1, + autoflowlabel_set:1; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; -- cgit v1.2.3 From 71a0ff65a21bf3e2c4fde208c4a635ed2bbb4e81 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Thu, 21 Dec 2017 17:38:26 +0200 Subject: IB/mlx5: Fix congestion counters in LAG mode Congestion counters are counted and queried per physical function. When working in LAG mode, CNP packets can be sent or received on both of the functions, thus congestion counters should be aggregated from the two physical functions. Fixes: e1f24a79f424 ("IB/mlx5: Support congestion related counters") Signed-off-by: Majd Dibbiny Reviewed-by: Aviv Heller Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a886b51511ab..8846919356ca 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1164,6 +1164,10 @@ int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev); int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); +int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, + u64 *values, + int num_counters, + size_t *offsets); struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev); void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up); -- cgit v1.2.3 From c10e83f598d08046dd1ebc8360d4bb12d802d51b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Dec 2017 12:27:29 +0100 Subject: arch, mm: Allow arch_dup_mmap() to fail In order to sanitize the LDT initialization on x86 arch_dup_mmap() must be allowed to fail. Fix up all instances. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Andy Lutomirsky Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: dan.j.williams@intel.com Cc: hughd@google.com Cc: keescook@google.com Cc: kirill.shutemov@linux.intel.com Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- include/asm-generic/mm_hooks.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index ea189d88a3cc..8ac4e68a12f0 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h @@ -7,9 +7,10 @@ #ifndef _ASM_GENERIC_MM_HOOKS_H #define _ASM_GENERIC_MM_HOOKS_H -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) { + return 0; } static inline void arch_exit_mmap(struct mm_struct *mm) -- cgit v1.2.3 From 613e396bc0d4c7604fba23256644e78454c68cf6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Dec 2017 10:56:29 +0100 Subject: init: Invoke init_espfix_bsp() from mm_init() init_espfix_bsp() needs to be invoked before the page table isolation initialization. Move it into mm_init() which is the place where pti_init() will be added. While at it get rid of the #ifdeffery and provide proper stub functions. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/asm-generic/pgtable.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 757dc6ffc7ba..231b35a76dd9 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1017,6 +1017,11 @@ static inline int pmd_clear_huge(pmd_t *pmd) struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); + +#ifndef CONFIG_X86_ESPFIX64 +static inline void init_espfix_bsp(void) { } +#endif + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range -- cgit v1.2.3 From aa8c6248f8c75acfd610fe15d8cae23cf70d9d09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:36 +0100 Subject: x86/mm/pti: Add infrastructure for page table isolation Add the initial files for kernel page table isolation, with a minimal init function and the boot time detection for this misfeature. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- include/linux/pti.h | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 include/linux/pti.h (limited to 'include') diff --git a/include/linux/pti.h b/include/linux/pti.h new file mode 100644 index 000000000000..0174883a935a --- /dev/null +++ b/include/linux/pti.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _INCLUDE_PTI_H +#define _INCLUDE_PTI_H + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +#include +#else +static inline void pti_init(void) { } +#endif + +#endif -- cgit v1.2.3 From 6a6b0b9914e73a8a54253dd5f6f5e5dd5e4a756c Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 21 Dec 2017 10:29:09 -0800 Subject: tcp: Avoid preprocessor directives in tracepoint macro args Using a preprocessor directive to check for CONFIG_IPV6 in the middle of a DECLARE_EVENT_CLASS macro's arg list causes sparse to report a series of errors: ./include/trace/events/tcp.h:68:1: error: directive in argument list ./include/trace/events/tcp.h:75:1: error: directive in argument list ./include/trace/events/tcp.h:144:1: error: directive in argument list ./include/trace/events/tcp.h:151:1: error: directive in argument list ./include/trace/events/tcp.h:216:1: error: directive in argument list ./include/trace/events/tcp.h:223:1: error: directive in argument list ./include/trace/events/tcp.h:274:1: error: directive in argument list ./include/trace/events/tcp.h:281:1: error: directive in argument list Once sparse finds an error, it stops printing warnings for the file it is checking. This masks any sparse warnings that would normally be reported for the core TCP code. Instead, handle the preprocessor conditionals in a couple of auxiliary macros. This also has the benefit of reducing duplicate code. Cc: David Ahern Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 97 ++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 07cccca6cbf1..ab34c561f26b 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -25,6 +25,35 @@ tcp_state_name(TCP_CLOSING), \ tcp_state_name(TCP_NEW_SYN_RECV)) +#define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ + do { \ + struct in6_addr *pin6; \ + \ + pin6 = (struct in6_addr *)__entry->saddr_v6; \ + ipv6_addr_set_v4mapped(saddr, pin6); \ + pin6 = (struct in6_addr *)__entry->daddr_v6; \ + ipv6_addr_set_v4mapped(daddr, pin6); \ + } while (0) + +#if IS_ENABLED(CONFIG_IPV6) +#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ + do { \ + if (sk->sk_family == AF_INET6) { \ + struct in6_addr *pin6; \ + \ + pin6 = (struct in6_addr *)__entry->saddr_v6; \ + *pin6 = saddr6; \ + pin6 = (struct in6_addr *)__entry->daddr_v6; \ + *pin6 = daddr6; \ + } else { \ + TP_STORE_V4MAPPED(__entry, saddr, daddr); \ + } \ + } while (0) +#else +#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ + TP_STORE_V4MAPPED(__entry, saddr, daddr) +#endif + /* * tcp event with arguments sk and skb * @@ -50,7 +79,6 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb, TP_fast_assign( struct inet_sock *inet = inet_sk(sk); - struct in6_addr *pin6; __be32 *p32; __entry->skbaddr = skb; @@ -65,20 +93,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb, p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - pin6 = (struct in6_addr *)__entry->saddr_v6; - *pin6 = sk->sk_v6_rcv_saddr; - pin6 = (struct in6_addr *)__entry->daddr_v6; - *pin6 = sk->sk_v6_daddr; - } else -#endif - { - pin6 = (struct in6_addr *)__entry->saddr_v6; - ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); - pin6 = (struct in6_addr *)__entry->daddr_v6; - ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); - } + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", @@ -127,7 +143,6 @@ DECLARE_EVENT_CLASS(tcp_event_sk, TP_fast_assign( struct inet_sock *inet = inet_sk(sk); - struct in6_addr *pin6; __be32 *p32; __entry->skaddr = sk; @@ -141,20 +156,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk, p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - pin6 = (struct in6_addr *)__entry->saddr_v6; - *pin6 = sk->sk_v6_rcv_saddr; - pin6 = (struct in6_addr *)__entry->daddr_v6; - *pin6 = sk->sk_v6_daddr; - } else -#endif - { - pin6 = (struct in6_addr *)__entry->saddr_v6; - ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); - pin6 = (struct in6_addr *)__entry->daddr_v6; - ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); - } + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", @@ -197,7 +200,6 @@ TRACE_EVENT(tcp_set_state, TP_fast_assign( struct inet_sock *inet = inet_sk(sk); - struct in6_addr *pin6; __be32 *p32; __entry->skaddr = sk; @@ -213,20 +215,8 @@ TRACE_EVENT(tcp_set_state, p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - pin6 = (struct in6_addr *)__entry->saddr_v6; - *pin6 = sk->sk_v6_rcv_saddr; - pin6 = (struct in6_addr *)__entry->daddr_v6; - *pin6 = sk->sk_v6_daddr; - } else -#endif - { - pin6 = (struct in6_addr *)__entry->saddr_v6; - ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); - pin6 = (struct in6_addr *)__entry->daddr_v6; - ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); - } + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", @@ -256,7 +246,6 @@ TRACE_EVENT(tcp_retransmit_synack, TP_fast_assign( struct inet_request_sock *ireq = inet_rsk(req); - struct in6_addr *pin6; __be32 *p32; __entry->skaddr = sk; @@ -271,20 +260,8 @@ TRACE_EVENT(tcp_retransmit_synack, p32 = (__be32 *) __entry->daddr; *p32 = ireq->ir_rmt_addr; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - pin6 = (struct in6_addr *)__entry->saddr_v6; - *pin6 = ireq->ir_v6_loc_addr; - pin6 = (struct in6_addr *)__entry->daddr_v6; - *pin6 = ireq->ir_v6_rmt_addr; - } else -#endif - { - pin6 = (struct in6_addr *)__entry->saddr_v6; - ipv6_addr_set_v4mapped(ireq->ir_loc_addr, pin6); - pin6 = (struct in6_addr *)__entry->daddr_v6; - ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, pin6); - } + TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr, + ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr); ), TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", -- cgit v1.2.3 From 39c3fd58952d7599d367c84c1330b785d91d6088 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 2 Dec 2017 18:11:04 +0100 Subject: kernel/irq: Extend lockdep class for request mutex The IRQ code already has support for lockdep class for the lock mutex in an interrupt descriptor. Extend this to add a second class for the request mutex in the descriptor. Not having a class is resulting in false positive splats in some code paths. Signed-off-by: Andrew Lunn Signed-off-by: Thomas Gleixner Acked-by: linus.walleij@linaro.org Cc: grygorii.strashko@ti.com Cc: f.fainelli@gmail.com Link: https://lkml.kernel.org/r/1512234664-21555-1-git-send-email-andrew@lunn.ch --- include/linux/gpio/driver.h | 33 +++++++++++++++++++++------------ include/linux/irqdesc.h | 9 ++++++--- 2 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 55e672592fa9..7258cd676df4 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -66,9 +66,10 @@ struct gpio_irq_chip { /** * @lock_key: * - * Per GPIO IRQ chip lockdep class. + * Per GPIO IRQ chip lockdep classes. */ struct lock_class_key *lock_key; + struct lock_class_key *request_key; /** * @parent_handler: @@ -323,7 +324,8 @@ extern const char *gpiochip_is_requested(struct gpio_chip *chip, /* add/remove chips */ extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, - struct lock_class_key *lock_key); + struct lock_class_key *lock_key, + struct lock_class_key *request_key); /** * gpiochip_add_data() - register a gpio_chip @@ -350,11 +352,13 @@ extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, */ #ifdef CONFIG_LOCKDEP #define gpiochip_add_data(chip, data) ({ \ - static struct lock_class_key key; \ - gpiochip_add_data_with_key(chip, data, &key); \ + static struct lock_class_key lock_key; \ + static struct lock_class_key request_key; \ + gpiochip_add_data_with_key(chip, data, &lock_key, \ + &request_key); \ }) #else -#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL) +#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL, NULL) #endif static inline int gpiochip_add(struct gpio_chip *chip) @@ -429,7 +433,8 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, irq_flow_handler_t handler, unsigned int type, bool threaded, - struct lock_class_key *lock_key); + struct lock_class_key *lock_key, + struct lock_class_key *request_key); #ifdef CONFIG_LOCKDEP @@ -445,10 +450,12 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, irq_flow_handler_t handler, unsigned int type) { - static struct lock_class_key key; + static struct lock_class_key lock_key; + static struct lock_class_key request_key; return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, false, &key); + handler, type, false, + &lock_key, &request_key); } static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, @@ -458,10 +465,12 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, unsigned int type) { - static struct lock_class_key key; + static struct lock_class_key lock_key; + static struct lock_class_key request_key; return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, true, &key); + handler, type, true, + &lock_key, &request_key); } #else static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, @@ -471,7 +480,7 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, unsigned int type) { return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, false, NULL); + handler, type, false, NULL, NULL); } static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, @@ -481,7 +490,7 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, unsigned int type) { return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, true, NULL); + handler, type, true, NULL, NULL); } #endif /* CONFIG_LOCKDEP */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 39fb3700f7a9..25b33b664537 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -255,12 +255,15 @@ static inline bool irq_is_percpu_devid(unsigned int irq) } static inline void -irq_set_lockdep_class(unsigned int irq, struct lock_class_key *class) +irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class) { struct irq_desc *desc = irq_to_desc(irq); - if (desc) - lockdep_set_class(&desc->lock, class); + if (desc) { + lockdep_set_class(&desc->lock, lock_class); + lockdep_set_class(&desc->request_mutex, request_class); + } } #ifdef CONFIG_IRQ_PREFLOW_FASTEOI -- cgit v1.2.3 From 466a2b42d67644447a1765276259a3ea5531ddff Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 21 Dec 2017 02:22:45 +0100 Subject: cpufreq: schedutil: Use idle_calls counter of the remote CPU Since the recent remote cpufreq callback work, its possible that a cpufreq update is triggered from a remote CPU. For single policies however, the current code uses the local CPU when trying to determine if the remote sg_cpu entered idle or is busy. This is incorrect. To remedy this, compare with the nohz tick idle_calls counter of the remote CPU. Fixes: 674e75411fc2 (sched: cpufreq: Allow remote cpufreq callbacks) Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Joel Fernandes Cc: 4.14+ # 4.14+ Signed-off-by: Rafael J. Wysocki --- include/linux/tick.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/tick.h b/include/linux/tick.h index f442d1a42025..7cc35921218e 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -119,6 +119,7 @@ extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern unsigned long tick_nohz_get_idle_calls(void); +extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ -- cgit v1.2.3 From 602f7a2714a3b3aa4bec82ab0a86a9f5a2c4aa61 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 28 Dec 2017 11:00:43 -0800 Subject: sock: Add sock_owned_by_user_nocheck This allows checking socket lock ownership with producing lockdep warnings. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/sock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 9155da422692..7a7b14e9628a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1514,6 +1514,11 @@ static inline bool sock_owned_by_user(const struct sock *sk) return sk->sk_lock.owned; } +static inline bool sock_owned_by_user_nocheck(const struct sock *sk) +{ + return sk->sk_lock.owned; +} + /* no reclassification while locks are held */ static inline bool sock_allow_reclassification(const struct sock *csk) { -- cgit v1.2.3 From 69790ba92b8d67eaee5e50b30a5b696d40664caf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 16:44:34 +0100 Subject: genirq: Introduce IRQD_CAN_RESERVE flag Add a new flag to mark interrupts which can use reservation mode. This is going to be used in subsequent patches to disable reservation mode for a certain class of MSI devices. Signed-off-by: Thomas Gleixner Tested-by: Alexandru Chirvasitu Tested-by: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org --- include/linux/irq.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index e140f69163b6..a0231e96a578 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -212,6 +212,7 @@ struct irq_data { * mask. Applies only to affinity managed irqs. * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set + * IRQD_CAN_RESERVE - Can use reservation mode */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -233,6 +234,7 @@ enum { IRQD_MANAGED_SHUTDOWN = (1 << 23), IRQD_SINGLE_TARGET = (1 << 24), IRQD_DEFAULT_TRIGGER_SET = (1 << 25), + IRQD_CAN_RESERVE = (1 << 26), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -377,6 +379,21 @@ static inline bool irqd_is_managed_and_shutdown(struct irq_data *d) return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN; } +static inline void irqd_set_can_reserve(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_CAN_RESERVE; +} + +static inline void irqd_clr_can_reserve(struct irq_data *d) +{ + __irqd_to_state(d) &= ~IRQD_CAN_RESERVE; +} + +static inline bool irqd_can_reserve(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_CAN_RESERVE; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) -- cgit v1.2.3 From 702cb0a02813299d6911b775c637906ae21b737d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 16:59:06 +0100 Subject: genirq/irqdomain: Rename early argument of irq_domain_activate_irq() The 'early' argument of irq_domain_activate_irq() is actually used to denote reservation mode. To avoid confusion, rename it before abuse happens. No functional change. Fixes: 72491643469a ("genirq/irqdomain: Update irq_domain_ops.activate() signature") Signed-off-by: Thomas Gleixner Cc: Alexandru Chirvasitu Cc: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index a34355d19546..48c7e86bb556 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -113,7 +113,7 @@ struct irq_domain_ops { unsigned int nr_irqs, void *arg); void (*free)(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs); - int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool early); + int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool reserve); void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type); -- cgit v1.2.3 From 26456f87aca7157c057de65c9414b37f1ab881d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Dec 2017 21:37:25 +0100 Subject: timers: Reinitialize per cpu bases on hotplug The timer wheel bases are not (re)initialized on CPU hotplug. That leaves them with a potentially stale clk and next_expiry valuem, which can cause trouble then the CPU is plugged. Add a prepare callback which forwards the clock, sets next_expiry to far in the future and reset the control flags to a known state. Set base->must_forward_clk so the first timer which is queued will try to forward the clock to current jiffies. Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Sebastian Siewior Cc: Anna-Maria Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272152200.2431@nanos --- include/linux/cpuhotplug.h | 2 +- include/linux/timer.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 201ab7267986..1a32e558eb11 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -86,7 +86,7 @@ enum cpuhp_state { CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, - CPUHP_TIMERS_DEAD, + CPUHP_TIMERS_PREPARE, CPUHP_MIPS_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, diff --git a/include/linux/timer.h b/include/linux/timer.h index 04af640ea95b..2448f9cc48a3 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -207,9 +207,11 @@ unsigned long round_jiffies_up(unsigned long j); unsigned long round_jiffies_up_relative(unsigned long j); #ifdef CONFIG_HOTPLUG_CPU +int timers_prepare_cpu(unsigned int cpu); int timers_dead_cpu(unsigned int cpu); #else -#define timers_dead_cpu NULL +#define timers_prepare_cpu NULL +#define timers_dead_cpu NULL #endif #endif -- cgit v1.2.3