From 8d26fdfcb45dc420115b267ac9d6b3ac13457f1b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Nov 2017 14:35:08 +0100 Subject: spi: Fix double "when" Signed-off-by: Geert Uytterhoeven Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 7b2170bfd6e7..bc6bb325d1bf 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -126,7 +126,7 @@ void spi_statistics_add_transfer_stats(struct spi_statistics *stats, * for that name. This appears in the sysfs "modalias" attribute * for driver coldplugging, and in uevents used for hotplugging * @cs_gpio: gpio number of the chipselect line (optional, -ENOENT when - * when not using a GPIO line) + * not using a GPIO line) * * @statistics: statistics for the spi_device * -- cgit v1.2.3 From a773d419275bf54854ca6cfda8f2594ed2790faa Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Fri, 2 Jun 2017 13:20:25 +0300 Subject: tracing: Pass export pointer as argument to ->write() By passing an export descriptor to the write function, users don't need to keep a global static pointer and can rely on container_of() to fetch their own structure. 
Link: http://lkml.kernel.org/r/20170602102025.5140-1-felipe.balbi@linux.intel.com Acked-by: Steven Rostedt (VMware) Reviewed-by: Chunyan Zhang Signed-off-by: Felipe Balbi Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace.h b/include/linux/trace.h index d24991c1fef3..b95ffb2188ab 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -18,7 +18,7 @@ */ struct trace_export { struct trace_export __rcu *next; - void (*write)(const void *, unsigned int); + void (*write)(struct trace_export *, const void *, unsigned int); }; int register_ftrace_export(struct trace_export *export); -- cgit v1.2.3 From 7912af5c835bd86f2b0347a480e0f40e2fab30d0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 6 Dec 2017 14:55:05 -0600 Subject: PCI: Add pci_get_domain_bus_and_slot() stub The coretemp driver build fails when CONFIG_PCI is not enabled because it uses a function that does not have a stub for that config case, so add the function stub. 
../drivers/hwmon/coretemp.c: In function 'adjust_tjmax': ../drivers/hwmon/coretemp.c:250:9: error: implicit declaration of function 'pci_get_domain_bus_and_slot' [-Werror=implicit-function-declaration] struct pci_dev *host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn); ../drivers/hwmon/coretemp.c:250:32: warning: initialization makes pointer from integer without a cast [enabled by default] struct pci_dev *host_bridge = pci_get_domain_bus_and_slot(0, 0, devfn); Signed-off-by: Randy Dunlap [bhelgaas: identical patch also by Arnd Bergmann ] Signed-off-by: Bjorn Helgaas Acked-by: Guenter Roeck --- include/linux/pci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0403894147a3..c170c9250c8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1674,6 +1674,9 @@ static inline struct pci_dev *pci_get_slot(struct pci_bus *bus, static inline struct pci_dev *pci_get_bus_and_slot(unsigned int bus, unsigned int devfn) { return NULL; } +static inline struct pci_dev *pci_get_domain_bus_and_slot(int domain, + unsigned int bus, unsigned int devfn) +{ return NULL; } static inline int pci_domain_nr(struct pci_bus *bus) { return 0; } static inline struct pci_dev *pci_dev_get(struct pci_dev *dev) { return NULL; } -- cgit v1.2.3 From b860b419d970f286294fbfb2b21a4028fd8ee442 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 6 Dec 2017 12:21:35 +0100 Subject: mfd: Fix RTS5227 (and others) powermanagement Commit 8275b77a1513 ("mfd: rts5249: Add support for RTS5250S power saving") adds powersaving support for device-ids 5249 524a and 525a. But as a side effect it breaks ASPM support for all the other device-ids, causing e.g. the Haswell CPU on a Lenovo T440s to not go into a higher c-state than PC3, while previously it would go to PC7, causing the machine to idle at 7.4W instead of 6.6W! 
The problem here is the new option.dev_aspm_mode field, which only gets explicitly initialized in the new code for the device-ids 5249 524a and 525a. Leaving the dev_aspm_mode 0 for the other device-ids. The default dev_aspm_mode 0 is mapped to DEV_ASPM_DISABLE, but the old behavior of calling rtsx_pci_enable_aspm() when idle and rtsx_pci_disable_aspm() when busy happens when dev_aspm_mode == DEV_ASPM_DYNAMIC. This commit changes the enum so that 0 = DEV_ASPM_DYNAMIC matching the old default behavior, fixing the pm regression with the other device-ids. Fixes: 8275b77a1513 ("mfd: rts5249: Add support for RTS5250S power saving") Signed-off-by: Hans de Goede Acked-by: Rui Feng Signed-off-by: Lee Jones --- include/linux/mfd/rtsx_pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/rtsx_pci.h b/include/linux/mfd/rtsx_pci.h index a2a1318a3d0c..c3d3f04d8cc6 100644 --- a/include/linux/mfd/rtsx_pci.h +++ b/include/linux/mfd/rtsx_pci.h @@ -915,10 +915,10 @@ enum PDEV_STAT {PDEV_STAT_IDLE, PDEV_STAT_RUN}; #define LTR_L1SS_PWR_GATE_CHECK_CARD_EN BIT(6) enum dev_aspm_mode { - DEV_ASPM_DISABLE = 0, DEV_ASPM_DYNAMIC, DEV_ASPM_BACKDOOR, DEV_ASPM_STATIC, + DEV_ASPM_DISABLE, }; /* -- cgit v1.2.3 From 3487972d7fa6c5143951436ada5933dcf0ec659d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Dec 2017 02:41:18 +0100 Subject: PM / sleep: Avoid excess pm_runtime_enable() calls in device_resume() Middle-layer code doing suspend-time optimizations for devices with the DPM_FLAG_SMART_SUSPEND flag set (currently, the PCI bus type and the ACPI PM domain) needs to make the core skip ->thaw_early and ->thaw callbacks for those devices in some cases and it sets the power.direct_complete flag for them for this purpose. However, it turns out that setting power.direct_complete outside of the PM core is a bad idea as it triggers an excess invocation of pm_runtime_enable() in device_resume(). 
For this reason, provide a helper to clear power.is_late_suspended and power.is_suspended to be invoked by the middle-layer code in question instead of setting power.direct_complete and make that code call the new helper. Fixes: c4b65157aeef (PCI / PM: Take SMART_SUSPEND driver flag into account) Fixes: 05087360fd7a (ACPI / PM: Take SMART_SUSPEND driver flag into account) Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Acked-by: Bjorn Helgaas --- include/linux/pm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 65d39115f06d..492ed473ba7e 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -765,6 +765,7 @@ extern int pm_generic_poweroff_late(struct device *dev); extern int pm_generic_poweroff(struct device *dev); extern void pm_generic_complete(struct device *dev); +extern void dev_pm_skip_next_resume_phases(struct device *dev); extern bool dev_pm_smart_suspend_and_suspended(struct device *dev); #else /* !CONFIG_PM_SLEEP */ -- cgit v1.2.3 From a8ceb5dbfde1092b466936bca0ff3be127ecf38e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 5 Dec 2017 21:29:37 +0200 Subject: ptr_ring: add barriers Users of ptr_ring expect that it's safe to give the data structure a pointer and have it be available to consumers, but that actually requires an smp_wmb or a stronger barrier. In absence of such barriers and on architectures that reorder writes, consumer might read an uninitialized value from an skb pointer stored in the skb array. This was observed causing crashes. To fix, add memory barriers. The barrier we use is a wmb, the assumption being that producers do not need to read the value so we do not need to order these reads. Reported-by: George Cherian Suggested-by: Jason Wang Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: David S. 
Miller --- include/linux/ptr_ring.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 37b4bb2545b3..6866df4f31b5 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r) /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). Callers must hold producer_lock. + * Callers are responsible for making sure pointer that is being queued + * points to a valid data. */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { if (unlikely(!r->size) || r->queue[r->producer]) return -ENOSPC; + /* Make sure the pointer we are storing points to a valid data. */ + /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */ + smp_wmb(); + r->queue[r->producer++] = ptr; if (unlikely(r->producer >= r->size)) r->producer = 0; @@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r) if (ptr) __ptr_ring_discard_one(r); + /* Make sure anyone accessing data through the pointer is up to date. */ + /* Pairs with smp_wmb in __ptr_ring_produce. */ + smp_read_barrier_depends(); return ptr; } -- cgit v1.2.3 From d89c70356acf11b7cf47ca5cfcafae5062a85451 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 28 Nov 2017 18:42:19 +0000 Subject: locking/core: Remove break_lock field when CONFIG_GENERIC_LOCKBREAK=y When CONFIG_GENERIC_LOCKBREAK=y, locking structures grow an extra int ->break_lock field which is used to implement raw_spin_is_contended() by setting the field to 1 when waiting on a lock and clearing it to zero when holding a lock. However, there are a few problems with this approach: - There is a write-write race between a CPU successfully taking the lock (and subsequently writing break_lock = 0) and a waiter waiting on the lock (and subsequently writing break_lock = 1). 
This could result in a contended lock being reported as uncontended and vice-versa. - On machines with store buffers, nothing guarantees that the writes to break_lock are visible to other CPUs at any particular time. - READ_ONCE/WRITE_ONCE are not used, so the field is potentially susceptible to harmful compiler optimisations, Consequently, the usefulness of this field is unclear and we'd be better off removing it and allowing architectures to implement raw_spin_is_contended() by providing a definition of arch_spin_is_contended(), as they can when CONFIG_GENERIC_LOCKBREAK=n. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Sebastian Ott Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1511894539-7988-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/rwlock_types.h | 3 --- include/linux/spinlock.h | 5 ----- include/linux/spinlock_types.h | 3 --- 3 files changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index cc0072e93e36..857a72ceb794 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -10,9 +10,6 @@ */ typedef struct { arch_rwlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned int magic, owner_cpu; void *owner; diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index a39186194cd6..3bf273538840 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -107,16 +107,11 @@ do { \ #define raw_spin_is_locked(lock) arch_spin_is_locked(&(lock)->raw_lock) -#ifdef CONFIG_GENERIC_LOCKBREAK -#define raw_spin_is_contended(lock) ((lock)->break_lock) -#else - #ifdef arch_spin_is_contended #define raw_spin_is_contended(lock) arch_spin_is_contended(&(lock)->raw_lock) #else #define raw_spin_is_contended(lock) (((void)(lock), 0)) #endif /*arch_spin_is_contended*/ -#endif /* * This 
barrier must provide two things: diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 73548eb13a5d..24b4e6f2c1a2 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -19,9 +19,6 @@ typedef struct raw_spinlock { arch_spinlock_t raw_lock; -#ifdef CONFIG_GENERIC_LOCKBREAK - unsigned int break_lock; -#endif #ifdef CONFIG_DEBUG_SPINLOCK unsigned int magic, owner_cpu; void *owner; -- cgit v1.2.3 From e966eaeeb623f09975ef362c2866fae6f86844f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 12 Dec 2017 12:31:16 +0100 Subject: locking/lockdep: Remove the cross-release locking checks This code (CONFIG_LOCKDEP_CROSSRELEASE=y and CONFIG_LOCKDEP_COMPLETIONS=y), while it found a number of old bugs initially, was also causing too many false positives that caused people to disable lockdep - which is arguably a worse overall outcome. If we disable cross-release by default but keep the code upstream then in practice the most likely outcome is that we'll allow the situation to degrade gradually, by allowing entropy to introduce more and more false positives, until it overwhelms maintenance capacity. Another bad side effect was that people were trying to work around the false positives by uglifying/complicating unrelated code. There's a marked difference between annotating locking operations and uglifying good code just due to bad lock debugging code ... This gradual decrease in quality happened to a number of debugging facilities in the kernel, and lockdep is pretty complex already, so we cannot risk this outcome. Either cross-release checking can be done right with no false positives, or it should not be included in the upstream kernel. ( Note that it might make sense to maintain it out of tree and go through the false positives every now and then and see whether new bugs were introduced. 
) Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/completion.h | 45 ---------------- include/linux/lockdep.h | 125 --------------------------------------------- include/linux/sched.h | 11 ---- 3 files changed, 181 deletions(-) (limited to 'include/linux') diff --git a/include/linux/completion.h b/include/linux/completion.h index 0662a417febe..94a59ba7d422 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -10,9 +10,6 @@ */ #include -#ifdef CONFIG_LOCKDEP_COMPLETIONS -#include -#endif /* * struct completion - structure used to maintain state for a "completion" @@ -29,58 +26,16 @@ struct completion { unsigned int done; wait_queue_head_t wait; -#ifdef CONFIG_LOCKDEP_COMPLETIONS - struct lockdep_map_cross map; -#endif }; -#ifdef CONFIG_LOCKDEP_COMPLETIONS -static inline void complete_acquire(struct completion *x) -{ - lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_); -} - -static inline void complete_release(struct completion *x) -{ - lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_); -} - -static inline void complete_release_commit(struct completion *x) -{ - lock_commit_crosslock((struct lockdep_map *)&x->map); -} - -#define init_completion_map(x, m) \ -do { \ - lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ - (m)->name, (m)->key, 0); \ - __init_completion(x); \ -} while (0) - -#define init_completion(x) \ -do { \ - static struct lock_class_key __key; \ - lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ - "(completion)" #x, \ - &__key, 0); \ - __init_completion(x); \ -} while (0) -#else #define init_completion_map(x, m) __init_completion(x) #define init_completion(x) __init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} static inline void complete_release_commit(struct completion *x) {} 
-#endif -#ifdef CONFIG_LOCKDEP_COMPLETIONS -#define COMPLETION_INITIALIZER(work) \ - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ - STATIC_CROSS_LOCKDEP_MAP_INIT("(completion)" #work, &(work)) } -#else #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } -#endif #define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \ (*({ init_completion_map(&(work), &(map)); &(work); })) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index a842551fe044..2e75dc34bff5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -158,12 +158,6 @@ struct lockdep_map { int cpu; unsigned long ip; #endif -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - /* - * Whether it's a crosslock. - */ - int cross; -#endif }; static inline void lockdep_copy_map(struct lockdep_map *to, @@ -267,95 +261,8 @@ struct held_lock { unsigned int hardirqs_off:1; unsigned int references:12; /* 32 bits */ unsigned int pin_count; -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - /* - * Generation id. - * - * A value of cross_gen_id will be stored when holding this, - * which is globally increased whenever each crosslock is held. - */ - unsigned int gen_id; -#endif -}; - -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#define MAX_XHLOCK_TRACE_ENTRIES 5 - -/* - * This is for keeping locks waiting for commit so that true dependencies - * can be added at commit step. - */ -struct hist_lock { - /* - * Id for each entry in the ring buffer. This is used to - * decide whether the ring buffer was overwritten or not. - * - * For example, - * - * |<----------- hist_lock ring buffer size ------->| - * pppppppppppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiii - * wrapped > iiiiiiiiiiiiiiiiiiiiiiiiiii....................... - * - * where 'p' represents an acquisition in process - * context, 'i' represents an acquisition in irq - * context. - * - * In this example, the ring buffer was overwritten by - * acquisitions in irq context, that should be detected on - * rollback or commit. 
- */ - unsigned int hist_id; - - /* - * Seperate stack_trace data. This will be used at commit step. - */ - struct stack_trace trace; - unsigned long trace_entries[MAX_XHLOCK_TRACE_ENTRIES]; - - /* - * Seperate hlock instance. This will be used at commit step. - * - * TODO: Use a smaller data structure containing only necessary - * data. However, we should make lockdep code able to handle the - * smaller one first. - */ - struct held_lock hlock; }; -/* - * To initialize a lock as crosslock, lockdep_init_map_crosslock() should - * be called instead of lockdep_init_map(). - */ -struct cross_lock { - /* - * When more than one acquisition of crosslocks are overlapped, - * we have to perform commit for them based on cross_gen_id of - * the first acquisition, which allows us to add more true - * dependencies. - * - * Moreover, when no acquisition of a crosslock is in progress, - * we should not perform commit because the lock might not exist - * any more, which might cause incorrect memory access. So we - * have to track the number of acquisitions of a crosslock. - */ - int nr_acquire; - - /* - * Seperate hlock instance. This will be used at commit step. - * - * TODO: Use a smaller data structure containing only necessary - * data. However, we should make lockdep code able to handle the - * smaller one first. - */ - struct held_lock hlock; -}; - -struct lockdep_map_cross { - struct lockdep_map map; - struct cross_lock xlock; -}; -#endif - /* * Initialization, self-test and debugging-output methods: */ @@ -560,37 +467,6 @@ enum xhlock_context_t { XHLOCK_CTX_NR, }; -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -extern void lockdep_init_map_crosslock(struct lockdep_map *lock, - const char *name, - struct lock_class_key *key, - int subclass); -extern void lock_commit_crosslock(struct lockdep_map *lock); - -/* - * What we essencially have to initialize is 'nr_acquire'. Other members - * will be initialized in add_xlock(). 
- */ -#define STATIC_CROSS_LOCK_INIT() \ - { .nr_acquire = 0,} - -#define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \ - { .map.name = (_name), .map.key = (void *)(_key), \ - .map.cross = 1, .xlock = STATIC_CROSS_LOCK_INIT(), } - -/* - * To initialize a lockdep_map statically use this macro. - * Note that _name must not be NULL. - */ -#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ - { .name = (_name), .key = (void *)(_key), .cross = 0, } - -extern void crossrelease_hist_start(enum xhlock_context_t c); -extern void crossrelease_hist_end(enum xhlock_context_t c); -extern void lockdep_invariant_state(bool force); -extern void lockdep_init_task(struct task_struct *task); -extern void lockdep_free_task(struct task_struct *task); -#else /* !CROSSRELEASE */ #define lockdep_init_map_crosslock(m, n, k, s) do {} while (0) /* * To initialize a lockdep_map statically use this macro. @@ -604,7 +480,6 @@ static inline void crossrelease_hist_end(enum xhlock_context_t c) {} static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} -#endif /* CROSSRELEASE */ #ifdef CONFIG_LOCK_STAT diff --git a/include/linux/sched.h b/include/linux/sched.h index 21991d668d35..9ce6c3001e9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -849,17 +849,6 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#define MAX_XHLOCKS_NR 64UL - struct hist_lock *xhlocks; /* Crossrelease history locks */ - unsigned int xhlock_idx; - /* For restoring at history boundaries */ - unsigned int xhlock_idx_hist[XHLOCK_CTX_NR]; - unsigned int hist_id; - /* For overwrite check at each context exit */ - unsigned int hist_id_save[XHLOCK_CTX_NR]; -#endif - #ifdef CONFIG_UBSAN unsigned int in_ubsan; #endif -- cgit v1.2.3 From b899a850431e2dd0943205a63a68573f3e312d0d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: 
Mon, 27 Nov 2017 10:38:23 +0000 Subject: compiler.h: Remove ACCESS_ONCE() There are no longer any kernelspace uses of ACCESS_ONCE(), so we can remove the definition from <linux/compiler.h>. This patch removes the ACCESS_ONCE() definition, and updates comments which referred to it. At the same time, some inconsistent and redundant whitespace is removed from comments. Tested-by: Paul E. McKenney Signed-off-by: Mark Rutland Cc: Arnaldo Carvalho de Melo Cc: Joe Perches Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: apw@canonical.com Link: http://lkml.kernel.org/r/20171127103824.36526-4-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 47 +++++++++++------------------------------------ 1 file changed, 11 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 188ed9f65517..52e611ab9a6c 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -220,21 +220,21 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s /* * Prevent the compiler from merging or refetching reads or writes. The * compiler is also forbidden from reordering successive instances of - * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the - * compiler is aware of some particular ordering. One way to make the - * compiler aware of ordering is to put the two invocations of READ_ONCE, - * WRITE_ONCE or ACCESS_ONCE() in different C statements. + * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some + * particular ordering. One way to make the compiler aware of ordering is to + * put the two invocations of READ_ONCE or WRITE_ONCE in different C + * statements. * - * In contrast to ACCESS_ONCE these two macros will also work on aggregate - * data types like structs or unions. 
If the size of the accessed data - * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) - * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at - * least two memcpy()s: one for the __builtin_memcpy() and then one for - * the macro doing the copy of variable - '__u' allocated on the stack. + * These two macros will also work on aggregate data types like structs or + * unions. If the size of the accessed data type exceeds the word size of + * the machine (e.g., 32 bits or 64 bits) READ_ONCE() and WRITE_ONCE() will + * fall back to memcpy(). There's at least two memcpy()s: one for the + * __builtin_memcpy() and then one for the macro doing the copy of variable + * - '__u' allocated on the stack. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, - * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise * mutilate accesses that either do not require ordering or that interact * with an explicit memory barrier or atomic instruction that provides the * required ordering. @@ -327,29 +327,4 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s compiletime_assert(__native_word(t), \ "Need native word sized stores/loads for atomicity.") -/* - * Prevent the compiler from merging or refetching accesses. The compiler - * is also forbidden from reordering successive instances of ACCESS_ONCE(), - * but only when the compiler is aware of some particular ordering. One way - * to make the compiler aware of ordering is to put the two invocations of - * ACCESS_ONCE() in different C statements. - * - * ACCESS_ONCE will only work on scalar types. For union types, ACCESS_ONCE - * on a union member will work as long as the size of the member matches the - * size of the union and the size is smaller than word size. 
- * - * The major use cases of ACCESS_ONCE used to be (1) Mediating communication - * between process-level code and irq/NMI handlers, all running on the same CPU, - * and (2) Ensuring that the compiler does not fold, spindle, or otherwise - * mutilate accesses that either do not require ordering or that interact - * with an explicit memory barrier or atomic instruction that provides the - * required ordering. - * - * If possible use READ_ONCE()/WRITE_ONCE() instead. - */ -#define __ACCESS_ONCE(x) ({ \ - __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \ - (volatile typeof(x) *)&(x); }) -#define ACCESS_ONCE(x) (*__ACCESS_ONCE(x)) - #endif /* __LINUX_COMPILER_H */ -- cgit v1.2.3 From c47d7f56e914900410f65835933f9fc4374d0a2b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 14 Dec 2017 15:32:24 -0800 Subject: include/linux/idr.h: add #include <linux/bug.h> The <linux/bug.h> was removed from radix-tree.h by commit f5bba9d11a25 ("include/linux/radix-tree.h: remove unneeded #include <linux/bug.h>"). Since that commit, tools/testing/radix-tree/ couldn't pass compilation due to tools/testing/radix-tree/idr.c:17: undefined reference to WARN_ON_ONCE. This patch adds the bug.h header to idr.h to solve the issue. 
Link: http://lkml.kernel.org/r/1511963726-34070-2-git-send-email-wei.w.wang@intel.com Fixes: f5bba9d11a2 ("include/linux/radix-tree.h: remove unneeded #include <linux/bug.h>") Signed-off-by: Wei Wang Cc: Matthew Wilcox Cc: Jan Kara Cc: Eric Biggers Cc: Tejun Heo Cc: Masahiro Yamada Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index 7c3a365f7e12..fa14f834e4ed 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -15,6 +15,7 @@ #include #include #include +#include <linux/bug.h> struct idr { struct radix_tree_root idr_rt; -- cgit v1.2.3 From 338f1d9d1b829fec494d053f62820a2ee625b1ec Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 14 Dec 2017 15:32:28 -0800 Subject: lib/rbtree,drm/mm: add rbtree_replace_node_cached() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a variant of rbtree_replace_node() that maintains the leftmost cache of struct rbtree_root_cached when replacing nodes within the rbtree. As drm_mm is the only rb_replace_node() being used on an interval tree, the mistake looks fairly self-contained. Furthermore the only user of drm_mm_replace_node() is its testsuite... 
Testcase: igt/drm_mm/replace Link: http://lkml.kernel.org/r/20171122100729.3742-1-chris@chris-wilson.co.uk Link: https://patchwork.freedesktop.org/patch/msgid/20171109212435.9265-1-chris@chris-wilson.co.uk Fixes: f808c13fd373 ("lib/interval_tree: fast overlap detection") Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Acked-by: Davidlohr Bueso Cc: Jérôme Glisse Cc: Joonas Lahtinen Cc: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index d574361943ea..fcbeed4053ef 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -99,6 +99,8 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); +extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) -- cgit v1.2.3 From 146734b091430c80d80bb96b1139a96fb4bc830e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:34 -0800 Subject: string.h: workaround for increased stack usage The hardened strlen() function causes rather large stack usage in at least one file in the kernel, in particular when CONFIG_KASAN is enabled: drivers/media/usb/em28xx/em28xx-dvb.c: In function 'em28xx_dvb_init': drivers/media/usb/em28xx/em28xx-dvb.c:2062:1: error: the frame size of 3256 bytes is larger than 204 bytes [-Werror=frame-larger-than=] Analyzing this problem led to the discovery that gcc fails to merge the stack slots for the i2c_board_info[] structures after we strlcpy() into them, due to the 'noreturn' attribute on the source string length check. 
I reported this as a gcc bug, but it is unlikely to get fixed for gcc-8, since it is relatively easy to work around, and it gets triggered rarely. An earlier workaround I did added an empty inline assembly statement before the call to fortify_panic(), which works surprisingly well, but is really ugly and unintuitive. This is a new approach to the same problem, this time addressing it by not calling the 'extern __real_strnlen()' function for string constants where __builtin_strlen() is a compile-time constant and therefore known to be safe. We do this by checking if the last character in the string is a compile-time constant '\0'. If it is, we can assume that strlen() of the string is also constant. As a side-effect, this should also improve the object code output for any other call of strlen() on a string constant. [akpm@linux-foundation.org: add comment] Link: http://lkml.kernel.org/r/20171205215143.3085755-1-arnd@arndb.de Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365 Link: https://patchwork.kernel.org/patch/9980413/ Link: https://patchwork.kernel.org/patch/9974047/ Fixes: 6974f0c4555 ("include/linux/string.h: add the option of fortified string.h functions") Signed-off-by: Arnd Bergmann Cc: Kees Cook Cc: Mauro Carvalho Chehab Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Daniel Micay Cc: Greg Kroah-Hartman Cc: Martin Wilck Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 410ecf17de3c..cfd83eb2f926 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -259,7 +259,10 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) { __kernel_size_t ret; size_t p_size = __builtin_object_size(p, 0); - if (p_size == (size_t)-1) + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || + 
(__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) return __builtin_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) -- cgit v1.2.3 From 3756f6401c302617c5e091081ca4d26ab604bec5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:41 -0800 Subject: exec: avoid gcc-8 warning for get_task_comm gcc-8 warns about using strncpy() with the source size as the limit: fs/exec.c:1223:32: error: argument to 'sizeof' in 'strncpy' call is the same expression as the source; did you mean to use the size of the destination? [-Werror=sizeof-pointer-memaccess] This is indeed slightly suspicious, as it protects us from source arguments without NUL-termination, but does not guarantee that the destination is terminated. This keeps the strncpy() to ensure we have properly padded target buffer, but ensures that we use the correct length, by passing the actual length of the destination buffer as well as adding a build-time check to ensure it is exactly TASK_COMM_LEN. There are only 23 callsites which I all reviewed to ensure this is currently the case. We could get away with doing only the check or passing the right length, but it doesn't hurt to do both. Link: http://lkml.kernel.org/r/20171205151724.1764896-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Suggested-by: Kees Cook Acked-by: Kees Cook Acked-by: Ingo Molnar Cc: Alexander Viro Cc: Peter Zijlstra Cc: Serge Hallyn Cc: James Morris Cc: Aleksa Sarai Cc: "Eric W. 
Biederman" Cc: Frederic Weisbecker Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 21991d668d35..5124ba709830 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1503,7 +1503,11 @@ static inline void set_task_comm(struct task_struct *tsk, const char *from) __set_task_comm(tsk, from, false); } -extern char *get_task_comm(char *to, struct task_struct *tsk); +extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +#define get_task_comm(buf, tsk) ({ \ + BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ + __get_task_comm(buf, sizeof(buf), tsk); \ +}) #ifdef CONFIG_SMP void scheduler_ipi(void); -- cgit v1.2.3 From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: kernel: make groups_sort calling a responsibility group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. 
Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 099058e1178b..631286535d0f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -83,6 +83,7 @@ extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); extern bool may_setgroups(void); +extern void groups_sort(struct group_info *); /* * The security context of a task -- cgit v1.2.3 From 4837fe37adff1d159904f0c013471b1ecbcb455e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 14 Dec 2017 15:33:15 -0800 Subject: mm, oom_reaper: fix memory corruption David Rientjes has reported the following memory corruption while the oom reaper tries to unmap the victim's address space BUG: Bad page map in process oom_reaper pte:6353826300000000 pmd:00000000 addr:00007f50cab1d000 vm_flags:08100073 anon_vma:ffff9eea335603f0 mapping: (null) index:7f50cab1d file: (null) fault: (null) mmap: (null) readpage: (null) CPU: 2 PID: 1001 Comm: oom_reaper Call Trace: unmap_page_range+0x1068/0x1130 __oom_reap_task_mm+0xd5/0x16b oom_reaper+0xff/0x14c kthread+0xc1/0xe0 Tetsuo Handa has noticed that the synchronization inside exit_mmap is insufficient. We only synchronize with the oom reaper if tsk_is_oom_victim which is not true if the final __mmput is called from a different context than the oom victim exit path. This can trivially happen from context of any task which has grabbed mm reference (e.g. to read /proc/<pid>/ file which requires mm etc.).
The race would look like this oom_reaper oom_victim task mmget_not_zero do_exit mmput __oom_reap_task_mm mmput __mmput exit_mmap remove_vma unmap_page_range Fix this issue by providing a new mm_is_oom_victim() helper which operates on the mm struct rather than a task. Any context which operates on a remote mm struct should use this helper in place of tsk_is_oom_victim. The flag is set in mark_oom_victim and never cleared so it is stable in the exit_mmap path. Debugged by Tetsuo Handa. Link: http://lkml.kernel.org/r/20171210095130.17110-1-mhocko@kernel.org Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") Signed-off-by: Michal Hocko Reported-by: David Rientjes Acked-by: David Rientjes Cc: Tetsuo Handa Cc: Andrea Argangeli Cc: [4.14] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 9 +++++++++ include/linux/sched/coredump.h | 1 + 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 01c91d874a57..5bad038ac012 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -66,6 +66,15 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } +/* + * Use this helper if tsk->mm != mm and the victim mm needs a special + * handling. This is guaranteed to stay true after once set. + */ +static inline bool mm_is_oom_victim(struct mm_struct *mm) +{ + return test_bit(MMF_OOM_VICTIM, &mm->flags); +} + /* * Checks whether a page fault on the given mm is still reliable. 
* This is no longer true if the oom reaper started to reap the diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 9c8847395b5e..ec912d01126f 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -70,6 +70,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ -- cgit v1.2.3 From 1784f9144b143a1e8b19fe94083b040aa559182b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 5 Dec 2017 14:14:47 +0100 Subject: drivers/misc/intel/pti: Rename the header file to free up the namespace We'd like to use the 'PTI' acronym for 'Page Table Isolation' - free up the namespace by renaming the driver header to <linux/intel-pti.h>. (Also standardize the header guard name while at it.) Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: J Freyensee Cc: Greg Kroah-Hartman Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/intel-pti.h | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/pti.h | 43 ------------------------------------------- 2 files changed, 43 insertions(+), 43 deletions(-) create mode 100644 include/linux/intel-pti.h delete mode 100644 include/linux/pti.h (limited to 'include/linux') diff --git a/include/linux/intel-pti.h b/include/linux/intel-pti.h new file mode 100644 index 000000000000..2710d72de3c9 --- /dev/null +++ b/include/linux/intel-pti.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) Intel 2011 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * The PTI (Parallel Trace Interface) driver directs trace data routed from + * various parts in the system out through the Intel Penwell PTI port and + * out of the mobile device for analysis with a debugging tool + * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7, + * compact JTAG, standard. + * + * This header file will allow other parts of the OS to use the + * interface to write out it's contents for debugging a mobile system. + */ + +#ifndef LINUX_INTEL_PTI_H_ +#define LINUX_INTEL_PTI_H_ + +/* offset for last dword of any PTI message. Part of MIPI P1149.7 */ +#define PTI_LASTDWORD_DTS 0x30 + +/* basic structure used as a write address to the PTI HW */ +struct pti_masterchannel { + u8 master; + u8 channel; +}; + +/* the following functions are defined in misc/pti.c */ +void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count); +struct pti_masterchannel *pti_request_masterchannel(u8 type, + const char *thread_name); +void pti_release_masterchannel(struct pti_masterchannel *mc); + +#endif /* LINUX_INTEL_PTI_H_ */ diff --git a/include/linux/pti.h b/include/linux/pti.h deleted file mode 100644 index b3ea01a3197e..000000000000 --- a/include/linux/pti.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) Intel 2011 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * The PTI (Parallel Trace Interface) driver directs trace data routed from - * various parts in the system out through the Intel Penwell PTI port and - * out of the mobile device for analysis with a debugging tool - * (Lauterbach, Fido). This is part of a solution for the MIPI P1149.7, - * compact JTAG, standard. - * - * This header file will allow other parts of the OS to use the - * interface to write out it's contents for debugging a mobile system. - */ - -#ifndef PTI_H_ -#define PTI_H_ - -/* offset for last dword of any PTI message. Part of MIPI P1149.7 */ -#define PTI_LASTDWORD_DTS 0x30 - -/* basic structure used as a write address to the PTI HW */ -struct pti_masterchannel { - u8 master; - u8 channel; -}; - -/* the following functions are defined in misc/pti.c */ -void pti_writedata(struct pti_masterchannel *mc, u8 *buf, int count); -struct pti_masterchannel *pti_request_masterchannel(u8 type, - const char *thread_name); -void pti_release_masterchannel(struct pti_masterchannel *mc); - -#endif /*PTI_H_*/ -- cgit v1.2.3 From c2bc66082e1048c7573d72e62f597bdc5ce13fea Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:47 +0100 Subject: locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE() [ Note, this is a Git cherry-pick of the following commit: 76ebbe78f739 ("locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE()") ... for easier x86 PTI code testing and back-porting. ] In preparation for the removal of lockless_dereference(), which is the same as READ_ONCE() on all architectures other than Alpha, add an implicit smp_read_barrier_depends() to READ_ONCE() so that it can be used to head dependency chains on all architectures. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 202710420d6d..712cd8bb00b4 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -341,6 +341,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __read_once_size(&(x), __u.__c, sizeof(x)); \ else \ __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ + smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \ __u.__val; \ }) #define READ_ONCE(x) __READ_ONCE(x, 1) -- cgit v1.2.3 From 3382290ed2d5e275429cef510ab21889d3ccd164 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 24 Oct 2017 11:22:48 +0100 Subject: locking/barriers: Convert users of lockless_dereference() to READ_ONCE() [ Note, this is a Git cherry-pick of the following commit: 506458efaf15 ("locking/barriers: Convert users of lockless_dereference() to READ_ONCE()") ... for easier x86 PTI code testing and back-porting. ] READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it can be used instead of lockless_dereference() without any change in semantics. Signed-off-by: Will Deacon Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- include/linux/rculist.h | 4 ++-- include/linux/rcupdate.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index c2cdd45a880a..127f534fec94 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -275,7 +275,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ - container_of(lockless_dereference(ptr), type, member) + container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? @@ -368,7 +368,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * example is when items are added to the list, but never deleted. */ #define list_entry_lockless(ptr, type, member) \ - container_of((typeof(ptr))lockless_dereference(ptr), type, member) + container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1a9f70d44af9..a6ddc42f87a5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -346,7 +346,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define __rcu_dereference_check(p, c, space) \ ({ \ /* Dependency order vs. p above. 
*/ \ - typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ + typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_dereference_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ @@ -360,7 +360,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_dereference_raw(p) \ ({ \ /* Dependency order vs. p above. */ \ - typeof(p) ________p1 = lockless_dereference(p); \ + typeof(p) ________p1 = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) -- cgit v1.2.3 From 14cb0dc6479dc5ebc63b3a459a5d89a2f1b39fed Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 Dec 2017 15:40:43 +0800 Subject: block: don't let passthrough IO go into .make_request_fn() Commit a8821f3f3("block: Improvements to bounce-buffer handling") tries to make sure that the bio to .make_request_fn won't exceed BIO_MAX_PAGES, but ignores that passthrough I/O can use blk_queue_bounce() too. Especially, passthrough IO may not be sector-aligned, and the check of 'sectors < bio_sectors(*bio_orig)' inside __blk_queue_bounce() may become true even though the max bvec number doesn't exceed BIO_MAX_PAGES, which then causes the bio to be split and the original passthrough bio to be submitted to generic_make_request(). This patch fixes this issue by checking if the bio is passthrough IO, and using bio_kmalloc() to allocate the cloned passthrough bio.
Cc: NeilBrown Fixes: a8821f3f3("block: Improvements to bounce-buffer handling") Tested-by: Michele Ballabio Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8089ca17db9a..abd06f540863 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -241,14 +241,24 @@ struct request { struct request *next_rq; }; +static inline bool blk_op_is_scsi(unsigned int op) +{ + return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT; +} + +static inline bool blk_op_is_private(unsigned int op) +{ + return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; +} + static inline bool blk_rq_is_scsi(struct request *rq) { - return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT; + return blk_op_is_scsi(req_op(rq)); } static inline bool blk_rq_is_private(struct request *rq) { - return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT; + return blk_op_is_private(req_op(rq)); } static inline bool blk_rq_is_passthrough(struct request *rq) @@ -256,6 +266,13 @@ static inline bool blk_rq_is_passthrough(struct request *rq) return blk_rq_is_scsi(rq) || blk_rq_is_private(rq); } +static inline bool bio_is_passthrough(struct bio *bio) +{ + unsigned op = bio_op(bio); + + return blk_op_is_scsi(op) || blk_op_is_private(op); +} + static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; -- cgit v1.2.3 From 0abc2a10389f0c9070f76ca906c7382788036b93 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Dec 2017 15:40:44 +0800 Subject: block: fix blk_rq_append_bio Commit caa4b02476e3(blk-map: call blk_queue_bounce from blk_rq_append_bio) moves blk_queue_bounce() into blk_rq_append_bio(), but don't consider the fact that the bounced bio becomes invisible to caller since the parameter type is 'struct bio *'. 
Make it a pointer to a pointer to a bio, so the caller sees the right bio also after a bounce. Fixes: caa4b02476e3 ("blk-map: call blk_queue_bounce from blk_rq_append_bio") Cc: Christoph Hellwig Reported-by: Michele Ballabio (handling failure of blk_rq_append_bio(), only call bio_get() after blk_rq_append_bio() returns OK) Tested-by: Michele Ballabio Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index abd06f540863..100d0df38026 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -965,7 +965,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, extern void blk_rq_unprep_clone(struct request *rq); extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); -extern int blk_rq_append_bio(struct request *rq, struct bio *bio); +extern int blk_rq_append_bio(struct request *rq, struct bio **bio); extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_queue_split(struct request_queue *, struct bio **); extern void blk_recount_segments(struct request_queue *, struct bio *); -- cgit v1.2.3 From 231243c82793428467524227ae02ca451e6a98e7 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 10 Nov 2017 15:59:52 +0900 Subject: Revert "mlx5: move affinity hints assignments to generic code" Before the offending commit, mlx5 core did the IRQ affinity itself, and it seems that the new generic code have some drawbacks and one of them is the lack for user ability to modify irq affinity after the initial affinity values got assigned. The issue is still being discussed and a solution in the new generic code is required, until then we need to revert this patch. This fixes the following issue: echo > /proc/irq//smp_affinity fails with -EIO This reverts commit a435393acafbf0ecff4deb3e3cb554b34f0d0664. 
Note: kept mlx5_get_vector_affinity in include/linux/mlx5/driver.h since it is used in mlx5_ib driver. Fixes: a435393acafb ("mlx5: move affinity hints assignments to generic code") Cc: Sagi Grimberg Cc: Thomas Gleixner Cc: Jes Sorensen Reported-by: Jes Sorensen Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a886b51511ab..40a6f33c4cde 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -556,6 +556,7 @@ struct mlx5_core_sriov { }; struct mlx5_irq_info { + cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; }; -- cgit v1.2.3 From 37e92a9d4fe38dc3e7308913575983a6a088c8d4 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 13 Nov 2017 10:11:27 +0200 Subject: net/mlx5: Fix rate limit packet pacing naming and struct In mlx5_ifc, struct size was not complete, and thus driver was sending garbage after the last defined field. Fixed it by adding reserved field to complete the struct size. In addition, rename all set_rate_limit to set_pp_rate_limit to be compliant with the Firmware <-> Driver definition. 
Fixes: 7486216b3a0b ("{net,IB}/mlx5: mlx5_ifc updates") Fixes: 1466cc5b23d1 ("net/mlx5: Rate limit tables support") Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 38a7577a9ce7..d44ec5f41d4a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -147,7 +147,7 @@ enum { MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773, - MLX5_CMD_OP_SET_RATE_LIMIT = 0x780, + MLX5_CMD_OP_SET_PP_RATE_LIMIT = 0x780, MLX5_CMD_OP_QUERY_RATE_LIMIT = 0x781, MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT = 0x782, MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT = 0x783, @@ -7239,7 +7239,7 @@ struct mlx5_ifc_add_vxlan_udp_dport_in_bits { u8 vxlan_udp_port[0x10]; }; -struct mlx5_ifc_set_rate_limit_out_bits { +struct mlx5_ifc_set_pp_rate_limit_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7248,7 +7248,7 @@ struct mlx5_ifc_set_rate_limit_out_bits { u8 reserved_at_40[0x40]; }; -struct mlx5_ifc_set_rate_limit_in_bits { +struct mlx5_ifc_set_pp_rate_limit_in_bits { u8 opcode[0x10]; u8 reserved_at_10[0x10]; @@ -7261,6 +7261,8 @@ struct mlx5_ifc_set_rate_limit_in_bits { u8 reserved_at_60[0x20]; u8 rate_limit[0x20]; + + u8 reserved_at_a0[0x160]; }; struct mlx5_ifc_access_register_out_bits { -- cgit v1.2.3 From d6b2785cd55ee72e9608762650b3ef299f801b1b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 21 Nov 2017 15:15:51 +0200 Subject: net/mlx5: Cleanup IRQs in case of unload failure When mlx5_stop_eqs fails to destroy any of the eqs it returns with an error. In such failure flow the function will return without releasing all EQs irqs and then pci_free_irq_vectors will fail. Fix by only warn on destroy EQ failure and continue to release other EQs and their irqs. 
It fixes the following kernel trace: kernel: kernel BUG at drivers/pci/msi.c:352! ... ... kernel: Call Trace: kernel: pci_disable_msix+0xd3/0x100 kernel: pci_free_irq_vectors+0xe/0x20 kernel: mlx5_load_one.isra.17+0x9f5/0xec0 [mlx5_core] Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 40a6f33c4cde..57b109c6e422 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1049,7 +1049,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, enum mlx5_eq_type type); int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq); int mlx5_start_eqs(struct mlx5_core_dev *dev); -int mlx5_stop_eqs(struct mlx5_core_dev *dev); +void mlx5_stop_eqs(struct mlx5_core_dev *dev); int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, unsigned int *irqn); int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); -- cgit v1.2.3 From 111be883981748acc9a56e855c8336404a8e787c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Dec 2017 11:10:17 -0700 Subject: block-throttle: avoid double charge If a bio is throttled and split after throttling, the bio could be resubmitted and enter the throttling again. This will cause part of the bio to be charged multiple times. If the cgroup has an IO limit, the double charge will significantly harm the performance. The bio split becomes quite common after arbitrary bio size change. To fix this, we always set the BIO_THROTTLED flag if a bio is throttled. If the bio is cloned/split, we copy the flag to new bio too to avoid a double charge. However, a cloned bio could be directed to a new disk, in which case keeping the flag would be a problem.
The observation is we always set new disk for the bio in this case, so we can clear the flag in bio_set_dev(). This issue exists for a long time, arbitrary bio size change just makes it worse, so this should go into stable at least since v4.2. V1-> V2: Not add extra field in bio based on discussion with Tejun Cc: Vivek Goyal Cc: stable@vger.kernel.org Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 ++ include/linux/blk_types.h | 9 ++++----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 82f0c8fd7be8..23d29b39f71e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -492,6 +492,8 @@ extern unsigned int bvec_nr_vecs(unsigned short idx); #define bio_set_dev(bio, bdev) \ do { \ + if ((bio)->bi_disk != (bdev)->bd_disk) \ + bio_clear_flag(bio, BIO_THROTTLED);\ (bio)->bi_disk = (bdev)->bd_disk; \ (bio)->bi_partno = (bdev)->bd_partno; \ } while (0) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a1e628e032da..9e7d8bd776d2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -50,8 +50,6 @@ struct blk_issue_stat { struct bio { struct bio *bi_next; /* request queue link */ struct gendisk *bi_disk; - u8 bi_partno; - blk_status_t bi_status; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use * accessors. @@ -59,8 +57,8 @@ struct bio { unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; unsigned short bi_write_hint; - - struct bvec_iter bi_iter; + blk_status_t bi_status; + u8 bi_partno; /* Number of segments in this BIO after * physical address coalescing is performed. 
@@ -74,8 +72,9 @@ struct bio { unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; - atomic_t __bi_remaining; + struct bvec_iter bi_iter; + atomic_t __bi_remaining; bio_end_io_t *bi_end_io; void *bi_private; -- cgit v1.2.3 From 4ccafe032005e9b96acbef2e389a4de5b1254add Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Dec 2017 13:13:58 -0700 Subject: block: unalign call_single_data in struct request A previous change blindly added massive alignment to the call_single_data structure in struct request. This ballooned it in size from 296 to 320 bytes on my setup, for no valid reason at all. Use the unaligned struct __call_single_data variant instead. Fixes: 966a967116e69 ("smp: Avoid using two cache lines for struct call_single_data") Cc: stable@vger.kernel.org # v4.14 Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 100d0df38026..0ce8a372d506 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -135,7 +135,7 @@ typedef __u32 __bitwise req_flags_t; struct request { struct list_head queuelist; union { - call_single_data_t csd; + struct __call_single_data csd; u64 fifo_time; }; -- cgit v1.2.3 From bb7f0f989ca7de1153bd128a40a71709e339fa03 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 18 Dec 2017 20:12:00 -0800 Subject: bpf: fix integer overflows There were various issues related to the limited size of integers used in the verifier: - `off + size` overflow in __check_map_access() - `off + reg->off` overflow in check_mem_access() - `off + reg->var_off.value` overflow or 32-bit truncation of `reg->var_off.value` in check_mem_access() - 32-bit truncation in check_stack_boundary() Make sure that any integer math cannot overflow by not allowing pointer math with large values. Also reduce the scope of "scalar op scalar" tracking. 
Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c561b986bab0..1632bb13ad8a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -15,11 +15,11 @@ * In practice this is far bigger than any realistic pointer offset; this limit * ensures that umax_value + (int)off + (int)size cannot overflow a u64. */ -#define BPF_MAX_VAR_OFF (1ULL << 31) +#define BPF_MAX_VAR_OFF (1 << 29) /* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO]. This ensures * that converting umax_value to int cannot overflow. */ -#define BPF_MAX_VAR_SIZ INT_MAX +#define BPF_MAX_VAR_SIZ (1 << 29) /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that -- cgit v1.2.3 From 513674b5a2c9c7a67501506419da5c3c77ac6f08 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 20 Dec 2017 12:10:21 -0800 Subject: net: reevalulate autoflowlabel setting after sysctl setting sysctl.ip6.auto_flowlabels is default 1. In our hosts, we set it to 2. If sockopt doesn't set autoflowlabel, outcome packets from the hosts are supposed to not include flowlabel. This is true for normal packet, but not for reset packet. The reason is ipv6_pinfo.autoflowlabel is set in sock creation. Later if we change sysctl.ip6.auto_flowlabels, the ipv6_pinfo.autoflowlabel isn't changed, so the sock will keep the old behavior in terms of auto flowlabel. Reset packet is suffering from this problem, because reset packet is sent from a special control socket, which is created at boot time. 
Since sysctl.ipv6.auto_flowlabels is 1 by default, the control socket will always have its ipv6_pinfo.autoflowlabel set, even after user set sysctl.ipv6.auto_flowlabels to 2, so reset packets will always have flowlabel. Normal sock created before sysctl setting suffers from the same issue. We can't even turn off autoflowlabel unless we kill all socks in the hosts. To fix this, if IPV6_AUTOFLOWLABEL sockopt is used, we use the autoflowlabel setting from user, otherwise we always call ip6_default_np_autolabel() which has the new settings of sysctl. Note, this changes behavior a little bit. Before commit 42240901f7c4 (ipv6: Implement different admin modes for automatic flow labels), the autoflowlabel behavior of a sock isn't sticky, e.g., if sysctl changes, existing connection will change autoflowlabel behavior. After that commit, autoflowlabel behavior is sticky in the whole life of the sock. With this patch, the behavior is once again not sticky. Cc: Martin KaFai Lau Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: Shaohua Li Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index cb18c6290ca8..8415bf1a9776 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -273,7 +273,8 @@ struct ipv6_pinfo { * 100: prefer care-of address */ dontfrag:1, - autoflowlabel:1; + autoflowlabel:1, + autoflowlabel_set:1; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; -- cgit v1.2.3 From 71a0ff65a21bf3e2c4fde208c4a635ed2bbb4e81 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Thu, 21 Dec 2017 17:38:26 +0200 Subject: IB/mlx5: Fix congestion counters in LAG mode Congestion counters are counted and queried per physical function. When working in LAG mode, CNP packets can be sent or received on both of the functions, thus congestion counters should be aggregated from the two physical functions.
Fixes: e1f24a79f424 ("IB/mlx5: Support congestion related counters") Signed-off-by: Majd Dibbiny Reviewed-by: Aviv Heller Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a886b51511ab..8846919356ca 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1164,6 +1164,10 @@ int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev); int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); +int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, + u64 *values, + int num_counters, + size_t *offsets); struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev); void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up); -- cgit v1.2.3 From aa8c6248f8c75acfd610fe15d8cae23cf70d9d09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:36 +0100 Subject: x86/mm/pti: Add infrastructure for page table isolation Add the initial files for kernel page table isolation, with a minimal init function and the boot time detection for this misfeature. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar --- include/linux/pti.h | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 include/linux/pti.h (limited to 'include/linux') diff --git a/include/linux/pti.h b/include/linux/pti.h new file mode 100644 index 000000000000..0174883a935a --- /dev/null +++ b/include/linux/pti.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _INCLUDE_PTI_H +#define _INCLUDE_PTI_H + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +#include +#else +static inline void pti_init(void) { } +#endif + +#endif -- cgit v1.2.3 From 39c3fd58952d7599d367c84c1330b785d91d6088 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 2 Dec 2017 18:11:04 +0100 Subject: kernel/irq: Extend lockdep class for request mutex The IRQ code already has support for lockdep class for the lock mutex in an interrupt descriptor. Extend this to add a second class for the request mutex in the descriptor. Not having a class is resulting in false positive splats in some code paths. Signed-off-by: Andrew Lunn Signed-off-by: Thomas Gleixner Acked-by: linus.walleij@linaro.org Cc: grygorii.strashko@ti.com Cc: f.fainelli@gmail.com Link: https://lkml.kernel.org/r/1512234664-21555-1-git-send-email-andrew@lunn.ch --- include/linux/gpio/driver.h | 33 +++++++++++++++++++++------------ include/linux/irqdesc.h | 9 ++++++--- 2 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 55e672592fa9..7258cd676df4 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -66,9 +66,10 @@ struct gpio_irq_chip { /** * @lock_key: * - * Per GPIO IRQ chip lockdep class. + * Per GPIO IRQ chip lockdep classes. 
*/ struct lock_class_key *lock_key; + struct lock_class_key *request_key; /** * @parent_handler: @@ -323,7 +324,8 @@ extern const char *gpiochip_is_requested(struct gpio_chip *chip, /* add/remove chips */ extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, - struct lock_class_key *lock_key); + struct lock_class_key *lock_key, + struct lock_class_key *request_key); /** * gpiochip_add_data() - register a gpio_chip @@ -350,11 +352,13 @@ extern int gpiochip_add_data_with_key(struct gpio_chip *chip, void *data, */ #ifdef CONFIG_LOCKDEP #define gpiochip_add_data(chip, data) ({ \ - static struct lock_class_key key; \ - gpiochip_add_data_with_key(chip, data, &key); \ + static struct lock_class_key lock_key; \ + static struct lock_class_key request_key; \ + gpiochip_add_data_with_key(chip, data, &lock_key, \ + &request_key); \ }) #else -#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL) +#define gpiochip_add_data(chip, data) gpiochip_add_data_with_key(chip, data, NULL, NULL) #endif static inline int gpiochip_add(struct gpio_chip *chip) @@ -429,7 +433,8 @@ int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, irq_flow_handler_t handler, unsigned int type, bool threaded, - struct lock_class_key *lock_key); + struct lock_class_key *lock_key, + struct lock_class_key *request_key); #ifdef CONFIG_LOCKDEP @@ -445,10 +450,12 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, irq_flow_handler_t handler, unsigned int type) { - static struct lock_class_key key; + static struct lock_class_key lock_key; + static struct lock_class_key request_key; return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, false, &key); + handler, type, false, + &lock_key, &request_key); } static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, @@ -458,10 +465,12 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, unsigned int type) { - static struct lock_class_key 
key; + static struct lock_class_key lock_key; + static struct lock_class_key request_key; return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, true, &key); + handler, type, true, + &lock_key, &request_key); } #else static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, @@ -471,7 +480,7 @@ static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, unsigned int type) { return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, false, NULL); + handler, type, false, NULL, NULL); } static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, @@ -481,7 +490,7 @@ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, unsigned int type) { return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, - handler, type, true, NULL); + handler, type, true, NULL, NULL); } #endif /* CONFIG_LOCKDEP */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 39fb3700f7a9..25b33b664537 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -255,12 +255,15 @@ static inline bool irq_is_percpu_devid(unsigned int irq) } static inline void -irq_set_lockdep_class(unsigned int irq, struct lock_class_key *class) +irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class) { struct irq_desc *desc = irq_to_desc(irq); - if (desc) - lockdep_set_class(&desc->lock, class); + if (desc) { + lockdep_set_class(&desc->lock, lock_class); + lockdep_set_class(&desc->request_mutex, request_class); + } } #ifdef CONFIG_IRQ_PREFLOW_FASTEOI -- cgit v1.2.3 From 466a2b42d67644447a1765276259a3ea5531ddff Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 21 Dec 2017 02:22:45 +0100 Subject: cpufreq: schedutil: Use idle_calls counter of the remote CPU Since the recent remote cpufreq callback work, its possible that a cpufreq update is triggered from a remote CPU. 
For single policies however, the current code uses the local CPU when trying to determine if the remote sg_cpu entered idle or is busy. This is incorrect. To remedy this, compare with the nohz tick idle_calls counter of the remote CPU. Fixes: 674e75411fc2 (sched: cpufreq: Allow remote cpufreq callbacks) Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Joel Fernandes Cc: 4.14+ # 4.14+ Signed-off-by: Rafael J. Wysocki --- include/linux/tick.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index f442d1a42025..7cc35921218e 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -119,6 +119,7 @@ extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern unsigned long tick_nohz_get_idle_calls(void); +extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ -- cgit v1.2.3 From 69790ba92b8d67eaee5e50b30a5b696d40664caf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 16:44:34 +0100 Subject: genirq: Introduce IRQD_CAN_RESERVE flag Add a new flag to mark interrupts which can use reservation mode. This is going to be used in subsequent patches to disable reservation mode for a certain class of MSI devices. Signed-off-by: Thomas Gleixner Tested-by: Alexandru Chirvasitu Tested-by: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. 
Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org --- include/linux/irq.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index e140f69163b6..a0231e96a578 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -212,6 +212,7 @@ struct irq_data { * mask. Applies only to affinity managed irqs. * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set + * IRQD_CAN_RESERVE - Can use reservation mode */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -233,6 +234,7 @@ enum { IRQD_MANAGED_SHUTDOWN = (1 << 23), IRQD_SINGLE_TARGET = (1 << 24), IRQD_DEFAULT_TRIGGER_SET = (1 << 25), + IRQD_CAN_RESERVE = (1 << 26), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -377,6 +379,21 @@ static inline bool irqd_is_managed_and_shutdown(struct irq_data *d) return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN; } +static inline void irqd_set_can_reserve(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_CAN_RESERVE; +} + +static inline void irqd_clr_can_reserve(struct irq_data *d) +{ + __irqd_to_state(d) &= ~IRQD_CAN_RESERVE; +} + +static inline bool irqd_can_reserve(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_CAN_RESERVE; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) -- cgit v1.2.3 From 702cb0a02813299d6911b775c637906ae21b737d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Dec 2017 16:59:06 +0100 Subject: genirq/irqdomain: Rename early argument of irq_domain_activate_irq() The 'early' argument of irq_domain_activate_irq() is actually 
used to denote reservation mode. To avoid confusion, rename it before abuse happens. No functional change. Fixes: 72491643469a ("genirq/irqdomain: Update irq_domain_ops.activate() signature") Signed-off-by: Thomas Gleixner Cc: Alexandru Chirvasitu Cc: Andy Shevchenko Cc: Dou Liyang Cc: Pavel Machek Cc: Maciej W. Rozycki Cc: Mikael Pettersson Cc: Josh Poulson Cc: Mihai Costache Cc: Stephen Hemminger Cc: Marc Zyngier Cc: linux-pci@vger.kernel.org Cc: Haiyang Zhang Cc: Dexuan Cui Cc: Simon Xiao Cc: Saeed Mahameed Cc: Jork Loeser Cc: Bjorn Helgaas Cc: devel@linuxdriverproject.org Cc: KY Srinivasan Cc: Alan Cox Cc: Sakari Ailus , Cc: linux-media@vger.kernel.org --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index a34355d19546..48c7e86bb556 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -113,7 +113,7 @@ struct irq_domain_ops { unsigned int nr_irqs, void *arg); void (*free)(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs); - int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool early); + int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool reserve); void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type); -- cgit v1.2.3 From 26456f87aca7157c057de65c9414b37f1ab881d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Dec 2017 21:37:25 +0100 Subject: timers: Reinitialize per cpu bases on hotplug The timer wheel bases are not (re)initialized on CPU hotplug. That leaves them with a potentially stale clk and next_expiry value, which can cause trouble when the CPU is plugged. Add a prepare callback which forwards the clock, sets next_expiry to far in the future and resets the control flags to a known state.
Set base->must_forward_clk so the first timer which is queued will try to forward the clock to current jiffies. Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Sebastian Siewior Cc: Anna-Maria Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272152200.2431@nanos --- include/linux/cpuhotplug.h | 2 +- include/linux/timer.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 201ab7267986..1a32e558eb11 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -86,7 +86,7 @@ enum cpuhp_state { CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, - CPUHP_TIMERS_DEAD, + CPUHP_TIMERS_PREPARE, CPUHP_MIPS_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, diff --git a/include/linux/timer.h b/include/linux/timer.h index 04af640ea95b..2448f9cc48a3 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -207,9 +207,11 @@ unsigned long round_jiffies_up(unsigned long j); unsigned long round_jiffies_up_relative(unsigned long j); #ifdef CONFIG_HOTPLUG_CPU +int timers_prepare_cpu(unsigned int cpu); int timers_dead_cpu(unsigned int cpu); #else -#define timers_dead_cpu NULL +#define timers_prepare_cpu NULL +#define timers_dead_cpu NULL #endif #endif -- cgit v1.2.3