From d7dcf26ff0ffd7b56fe2b09ed7f1867589f3cdf1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 Mar 2019 23:48:21 +0100 Subject: softirq: Remove tasklet_hrtimer There are no more users of this interface. Remove it. Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: David S. Miller Cc: netdev@vger.kernel.org Link: https://lkml.kernel.org/r/20190301224821.29843-4-bigeasy@linutronix.de --- include/linux/interrupt.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 690b238a44d5..c7eef32e7739 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -668,31 +668,6 @@ extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); -struct tasklet_hrtimer { - struct hrtimer timer; - struct tasklet_struct tasklet; - enum hrtimer_restart (*function)(struct hrtimer *); -}; - -extern void -tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t which_clock, enum hrtimer_mode mode); - -static inline -void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, - const enum hrtimer_mode mode) -{ - hrtimer_start(&ttimer->timer, time, mode); -} - -static inline -void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer) -{ - hrtimer_cancel(&ttimer->timer); - tasklet_kill(&ttimer->tasklet); -} - /* * Autoprobing for irqs: * -- cgit v1.2.3 From 1b72d43237980eab9b6ae6bb8181e51c840377e6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Mar 2019 16:39:20 +0100 Subject: tick: Remove outgoing CPU from broadcast masks Valentin reported that unplugging a CPU occasionally results in a warning in the tick broadcast code which is triggered when an offline CPU is in the broadcast mask. This happens because the outgoing CPU is not removing itself from the broadcast masks, especially not from the broadcast_force_mask. The removal happens on the control CPU after the outgoing CPU is dead. It's a long standing issue, but the warning is harmless. Rework the hotplug mechanism so that the outgoing CPU removes itself from the broadcast masks after disabling interrupts and removing itself from the online mask. Reported-by: Valentin Schneider Signed-off-by: Thomas Gleixner Tested-by: Valentin Schneider Cc: Frederic Weisbecker Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1903211540180.1784@nanos.tec.linutronix.de --- include/linux/tick.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index 55388ab45fd4..76acb48acdb7 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -68,6 +68,12 @@ extern void tick_broadcast_control(enum tick_broadcast_mode mode); static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { } #endif /* BROADCAST */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) +extern void tick_offline_cpu(unsigned int cpu); +#else +static inline void tick_offline_cpu(unsigned int cpu) { } +#endif + #ifdef CONFIG_GENERIC_CLOCKEVENTS extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state); #else -- cgit v1.2.3 From b699cce1604e828f19c39845252626eb78cdf38a Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 11 Mar 2019 17:28:03 +0530 Subject: rcu: Do a single rhp->func read in rcu_head_after_call_rcu() The rcu_head_after_call_rcu() function reads the rhp->func pointer twice, which can result in a false-positive WARN_ON_ONCE() if the callback were passed to call_rcu() between the two reads. Although racing rcu_head_after_call_rcu() with call_rcu() is to be a dubious use case (the return value is not reliable in that case), intermittent and irreproducible warnings are also quite dubious. This commit therefore uses a single READ_ONCE() to pick up the value of rhp->func once, then tests that value twice, thus guaranteeing consistent processing within rcu_head_after_call_rcu()(). Neverthless, racing rcu_head_after_call_rcu() with call_rcu() is still a dubious use case. Signed-off-by: Neeraj Upadhyay [ paulmck: Add blank line after declaration per checkpatch.pl. ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6cdb1db776cf..922bb6848813 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -878,9 +878,11 @@ static inline void rcu_head_init(struct rcu_head *rhp) static inline bool rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) { - if (READ_ONCE(rhp->func) == f) + rcu_callback_t func = READ_ONCE(rhp->func); + + if (func == f) return true; - WARN_ON_ONCE(READ_ONCE(rhp->func) != (rcu_callback_t)~0L); + WARN_ON_ONCE(func != (rcu_callback_t)~0L); return false; } -- cgit v1.2.3 From f5ad3991493c69d203d42b94d32349b54c58a3f1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Feb 2019 13:54:37 -0800 Subject: srcu: Remove cleanup_srcu_struct_quiesced() The cleanup_srcu_struct_quiesced() function was added because NVME used WQ_MEM_RECLAIM workqueues and SRCU did not, which meant that NVME workqueues waiting on SRCU workqueues could result in deadlocks during low-memory conditions. However, SRCU now also has WQ_MEM_RECLAIM workqueues, so there is no longer a potential for deadlock. Furthermore, it turns out to be extremely hard to use cleanup_srcu_struct_quiesced() correctly due to the fact that SRCU callback invocation accesses the srcu_struct structure's per-CPU data area just after callbacks are invoked. Therefore, the usual practice of using srcu_barrier() to wait for callbacks to be invoked before invoking cleanup_srcu_struct_quiesced() fails because SRCU's callback-invocation workqueue handler might be delayed, which can result in cleanup_srcu_struct_quiesced() being invoked (and thus freeing the per-CPU data) before the SRCU's callback-invocation workqueue handler is finished using that per-CPU data. Nor is this a theoretical problem: KASAN emitted use-after-free warnings because of this problem on actual runs. In short, NVME can now safely invoke cleanup_srcu_struct(), which avoids the use-after-free scenario. And cleanup_srcu_struct_quiesced() is quite difficult to use safely. This commit therefore removes cleanup_srcu_struct_quiesced(), switching its sole user back to cleanup_srcu_struct(). This effectively reverts the following pair of commits: f7194ac32ca2 ("srcu: Add cleanup_srcu_struct_quiesced()") 4317228ad9b8 ("nvme: Avoid flush dependency in delete controller flow") Reported-by: Bart Van Assche Signed-off-by: Paul E. McKenney Reviewed-by: Bart Van Assche Tested-by: Bart Van Assche --- include/linux/srcu.h | 36 +----------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index c495b2d51569..e432cc92c73d 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -56,45 +56,11 @@ struct srcu_struct { }; void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void (*func)(struct rcu_head *head)); -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced); +void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); void synchronize_srcu(struct srcu_struct *ssp); -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @ssp: structure to clean up. - * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. - */ -static inline void cleanup_srcu_struct(struct srcu_struct *ssp) -{ - _cleanup_srcu_struct(ssp, false); -} - -/** - * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure - * @ssp: structure to clean up. - * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. Also, - * all grace-period processing must have completed. - * - * "Completed" means that the last synchronize_srcu() and - * synchronize_srcu_expedited() calls must have returned before the call - * to cleanup_srcu_struct_quiesced(). It also means that the callback - * from the last call_srcu() must have been invoked before the call to - * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help - * with this last. Violating these rules will get you a WARN_ON() splat - * (with high probability, anyway), and will also cause the srcu_struct - * to be leaked. - */ -static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *ssp) -{ - _cleanup_srcu_struct(ssp, true); -} - #ifdef CONFIG_DEBUG_LOCK_ALLOC /** -- cgit v1.2.3 From 7f07e5f1f778605e98cf2156d4db1ff3a3a1a74a Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Tue, 26 Mar 2019 11:48:57 +0200 Subject: net: mii: Fix PAUSE cap advertisement from linkmode_adv_to_lcl_adv_t() helper With a recent link mode advertisement code update this helper providing local pause capability translation used for flow control link mode negotiation got broken. For eth drivers using this helper, the issue is apparent only if either PAUSE or ASYM_PAUSE is being advertised. Fixes: 3c1bcc8614db ("net: ethernet: Convert phydev advertize and supported from u32 to link mode") Signed-off-by: Claudiu Manoil Signed-off-by: David S. Miller --- include/linux/mii.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index 6fee8b1a4400..5cd824c1c0ca 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -469,7 +469,7 @@ static inline u32 linkmode_adv_to_lcl_adv_t(unsigned long *advertising) if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_CAP; - if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_ASYM; -- cgit v1.2.3 From 7a8e61f8478639072d402a26789055a4a4de8f77 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Mar 2019 11:36:19 +0100 Subject: timekeeping: Force upper bound for setting CLOCK_REALTIME Several people reported testing failures after setting CLOCK_REALTIME close to the limits of the kernel internal representation in nanoseconds, i.e. year 2262. The failures are exposed in subsequent operations, i.e. when arming timers or when the advancing CLOCK_MONOTONIC makes the calculation of CLOCK_REALTIME overflow into negative space. Now people start to paper over the underlying problem by clamping calculations to the valid range, but that's just wrong because such workarounds will prevent detection of real issues as well. It is reasonable to force an upper bound for the various methods of setting CLOCK_REALTIME. Year 2262 is the absolute upper bound. Assume a maximum uptime of 30 years which is plenty enough even for esoteric embedded systems. That results in an upper bound of year 2232 for setting the time. Once that limit is reached in reality this limit is only a small part of the problem space. But until then this stops people from trying to paper over the problem at the wrong places. Reported-by: Xiongfeng Wang Reported-by: Hongbo Yao Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Cc: Miroslav Lichvar Cc: Arnd Bergmann Cc: Richard Cochran Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1903231125480.2157@nanos.tec.linutronix.de --- include/linux/time64.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index f38d382ffec1..a620ee610b9f 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -33,6 +33,17 @@ struct itimerspec64 { #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +/* + * Limits for settimeofday(): + * + * To prevent setting the time close to the wraparound point time setting + * is limited so a reasonable uptime can be accomodated. Uptime of 30 years + * should be really sufficient, which means the cutoff is 2232. At that + * point the cutoff is just a small part of the larger problem. + */ +#define TIME_UPTIME_SEC_MAX (30LL * 365 * 24 *3600) +#define TIME_SETTOD_SEC_MAX (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX) + static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { @@ -100,6 +111,16 @@ static inline bool timespec64_valid_strict(const struct timespec64 *ts) return true; } +static inline bool timespec64_valid_settod(const struct timespec64 *ts) +{ + if (!timespec64_valid(ts)) + return false; + /* Disallow values which cause overflow issues vs. CLOCK_REALTIME */ + if ((unsigned long long)ts->tv_sec >= TIME_SETTOD_SEC_MAX) + return false; + return true; +} + /** * timespec64_to_ns - Convert timespec64 to nanoseconds * @ts: pointer to the timespec64 variable to be converted -- cgit v1.2.3 From 0fca08122eaf5c956a2cbe12775245d747f8b1ac Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 28 Mar 2019 20:34:28 +0100 Subject: efi: Unify DMI setup code over the arm/arm64, ia64 and x86 architectures All architectures (arm/arm64, ia64 and x86) do the same here, so unify the code. Note: We do not need to call dump_stack_set_arch_desc() in case of !dmi_available. Both strings, dmi_ids_string and dump_stack_arch_ desc_str are initialized zero and thus nothing would change. Signed-off-by: Robert Richter Signed-off-by: Ard Biesheuvel Reviewed-by: Jean Delvare Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20190328193429.21373-5-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- include/linux/dmi.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index c46fdb36700b..8de8c4f15163 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -102,9 +102,7 @@ const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list); extern const char * dmi_get_system_info(int field); extern const struct dmi_device * dmi_find_device(int type, const char *name, const struct dmi_device *from); -extern void dmi_scan_machine(void); -extern void dmi_memdev_walk(void); -extern void dmi_set_dump_stack_arch_desc(void); +extern void dmi_setup(void); extern bool dmi_get_date(int field, int *yearp, int *monthp, int *dayp); extern int dmi_get_bios_year(void); extern int dmi_name_in_vendors(const char *str); @@ -122,9 +120,7 @@ static inline int dmi_check_system(const struct dmi_system_id *list) { return 0; static inline const char * dmi_get_system_info(int field) { return NULL; } static inline const struct dmi_device * dmi_find_device(int type, const char *name, const struct dmi_device *from) { return NULL; } -static inline void dmi_scan_machine(void) { return; } -static inline void dmi_memdev_walk(void) { } -static inline void dmi_set_dump_stack_arch_desc(void) { } +static inline void dmi_setup(void) { } static inline bool dmi_get_date(int field, int *yearp, int *monthp, int *dayp) { if (yearp) -- cgit v1.2.3 From 80a2a9026b24c6bd34b8d58256973e22270bedec Mon Sep 17 00:00:00 2001 From: Yuval Avnery Date: Mon, 11 Mar 2019 06:18:24 +0200 Subject: net/mlx5e: Add a lock on tir list Refresh tirs is looping over a global list of tirs while netdevs are adding and removing tirs from that list. That is why a lock is required. Fixes: 724b2aa15126 ("net/mlx5e: TIRs management refactoring") Signed-off-by: Yuval Avnery Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 022541dc5dbf..0d0729648844 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -594,6 +594,8 @@ enum mlx5_pagefault_type_flags { }; struct mlx5_td { + /* protects tirs list changes while tirs refresh */ + struct mutex list_lock; struct list_head tirs_list; u32 tdn; }; -- cgit v1.2.3 From 817b4d64da036f5559297a2fdb82b8b14f4ffdcd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 18 Mar 2019 23:00:54 +0300 Subject: ACPI / utils: Introduce acpi_dev_get_first_match_dev() helper The acpi_dev_get_first_match_name() is missing put_device() call and thus keeping reference counting unbalanced. In order to fix the issue introduce a new helper to convert existing users one-by-one to a better API. Signed-off-by: Andy Shevchenko Reviewed-by: Hans de Goede Reviewed-by: Mika Westerberg Acked-by: Mark Brown Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d5dcebd7aad3..3e1d16b00513 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -669,6 +669,12 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) return false; } +static inline struct acpi_device * +acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv) +{ + return NULL; +} + static inline const char * acpi_dev_get_first_match_name(const char *hid, const char *uid, s64 hrv) { -- cgit v1.2.3 From 257f9053c0204ea47491aa236004fd1226f75fa8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 28 Mar 2019 19:17:29 +0200 Subject: ACPI / utils: Remove deprecated function since no user left There is no more user of acpi_dev_get_first_match_name(), which is deprecated and has no user left, so, remove it for good. Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3e1d16b00513..392413075cc0 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -675,12 +675,6 @@ acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv) return NULL; } -static inline const char * -acpi_dev_get_first_match_name(const char *hid, const char *uid, s64 hrv) -{ - return NULL; -} - static inline bool is_acpi_node(struct fwnode_handle *fwnode) { return false; -- cgit v1.2.3 From 5a25e3f7cc538fb49e11267c1e41c54ccf83835e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Mar 2019 12:15:13 +0100 Subject: cpufreq: intel_pstate: Driver-specific handling of _PPC updates In some cases, the platform firmware disables or enables turbo frequencies for all CPUs globally before triggering a _PPC change notification for one of them. Obviously, that global change affects all CPUs, not just the notified one, and it needs to be acted upon by cpufreq. The intel_pstate driver is able to detect such global changes of the settings, but it also needs to update policy limits for all CPUs if that happens, in particular if turbo frequencies are enabled globally - to allow them to be used. For this reason, introduce a new cpufreq driver callback to be invoked on _PPC notifications, if present, instead of simply calling cpufreq_update_policy() for the notified CPU and make intel_pstate use it to trigger policy updates for all CPUs in the system if global settings change. Link: https://bugzilla.kernel.org/show_bug.cgi?id=200759 Reported-by: Gabriele Mazzotta Tested-by: Gabriele Mazzotta Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- include/linux/cpufreq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b160e98076e3..5005ea40364f 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -195,6 +195,7 @@ void disable_cpufreq(void); u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); void cpufreq_update_policy(unsigned int cpu); +void cpufreq_update_limits(unsigned int cpu); bool have_governor_per_policy(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); void cpufreq_enable_fast_switch(struct cpufreq_policy *policy); @@ -322,6 +323,9 @@ struct cpufreq_driver { /* should be defined, if possible */ unsigned int (*get)(unsigned int cpu); + /* Called to update policy limits on firmware notifications. */ + void (*update_limits)(unsigned int cpu); + /* optional */ int (*bios_limit)(int cpu, unsigned int *limit); -- cgit v1.2.3 From b5dee3130bb4014511f5d0dd46855ed843e3fdc8 Mon Sep 17 00:00:00 2001 From: Harry Pan Date: Mon, 25 Feb 2019 20:36:41 +0800 Subject: PM / sleep: Refactor filesystems sync to reduce duplication Create a common helper to sync filesystems for system suspend and hibernation. Signed-off-by: Harry Pan Acked-by: Pavel Machek [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 3f529ad9a9d2..6b3ea9ea6a9e 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -425,6 +425,7 @@ void restore_processor_state(void); /* kernel/power/main.c */ extern int register_pm_notifier(struct notifier_block *nb); extern int unregister_pm_notifier(struct notifier_block *nb); +extern void ksys_sync_helper(void); #define pm_notifier(fn, pri) { \ static struct notifier_block fn##_nb = \ @@ -462,6 +463,8 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) return 0; } +static inline void ksys_sync_helper(void) {} + #define pm_notifier(fn, pri) do { (void)(fn); } while (0) static inline bool pm_wakeup_pending(void) { return false; } -- cgit v1.2.3 From de7b77e5bb9451417ca57f1b6501da654587c048 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 27 Mar 2019 07:00:29 -0500 Subject: cpu/hotplug: Create SMT sysfs interface for all arches Make the /sys/devices/system/cpu/smt/* files available on all arches, so user space has a consistent way to detect whether SMT is enabled. The 'control' file now shows 'notimplemented' for architectures which don't yet have CONFIG_HOTPLUG_SMT. [ tglx: Make notimplemented a real state ] Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Waiman Long Cc: Peter Zijlstra Cc: Jiri Kosina Link: https://lkml.kernel.org/r/469c2b98055f2c41e75748e06447d592a64080c9.1553635520.git.jpoimboe@redhat.com --- include/linux/cpu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 5041357d0297..ae99dde02320 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -175,6 +175,7 @@ enum cpuhp_smt_control { CPU_SMT_DISABLED, CPU_SMT_FORCE_DISABLED, CPU_SMT_NOT_SUPPORTED, + CPU_SMT_NOT_IMPLEMENTED, }; #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) @@ -182,7 +183,7 @@ extern enum cpuhp_smt_control cpu_smt_control; extern void cpu_smt_disable(bool force); extern void cpu_smt_check_topology(void); #else -# define cpu_smt_control (CPU_SMT_ENABLED) +# define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED) static inline void cpu_smt_disable(bool force) { } static inline void cpu_smt_check_topology(void) { } #endif -- cgit v1.2.3 From fad9fab975cb9fae651854c811cb07a30bc2b98a Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Tue, 2 Apr 2019 17:40:56 +0200 Subject: EDAC/altera, firmware/intel: Add Stratix10 ECC DBE SMC call Reserve ECC Double Bit Error SMC call to alert U-Boot that a DBE has occurred. Move the call from local EDAC header file to a common header. [ bp: Merge the two patches. ] Signed-off-by: Thor Thayer Signed-off-by: Borislav Petkov Reviewed-by: Richard Gong Reviewed-by: Alan Tull # firmware Cc: Greg KH Cc: James Morse Cc: linux-edac Cc: mchehab@kernel.org Link: https://lkml.kernel.org/r/1553870639-23895-1-git-send-email-thor.thayer@linux.intel.com Signed-off-by: Borislav Petkov --- include/linux/firmware/intel/stratix10-smc.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h index 5be5dab50b13..01684d935580 100644 --- a/include/linux/firmware/intel/stratix10-smc.h +++ b/include/linux/firmware/intel/stratix10-smc.h @@ -309,4 +309,23 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) #define INTEL_SIP_SMC_FUNCID_RSU_UPDATE 12 #define INTEL_SIP_SMC_RSU_UPDATE \ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_RSU_UPDATE) + +/* + * Request INTEL_SIP_SMC_ECC_DBE + * + * Sync call used by service driver at EL1 to alert EL3 that a Double + * Bit ECC error has occurred. + * + * Call register usage: + * a0 INTEL_SIP_SMC_ECC_DBE + * a1 SysManager Double Bit Error value + * a2-7 not used + * + * Return status + * a0 INTEL_SIP_SMC_STATUS_OK + */ +#define INTEL_SIP_SMC_FUNCID_ECC_DBE 13 +#define INTEL_SIP_SMC_ECC_DBE \ + INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_ECC_DBE) + #endif -- cgit v1.2.3 From 37686b1353cfc30e127cef811959cdbcd0495d98 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 7 Mar 2019 11:48:02 -0600 Subject: tracing: Improve "if" macro code generation With CONFIG_PROFILE_ALL_BRANCHES=y, the "if" macro converts the conditional to an array index. This can cause GCC to create horrible code. When there are nested ifs, the generated code uses register values to encode branching decisions. Make it easier for GCC to optimize by keeping the conditional as a conditional rather than converting it to an integer. This shrinks the generated code quite a bit, and also makes the code sane enough for objtool to understand. Reported-by: Peter Zijlstra Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Linus Torvalds Cc: Thomas Gleixner Cc: brgerst@gmail.com Cc: catalin.marinas@arm.com Cc: dvlasenk@redhat.com Cc: dvyukov@google.com Cc: hpa@zytor.com Cc: james.morse@arm.com Cc: julien.thierry@arm.com Cc: luto@amacapital.net Cc: luto@kernel.org Cc: rostedt@goodmis.org Cc: valentin.schneider@arm.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190307174802.46fmpysxyo35hh43@treble Signed-off-by: Ingo Molnar --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 445348facea9..d58aa0db05f9 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -67,7 +67,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, .line = __LINE__, \ }; \ ______r = !!(cond); \ - ______f.miss_hit[______r]++; \ + ______r ? ______f.miss_hit[1]++ : ______f.miss_hit[0]++;\ ______r; \ })) #endif /* CONFIG_PROFILE_ALL_BRANCHES */ -- cgit v1.2.3 From e74deb11931ff682b59d5b9d387f7115f689698e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Apr 2019 09:39:48 +0200 Subject: x86/uaccess: Introduce user_access_{save,restore}() Introduce common helpers for when we need to safely suspend a uaccess section; for instance to generate a {KA,UB}SAN report. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 37b226e8df13..2b70130af585 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -268,6 +268,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); #define user_access_end() do { } while (0) #define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) #define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) +static inline unsigned long user_access_save(void) { return 0UL; } +static inline void user_access_restore(unsigned long flags) { } #endif #ifdef CONFIG_HARDENED_USERCOPY -- cgit v1.2.3 From 1279e41d535e28cc3b56fa4a09e71a709641cae6 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Wed, 3 Apr 2019 14:54:24 +0800 Subject: perf/headers: Fix stale comment for struct perf_addr_filter The @inode field has been removed after: 9511bce9fe8e ("perf/core: Fix bad use of igrab()") Update the description. Signed-off-by: Shaokun Zhang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/1554274464-5739-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e47ef764f613..085a95e2582a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -464,7 +464,7 @@ enum perf_addr_filter_action_t { /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage - * @inode: object file's inode for file-based filters + * @path: object file's path for file-based filters * @offset: filter range offset * @size: filter range size (size==0 means single address trigger) * @action: filter/start/stop -- cgit v1.2.3 From a0fe2c6479aab5723239b315ef1b552673f434a3 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 29 Mar 2019 22:46:49 +0100 Subject: linux/kernel.h: Use parentheses around argument in u64_to_user_ptr() Use parentheses around uses of the argument in u64_to_user_ptr() to ensure that the cast doesn't apply to part of the argument. There are existing uses of the macro of the form u64_to_user_ptr(A + B) which expands to (void __user *)(uintptr_t)A + B (the cast applies to the first operand of the addition, the addition is a pointer addition). This happens to still work as intended, the semantic difference doesn't cause a difference in behavior. But I want to use u64_to_user_ptr() with a ternary operator in the argument, like so: u64_to_user_ptr(A ? B : C) This currently doesn't work as intended. Signed-off-by: Jann Horn Signed-off-by: Borislav Petkov Reviewed-by: Mukesh Ojha Cc: Andrei Vagin Cc: Andrew Morton Cc: Dan Carpenter Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jani Nikula Cc: Kees Cook Cc: Masahiro Yamada Cc: NeilBrown Cc: Peter Zijlstra Cc: Qiaowei Ren Cc: Thomas Gleixner Cc: x86-ml Link: https://lkml.kernel.org/r/20190329214652.258477-1-jannh@google.com --- include/linux/kernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 34a5036debd3..2d14e21c16c0 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -47,8 +47,8 @@ #define u64_to_user_ptr(x) ( \ { \ - typecheck(u64, x); \ - (void __user *)(uintptr_t)x; \ + typecheck(u64, (x)); \ + (void __user *)(uintptr_t)(x); \ } \ ) -- cgit v1.2.3 From 994aeb7a93e43d28f6074195ccb03a384342e1bf Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 20 Mar 2019 20:34:24 -0400 Subject: sched_domain: Annotate RCU pointers properly The scheduler uses RCU API in various places to access sched_domain pointers. These cause sparse errors as below. Many new errors show up because of an annotation check I added to rcu_assign_pointer(). Let us annotate the pointers correctly which also will help sparse catch any potential future bugs. This fixes the following sparse errors: rt.c:1681:9: error: incompatible types in comparison expression deadline.c:1904:9: error: incompatible types in comparison expression core.c:519:9: error: incompatible types in comparison expression core.c:1634:17: error: incompatible types in comparison expression fair.c:6193:14: error: incompatible types in comparison expression fair.c:9883:22: error: incompatible types in comparison expression fair.c:9897:9: error: incompatible types in comparison expression sched.h:1287:9: error: incompatible types in comparison expression topology.c:612:9: error: incompatible types in comparison expression topology.c:615:9: error: incompatible types in comparison expression sched.h:1300:9: error: incompatible types in comparison expression topology.c:618:9: error: incompatible types in comparison expression sched.h:1287:9: error: incompatible types in comparison expression topology.c:621:9: error: incompatible types in comparison expression sched.h:1300:9: error: incompatible types in comparison expression topology.c:624:9: error: incompatible types in comparison expression topology.c:671:9: error: incompatible types in comparison expression stats.c:45:17: error: incompatible types in comparison expression fair.c:5998:15: error: incompatible types in comparison expression fair.c:5989:15: error: incompatible types in comparison expression fair.c:5998:15: error: incompatible types in comparison expression fair.c:5989:15: error: incompatible types in comparison expression fair.c:6120:19: error: incompatible types in comparison expression fair.c:6506:14: error: incompatible types in comparison expression fair.c:6515:14: error: incompatible types in comparison expression fair.c:6623:9: error: incompatible types in comparison expression fair.c:5970:17: error: incompatible types in comparison expression fair.c:8642:21: error: incompatible types in comparison expression fair.c:9253:9: error: incompatible types in comparison expression fair.c:9331:9: error: incompatible types in comparison expression fair.c:9519:15: error: incompatible types in comparison expression fair.c:9533:14: error: incompatible types in comparison expression fair.c:9542:14: error: incompatible types in comparison expression fair.c:9567:14: error: incompatible types in comparison expression fair.c:9597:14: error: incompatible types in comparison expression fair.c:9421:16: error: incompatible types in comparison expression fair.c:9421:16: error: incompatible types in comparison expression Signed-off-by: Joel Fernandes (Google) Signed-off-by: Peter Zijlstra (Intel) [ From an RCU perspective. ] Reviewed-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Luc Van Oostenryck Cc: Mathieu Desnoyers Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: keescook@chromium.org Cc: kernel-hardening@lists.openwall.com Cc: kernel-team@android.com Link: https://lkml.kernel.org/r/20190321003426.160260-3-joel@joelfernandes.org Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 57c7ed3fe465..cfc0a89a7159 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -76,8 +76,8 @@ struct sched_domain_shared { struct sched_domain { /* These fields must be setup */ - struct sched_domain *parent; /* top domain must be null terminated */ - struct sched_domain *child; /* bottom domain must be null terminated */ + struct sched_domain __rcu *parent; /* top domain must be null terminated */ + struct sched_domain __rcu *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ -- cgit v1.2.3 From 03f4b48edae7d936c5bf23488c1ce3fbe7fc2733 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 20 Mar 2019 20:34:25 -0400 Subject: rcuwait: Annotate task_struct with __rcu This suppresses sparse error generated due to the recently added rcu_assign_pointer sparse check. percpu-rwsem.c:162:9: sparse: error: incompatible types in comparison expression exit.c:316:16: sparse: error: incompatible types in comparison expression Signed-off-by: Joel Fernandes (Google) Signed-off-by: Peter Zijlstra (Intel) [ From an RCU perspective. ] Reviewed-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Luc Van Oostenryck Cc: Mathieu Desnoyers Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: keescook@chromium.org Cc: kernel-hardening@lists.openwall.com Cc: kernel-team@android.com Link: https://lkml.kernel.org/r/20190321003426.160260-4-joel@joelfernandes.org Signed-off-by: Ingo Molnar --- include/linux/rcuwait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h index 90bfa3279a01..563290fc194f 100644 --- a/include/linux/rcuwait.h +++ b/include/linux/rcuwait.h @@ -18,7 +18,7 @@ * awoken. */ struct rcuwait { - struct task_struct *task; + struct task_struct __rcu *task; }; #define __RCUWAIT_INITIALIZER(name) \ -- cgit v1.2.3 From 46ad0840b1584b92b5ff2cc3ed0b011dd6b8e0f1 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 22 Mar 2019 10:30:06 -0400 Subject: locking/rwsem: Remove arch specific rwsem files As the generic rwsem-xadd code is using the appropriate acquire and release versions of the atomic operations, the arch specific rwsem.h files will not be that much faster than the generic code as long as the atomic functions are properly implemented. So we can remove those arch specific rwsem.h and stop building asm/rwsem.h to reduce maintenance effort. Currently, only x86, alpha and ia64 have implemented architecture specific fast paths. I don't have access to alpha and ia64 systems for testing, but they are legacy systems that are not likely to be updated to the latest kernel anyway. By using a rwsem microbenchmark, the total locking rates on a 4-socket 56-core 112-thread x86-64 system before and after the patch were as follows (mixed means equal # of read and write locks): Before Patch After Patch # of Threads wlock rlock mixed wlock rlock mixed ------------ ----- ----- ----- ----- ----- ----- 1 29,201 30,143 29,458 28,615 30,172 29,201 2 6,807 13,299 1,171 7,725 15,025 1,804 4 6,504 12,755 1,520 7,127 14,286 1,345 8 6,762 13,412 764 6,826 13,652 726 16 6,693 15,408 662 6,599 15,938 626 32 6,145 15,286 496 5,549 15,487 511 64 5,812 15,495 60 5,858 15,572 60 There were some run-to-run variations for the multi-thread tests. For x86-64, using the generic C code fast path seems to be a little bit faster than the assembly version with low lock contention. Looking at the assembly version of the fast paths, there are assembly to/from C code wrappers that save and restore all the callee-clobbered registers (7 registers on x86-64). The assembly generated from the generic C code doesn't need to do that. That may explain the slight performance gain here. The generic asm rwsem.h can also be merged into kernel/locking/rwsem.h with no code change as no other code other than those under kernel/locking needs to access the internal rwsem macros and functions. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linuxppc-dev@lists.ozlabs.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190322143008.21313-2-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 67dbb57508b1..6e56006b2cb6 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -57,15 +57,13 @@ extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); -/* Include the arch specific part */ -#include - /* In all implementations count != 0 means locked */ static inline int rwsem_is_locked(struct rw_semaphore *sem) { return atomic_long_read(&sem->count) != 0; } +#define RWSEM_UNLOCKED_VALUE 0L #define __RWSEM_INIT_COUNT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) #endif -- cgit v1.2.3 From 390a0c62c23cb026cd4664a66f6f45fed3a215f6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 22 Mar 2019 10:30:07 -0400 Subject: locking/rwsem: Remove rwsem-spinlock.c & use rwsem-xadd.c for all archs Currently, we have two different implementation of rwsem: 1) CONFIG_RWSEM_GENERIC_SPINLOCK (rwsem-spinlock.c) 2) CONFIG_RWSEM_XCHGADD_ALGORITHM (rwsem-xadd.c) As we are going to use a single generic implementation for rwsem-xadd.c and no architecture-specific code will be needed, there is no point in keeping two different implementations of rwsem. In most cases, the performance of rwsem-spinlock.c will be worse. It also doesn't get all the performance tuning and optimizations that had been implemented in rwsem-xadd.c over the years. For simplication, we are going to remove rwsem-spinlock.c and make all architectures use a single implementation of rwsem - rwsem-xadd.c. All references to RWSEM_GENERIC_SPINLOCK and RWSEM_XCHGADD_ALGORITHM in the code are removed. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-riscv@lists.infradead.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linuxppc-dev@lists.ozlabs.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: uclinux-h8-devel@lists.sourceforge.jp Link: https://lkml.kernel.org/r/20190322143008.21313-3-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/rwsem-spinlock.h | 47 ------------------------------------------ include/linux/rwsem.h | 5 ----- 2 files changed, 52 deletions(-) delete mode 100644 include/linux/rwsem-spinlock.h (limited to 'include/linux') diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h deleted file mode 100644 index e47568363e5e..000000000000 --- a/include/linux/rwsem-spinlock.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* rwsem-spinlock.h: fallback C implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). - * - Derived partially from ideas by Andrea Arcangeli - * - Derived also from comments by Linus - */ - -#ifndef _LINUX_RWSEM_SPINLOCK_H -#define _LINUX_RWSEM_SPINLOCK_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include linux/rwsem-spinlock.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ -/* - * the rw-semaphore definition - * - if count is 0 then there are no active readers or writers - * - if count is +ve then that is the number of active readers - * - if count is -1 then there is one active writer - * - if wait_list is not empty, then there are processes waiting for the semaphore - */ -struct rw_semaphore { - __s32 count; - raw_spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - -#define RWSEM_UNLOCKED_VALUE 0x00000000 - -extern void __down_read(struct rw_semaphore *sem); -extern int __must_check __down_read_killable(struct rw_semaphore *sem); -extern int __down_read_trylock(struct rw_semaphore *sem); -extern void __down_write(struct rw_semaphore *sem); -extern int __must_check __down_write_killable(struct rw_semaphore *sem); -extern int __down_write_trylock(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); -extern void __up_write(struct rw_semaphore *sem); -extern void __downgrade_write(struct rw_semaphore *sem); -extern int rwsem_is_locked(struct rw_semaphore *sem); - -#endif /* __KERNEL__ */ -#endif /* _LINUX_RWSEM_SPINLOCK_H */ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 6e56006b2cb6..0fc41062c649 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -22,10 +22,6 @@ struct rw_semaphore; -#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK -#include /* use a generic implementation */ -#define __RWSEM_INIT_COUNT(name) .count = RWSEM_UNLOCKED_VALUE -#else /* All arch specific implementations share the same struct */ struct rw_semaphore { atomic_long_t count; @@ -65,7 +61,6 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) #define RWSEM_UNLOCKED_VALUE 0L #define __RWSEM_INIT_COUNT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) -#endif /* Common initializer macros and functions */ -- cgit v1.2.3 From 24e516049360eda85cf3fe9903221d43886c2689 Mon Sep 17 00:00:00 2001 From: Neil Leeder Date: Tue, 26 Mar 2019 15:17:50 +0000 Subject: ACPI/IORT: Add support for PMCG Add support for the SMMU Performance Monitor Counter Group information from ACPI. This is in preparation for its use in the SMMUv3 PMU driver. Signed-off-by: Neil Leeder Signed-off-by: Hanjun Guo Signed-off-by: Shameer Kolothum Reviewed-by: Robin Murphy Acked-by: Lorenzo Pieralisi Signed-off-by: Will Deacon --- include/linux/acpi_iort.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 38cd77b39a64..052ef7b9f985 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -26,6 +26,13 @@ #define IORT_IRQ_MASK(irq) (irq & 0xffffffffULL) #define IORT_IRQ_TRIGGER_MASK(irq) ((irq >> 32) & 0xffffffffULL) +/* + * PMCG model identifiers for use in smmu pmu driver. Please note + * that this is purely for the use of software and has nothing to + * do with hardware or with IORT specification. + */ +#define IORT_SMMU_V3_PMCG_GENERIC 0x00000000 /* Generic SMMUv3 PMCG */ + int iort_register_domain_token(int trans_id, phys_addr_t base, struct fwnode_handle *fw_node); void iort_deregister_domain_token(int trans_id); -- cgit v1.2.3 From 631b7abacd02b88f4b0795c08b54ad4fc3e7c7c0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 7 Nov 2016 16:26:35 -0500 Subject: ptrace: Remove maxargs from task_current_syscall() task_current_syscall() has a single user that passes in 6 for maxargs, which is the maximum arguments that can be used to get system calls from syscall_get_arguments(). Instead of passing in a number of arguments to grab, just get 6 arguments. The args argument even specifies that it's an array of 6 items. This will also allow changing syscall_get_arguments() to not get a variable number of arguments, but always grab 6. Linus also suggested not passing in a bunch of arguments to task_current_syscall() but to instead pass in a pointer to a structure, and just fill the structure. struct seccomp_data has almost all the parameters that is needed except for the stack pointer (sp). As seccomp_data is part of uapi, and I'm afraid to change it, a new structure was created "syscall_info", which includes seccomp_data and adds the "sp" field. Link: http://lkml.kernel.org/r/20161107213233.466776454@goodmis.org Cc: Andy Lutomirski Cc: Alexey Dobriyan Cc: Oleg Nesterov Cc: Kees Cook Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Steven Rostedt (VMware) --- include/linux/ptrace.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index edb9b040c94c..d5084ebd9f03 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -9,6 +9,13 @@ #include /* For BUG_ON. */ #include /* For task_active_pid_ns. */ #include +#include + +/* Add sp to seccomp_data, as seccomp is user API, we don't want to modify it */ +struct syscall_info { + __u64 sp; + struct seccomp_data data; +}; extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); @@ -407,9 +414,7 @@ static inline void user_single_step_report(struct pt_regs *regs) #define current_user_stack_pointer() user_stack_pointer(current_pt_regs()) #endif -extern int task_current_syscall(struct task_struct *target, long *callno, - unsigned long args[6], unsigned int maxargs, - unsigned long *sp, unsigned long *pc); +extern int task_current_syscall(struct task_struct *target, struct syscall_info *info); extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact); #endif -- cgit v1.2.3 From 24062fe85860debfdae0eeaa495f27c9971ec163 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 26 Mar 2019 15:17:53 +0000 Subject: perf/smmuv3: Enable HiSilicon Erratum 162001800 quirk HiSilicon erratum 162001800 describes the limitation of SMMUv3 PMCG implementation on HiSilicon Hip08 platforms. On these platforms, the PMCG event counter registers (SMMU_PMCG_EVCNTRn) are read only and as a result it is not possible to set the initial counter period value on event monitor start. To work around this, the current value of the counter is read and used for delta calculations. OEM information from ACPI header is used to identify the affected hardware platforms. Signed-off-by: Shameer Kolothum Reviewed-by: Hanjun Guo Reviewed-by: Robin Murphy Acked-by: Lorenzo Pieralisi [will: update silicon-errata.txt and add reason string to acpi match] Signed-off-by: Will Deacon --- include/linux/acpi_iort.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 052ef7b9f985..723e4dfa1c14 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -32,6 +32,7 @@ * do with hardware or with IORT specification. */ #define IORT_SMMU_V3_PMCG_GENERIC 0x00000000 /* Generic SMMUv3 PMCG */ +#define IORT_SMMU_V3_PMCG_HISI_HIP08 0x00000001 /* HiSilicon HIP08 PMCG */ int iort_register_domain_token(int trans_id, phys_addr_t base, struct fwnode_handle *fw_node); -- cgit v1.2.3 From 5f074f3e192f10c9fade898b9b3b8812e3d83342 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 5 Apr 2019 18:38:45 -0700 Subject: lib/string.c: implement a basic bcmp A recent optimization in Clang (r355672) lowers comparisons of the return value of memcmp against zero to comparisons of the return value of bcmp against zero. This helps some platforms that implement bcmp more efficiently than memcmp. glibc simply aliases bcmp to memcmp, but an optimized implementation is in the works. This results in linkage failures for all targets with Clang due to the undefined symbol. For now, just implement bcmp as a tailcail to memcmp to unbreak the build. This routine can be further optimized in the future. Other ideas discussed: * A weak alias was discussed, but breaks for architectures that define their own implementations of memcmp since aliases to declarations are not permitted (only definitions). Arch-specific memcmp implementations typically declare memcmp in C headers, but implement them in assembly. * -ffreestanding also is used sporadically throughout the kernel. * -fno-builtin-bcmp doesn't work when doing LTO. Link: https://bugs.llvm.org/show_bug.cgi?id=41035 Link: https://code.woboq.org/userspace/glibc/string/memcmp.c.html#bcmp Link: https://github.com/llvm/llvm-project/commit/8e16d73346f8091461319a7dfc4ddd18eedcff13 Link: https://github.com/ClangBuiltLinux/linux/issues/416 Link: http://lkml.kernel.org/r/20190313211335.165605-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Reported-by: Nathan Chancellor Reported-by: Adhemerval Zanella Suggested-by: Arnd Bergmann Suggested-by: James Y Knight Suggested-by: Masahiro Yamada Suggested-by: Nathan Chancellor Suggested-by: Rasmus Villemoes Acked-by: Steven Rostedt (VMware) Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Reviewed-by: Masahiro Yamada Reviewed-by: Andy Shevchenko Cc: David Laight Cc: Rasmus Villemoes Cc: Namhyung Kim Cc: Greg Kroah-Hartman Cc: Alexander Shishkin Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 7927b875f80c..6ab0a6fa512e 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -150,6 +150,9 @@ extern void * memscan(void *,int,__kernel_size_t); #ifndef __HAVE_ARCH_MEMCMP extern int memcmp(const void *,const void *,__kernel_size_t); #endif +#ifndef __HAVE_ARCH_BCMP +extern int bcmp(const void *,const void *,__kernel_size_t); +#endif #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif -- cgit v1.2.3 From 6147e136ff5071609b54f18982dea87706288e21 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Apr 2019 18:38:53 -0700 Subject: include/linux/bitrev.h: fix constant bitrev clang points out with hundreds of warnings that the bitrev macros have a problem with constant input: drivers/hwmon/sht15.c:187:11: error: variable '__x' is uninitialized when used within its own initialization [-Werror,-Wuninitialized] u8 crc = bitrev8(data->val_status & 0x0F); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/bitrev.h:102:21: note: expanded from macro 'bitrev8' __constant_bitrev8(__x) : \ ~~~~~~~~~~~~~~~~~~~^~~~ include/linux/bitrev.h:67:11: note: expanded from macro '__constant_bitrev8' u8 __x = x; \ ~~~ ^ Both the bitrev and the __constant_bitrev macros use an internal variable named __x, which goes horribly wrong when passing one to the other. The obvious fix is to rename one of the variables, so this adds an extra '_'. It seems we got away with this because - there are only a few drivers using bitrev macros - usually there are no constant arguments to those - when they are constant, they tend to be either 0 or (unsigned)-1 (drivers/isdn/i4l/isdnhdlc.o, drivers/iio/amplifiers/ad8366.c) and give the correct result by pure chance. In fact, the only driver that I could find that gets different results with this is drivers/net/wan/slic_ds26522.c, which in turn is a driver for fairly rare hardware (adding the maintainer to Cc for testing). Link: http://lkml.kernel.org/r/20190322140503.123580-1-arnd@arndb.de Fixes: 556d2f055bf6 ("ARM: 8187/1: add CONFIG_HAVE_ARCH_BITREVERSE to support rbit instruction") Signed-off-by: Arnd Bergmann Reviewed-by: Nick Desaulniers Cc: Zhao Qiang Cc: Yalin Wang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitrev.h | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h index 50fb0dee23e8..d35b8ec1c485 100644 --- a/include/linux/bitrev.h +++ b/include/linux/bitrev.h @@ -34,41 +34,41 @@ static inline u32 __bitrev32(u32 x) #define __constant_bitrev32(x) \ ({ \ - u32 __x = x; \ - __x = (__x >> 16) | (__x << 16); \ - __x = ((__x & (u32)0xFF00FF00UL) >> 8) | ((__x & (u32)0x00FF00FFUL) << 8); \ - __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ - __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ - __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ - __x; \ + u32 ___x = x; \ + ___x = (___x >> 16) | (___x << 16); \ + ___x = ((___x & (u32)0xFF00FF00UL) >> 8) | ((___x & (u32)0x00FF00FFUL) << 8); \ + ___x = ((___x & (u32)0xF0F0F0F0UL) >> 4) | ((___x & (u32)0x0F0F0F0FUL) << 4); \ + ___x = ((___x & (u32)0xCCCCCCCCUL) >> 2) | ((___x & (u32)0x33333333UL) << 2); \ + ___x = ((___x & (u32)0xAAAAAAAAUL) >> 1) | ((___x & (u32)0x55555555UL) << 1); \ + ___x; \ }) #define __constant_bitrev16(x) \ ({ \ - u16 __x = x; \ - __x = (__x >> 8) | (__x << 8); \ - __x = ((__x & (u16)0xF0F0U) >> 4) | ((__x & (u16)0x0F0FU) << 4); \ - __x = ((__x & (u16)0xCCCCU) >> 2) | ((__x & (u16)0x3333U) << 2); \ - __x = ((__x & (u16)0xAAAAU) >> 1) | ((__x & (u16)0x5555U) << 1); \ - __x; \ + u16 ___x = x; \ + ___x = (___x >> 8) | (___x << 8); \ + ___x = ((___x & (u16)0xF0F0U) >> 4) | ((___x & (u16)0x0F0FU) << 4); \ + ___x = ((___x & (u16)0xCCCCU) >> 2) | ((___x & (u16)0x3333U) << 2); \ + ___x = ((___x & (u16)0xAAAAU) >> 1) | ((___x & (u16)0x5555U) << 1); \ + ___x; \ }) #define __constant_bitrev8x4(x) \ ({ \ - u32 __x = x; \ - __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ - __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ - __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ - __x; \ + u32 ___x = x; \ + ___x = ((___x & (u32)0xF0F0F0F0UL) >> 4) | ((___x & (u32)0x0F0F0F0FUL) << 4); \ + ___x = ((___x & (u32)0xCCCCCCCCUL) >> 2) | ((___x & (u32)0x33333333UL) << 2); \ + ___x = ((___x & (u32)0xAAAAAAAAUL) >> 1) | ((___x & (u32)0x55555555UL) << 1); \ + ___x; \ }) #define __constant_bitrev8(x) \ ({ \ - u8 __x = x; \ - __x = (__x >> 4) | (__x << 4); \ - __x = ((__x & (u8)0xCCU) >> 2) | ((__x & (u8)0x33U) << 2); \ - __x = ((__x & (u8)0xAAU) >> 1) | ((__x & (u8)0x55U) << 1); \ - __x; \ + u8 ___x = x; \ + ___x = (___x >> 4) | (___x << 4); \ + ___x = ((___x & (u8)0xCCU) >> 2) | ((___x & (u8)0x33U) << 2); \ + ___x = ((___x & (u8)0xAAU) >> 1) | ((___x & (u8)0x55U) << 1); \ + ___x; \ }) #define bitrev32(x) \ -- cgit v1.2.3 From fcae96ff96538f66e7acd5d4e0f2e7516ff8cbd0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 5 Apr 2019 18:39:01 -0700 Subject: mm: fix vm_fault_t cast in VM_FAULT_GET_HINDEX() Symmetrically to VM_FAULT_SET_HINDEX(), we need a force-cast in VM_FAULT_GET_HINDEX() to tell sparse that this is intentional. Sparse complains about the current code when building a kernel with CONFIG_MEMORY_FAILURE: arch/x86/mm/fault.c:1058:53: warning: restricted vm_fault_t degrades to integer Link: http://lkml.kernel.org/r/20190327204117.35215-1-jannh@google.com Fixes: 3d3539018d2c ("mm: create the new vm_fault_t type") Signed-off-by: Jann Horn Reviewed-by: Andrew Morton Cc: Souptick Joarder Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7eade9132f02..4ef4bbe78a1d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -671,7 +671,7 @@ enum vm_fault_reason { /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16)) -#define VM_FAULT_GET_HINDEX(x) (((x) >> 16) & 0xf) +#define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf) #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \ VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \ -- cgit v1.2.3 From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 5 Apr 2019 18:39:18 -0700 Subject: mm: writeback: use exact memcg dirty counts Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting") memcg dirty and writeback counters are managed as: 1) per-memcg per-cpu values in range of [-32..32] 2) per-memcg atomic counter When a per-cpu counter cannot fit in [-32..32] it's flushed to the atomic. Stat readers only check the atomic. Thus readers such as balance_dirty_pages() may see a nontrivial error margin: 32 pages per cpu. Assuming 100 cpus: 4k x86 page_size: 13 MiB error per memcg 64k ppc page_size: 200 MiB error per memcg Considering that dirty+writeback are used together for some decisions the errors double. This inaccuracy can lead to undeserved oom kills. One nasty case is when all per-cpu counters hold positive values offsetting an atomic negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32). balance_dirty_pages() only consults the atomic and does not consider throttling the next n_cpu*32 dirty pages. If the file_lru is in the 13..200 MiB range then there's absolutely no dirty throttling, which burdens vmscan with only dirty+writeback pages thus resorting to oom kill. It could be argued that tiny containers are not supported, but it's more subtle. It's the amount the space available for file lru that matters. If a container has memory.max-200MiB of non reclaimable memory, then it will also suffer such oom kills on a 100 cpu machine. The following test reliably ooms without this patch. This patch avoids oom kills. $ cat test mount -t cgroup2 none /dev/cgroup cd /dev/cgroup echo +io +memory > cgroup.subtree_control mkdir test cd test echo 10M > memory.max (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo) (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100) $ cat memcg-writeback-stress.c /* * Dirty pages from all but one cpu. * Clean pages from the non dirtying cpu. * This is to stress per cpu counter imbalance. * On a 100 cpu machine: * - per memcg per cpu dirty count is 32 pages for each of 99 cpus * - per memcg atomic is -99*32 pages * - thus the complete dirty limit: sum of all counters 0 * - balance_dirty_pages() only sees atomic count -99*32 pages, which * it max()s to 0. * - So a workload can dirty -99*32 pages before balance_dirty_pages() * cares. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include static char *buf; static int bufSize; static void set_affinity(int cpu) { cpu_set_t affinity; CPU_ZERO(&affinity); CPU_SET(cpu, &affinity); if (sched_setaffinity(0, sizeof(affinity), &affinity)) err(1, "sched_setaffinity"); } static void dirty_on(int output_fd, int cpu) { int i, wrote; set_affinity(cpu); for (i = 0; i < 32; i++) { for (wrote = 0; wrote < bufSize; ) { int ret = write(output_fd, buf+wrote, bufSize-wrote); if (ret == -1) err(1, "write"); wrote += ret; } } } int main(int argc, char **argv) { int cpu, flush_cpu = 1, output_fd; const char *output; if (argc != 2) errx(1, "usage: output_file"); output = argv[1]; bufSize = getpagesize(); buf = malloc(getpagesize()); if (buf == NULL) errx(1, "malloc failed"); output_fd = open(output, O_CREAT|O_RDWR); if (output_fd == -1) err(1, "open(%s)", output); for (cpu = 0; cpu < get_nprocs(); cpu++) { if (cpu != flush_cpu) dirty_on(output_fd, cpu); } set_affinity(flush_cpu); if (fsync(output_fd)) err(1, "fsync(%s)", output); if (close(output_fd)) err(1, "close(%s)", output); free(buf); } Make balance_dirty_pages() and wb_over_bg_thresh() work harder to collect exact per memcg counters. This avoids the aforementioned oom kills. This does not affect the overhead of memory.stat, which still reads the single atomic counter. Why not use percpu_counter? memcg already handles cpus going offline, so no need for that overhead from percpu_counter. And the percpu_counter spinlocks are more heavyweight than is required. It probably also makes sense to use exact dirty and writeback counters in memcg oom reports. But that is saved for later. Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com Signed-off-by: Greg Thelen Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Cc: [4.16+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1f3d880b7ca1..dbb6118370c1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -566,7 +566,10 @@ struct mem_cgroup *lock_page_memcg(struct page *page); void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); -/* idx can be of type enum memcg_stat_item or node_stat_item */ +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page_state(). + */ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { -- cgit v1.2.3 From 10dce8af34226d90fa56746a934f8da5dcdba3df Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Tue, 26 Mar 2019 22:20:43 +0000 Subject: fs: stream_open - opener for stream-like files so that read and write can run simultaneously without deadlock Commit 9c225f2655e3 ("vfs: atomic f_pos accesses as per POSIX") added locking for file.f_pos access and in particular made concurrent read and write not possible - now both those functions take f_pos lock for the whole run, and so if e.g. a read is blocked waiting for data, write will deadlock waiting for that read to complete. This caused regression for stream-like files where previously read and write could run simultaneously, but after that patch could not do so anymore. See e.g. commit 581d21a2d02a ("xenbus: fix deadlock on writes to /proc/xen/xenbus") which fixes such regression for particular case of /proc/xen/xenbus. The patch that added f_pos lock in 2014 did so to guarantee POSIX thread safety for read/write/lseek and added the locking to file descriptors of all regular files. In 2014 that thread-safety problem was not new as it was already discussed earlier in 2006. However even though 2006'th version of Linus's patch was adding f_pos locking "only for files that are marked seekable with FMODE_LSEEK (thus avoiding the stream-like objects like pipes and sockets)", the 2014 version - the one that actually made it into the tree as 9c225f2655e3 - is doing so irregardless of whether a file is seekable or not. See https://lore.kernel.org/lkml/53022DB1.4070805@gmail.com/ https://lwn.net/Articles/180387 https://lwn.net/Articles/180396 for historic context. The reason that it did so is, probably, that there are many files that are marked non-seekable, but e.g. their read implementation actually depends on knowing current position to correctly handle the read. Some examples: kernel/power/user.c snapshot_read fs/debugfs/file.c u32_array_read fs/fuse/control.c fuse_conn_waiting_read + ... drivers/hwmon/asus_atk0110.c atk_debugfs_ggrp_read arch/s390/hypfs/inode.c hypfs_read_iter ... Despite that, many nonseekable_open users implement read and write with pure stream semantics - they don't depend on passed ppos at all. And for those cases where read could wait for something inside, it creates a situation similar to xenbus - the write could be never made to go until read is done, and read is waiting for some, potentially external, event, for potentially unbounded time -> deadlock. Besides xenbus, there are 14 such places in the kernel that I've found with semantic patch (see below): drivers/xen/evtchn.c:667:8-24: ERROR: evtchn_fops: .read() can deadlock .write() drivers/isdn/capi/capi.c:963:8-24: ERROR: capi_fops: .read() can deadlock .write() drivers/input/evdev.c:527:1-17: ERROR: evdev_fops: .read() can deadlock .write() drivers/char/pcmcia/cm4000_cs.c:1685:7-23: ERROR: cm4000_fops: .read() can deadlock .write() net/rfkill/core.c:1146:8-24: ERROR: rfkill_fops: .read() can deadlock .write() drivers/s390/char/fs3270.c:488:1-17: ERROR: fs3270_fops: .read() can deadlock .write() drivers/usb/misc/ldusb.c:310:1-17: ERROR: ld_usb_fops: .read() can deadlock .write() drivers/hid/uhid.c:635:1-17: ERROR: uhid_fops: .read() can deadlock .write() net/batman-adv/icmp_socket.c:80:1-17: ERROR: batadv_fops: .read() can deadlock .write() drivers/media/rc/lirc_dev.c:198:1-17: ERROR: lirc_fops: .read() can deadlock .write() drivers/leds/uleds.c:77:1-17: ERROR: uleds_fops: .read() can deadlock .write() drivers/input/misc/uinput.c:400:1-17: ERROR: uinput_fops: .read() can deadlock .write() drivers/infiniband/core/user_mad.c:985:7-23: ERROR: umad_fops: .read() can deadlock .write() drivers/gnss/core.c:45:1-17: ERROR: gnss_fops: .read() can deadlock .write() In addition to the cases above another regression caused by f_pos locking is that now FUSE filesystems that implement open with FOPEN_NONSEEKABLE flag, can no longer implement bidirectional stream-like files - for the same reason as above e.g. read can deadlock write locking on file.f_pos in the kernel. FUSE's FOPEN_NONSEEKABLE was added in 2008 in a7c1b990f715 ("fuse: implement nonseekable open") to support OSSPD. OSSPD implements /dev/dsp in userspace with FOPEN_NONSEEKABLE flag, with corresponding read and write routines not depending on current position at all, and with both read and write being potentially blocking operations: See https://github.com/libfuse/osspd https://lwn.net/Articles/308445 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1406 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1438-L1477 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1479-L1510 Corresponding libfuse example/test also describes FOPEN_NONSEEKABLE as "somewhat pipe-like files ..." with read handler not using offset. However that test implements only read without write and cannot exercise the deadlock scenario: https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L124-L131 https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L146-L163 https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L209-L216 I've actually hit the read vs write deadlock for real while implementing my FUSE filesystem where there is /head/watch file, for which open creates separate bidirectional socket-like stream in between filesystem and its user with both read and write being later performed simultaneously. And there it is semantically not easy to split the stream into two separate read-only and write-only channels: https://lab.nexedi.com/kirr/wendelin.core/blob/f13aa600/wcfs/wcfs.go#L88-169 Let's fix this regression. The plan is: 1. We can't change nonseekable_open to include &~FMODE_ATOMIC_POS - doing so would break many in-kernel nonseekable_open users which actually use ppos in read/write handlers. 2. Add stream_open() to kernel to open stream-like non-seekable file descriptors. Read and write on such file descriptors would never use nor change ppos. And with that property on stream-like files read and write will be running without taking f_pos lock - i.e. read and write could be running simultaneously. 3. With semantic patch search and convert to stream_open all in-kernel nonseekable_open users for which read and write actually do not depend on ppos and where there is no other methods in file_operations which assume @offset access. 4. Add FOPEN_STREAM to fs/fuse/ and open in-kernel file-descriptors via steam_open if that bit is present in filesystem open reply. It was tempting to change fs/fuse/ open handler to use stream_open instead of nonseekable_open on just FOPEN_NONSEEKABLE flags, but grepping through Debian codesearch shows users of FOPEN_NONSEEKABLE, and in particular GVFS which actually uses offset in its read and write handlers https://codesearch.debian.net/search?q=-%3Enonseekable+%3D https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1080 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1247-1346 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1399-1481 so if we would do such a change it will break a real user. 5. Add stream_open and FOPEN_STREAM handling to stable kernels starting from v3.14+ (the kernel where 9c225f2655 first appeared). This will allow to patch OSSPD and other FUSE filesystems that provide stream-like files to return FOPEN_STREAM | FOPEN_NONSEEKABLE in their open handler and this way avoid the deadlock on all kernel versions. This should work because fs/fuse/ ignores unknown open flags returned from a filesystem and so passing FOPEN_STREAM to a kernel that is not aware of this flag cannot hurt. In turn the kernel that is not aware of FOPEN_STREAM will be < v3.14 where just FOPEN_NONSEEKABLE is sufficient to implement streams without read vs write deadlock. This patch adds stream_open, converts /proc/xen/xenbus to it and adds semantic patch to automatically locate in-kernel places that are either required to be converted due to read vs write deadlock, or that are just safe to be converted because read and write do not use ppos and there are no other funky methods in file_operations. Regarding semantic patch I've verified each generated change manually - that it is correct to convert - and each other nonseekable_open instance left - that it is either not correct to convert there, or that it is not converted due to current stream_open.cocci limitations. The script also does not convert files that should be valid to convert, but that currently have .llseek = noop_llseek or generic_file_llseek for unknown reason despite file being opened with nonseekable_open (e.g. drivers/input/mousedev.c) Cc: Michael Kerrisk Cc: Yongzhi Pan Cc: Jonathan Corbet Cc: David Vrabel Cc: Juergen Gross Cc: Miklos Szeredi Cc: Tejun Heo Cc: Kirill Tkhai Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: Julia Lawall Cc: Nikolaus Rath Cc: Han-Wen Nienhuys Signed-off-by: Kirill Smelkov Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b42df09b04c..dd28e7679089 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -158,6 +158,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_OPENED ((__force fmode_t)0x80000) #define FMODE_CREATED ((__force fmode_t)0x100000) +/* File is stream-like */ +#define FMODE_STREAM ((__force fmode_t)0x200000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) @@ -3074,6 +3077,7 @@ extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); extern loff_t no_seek_end_llseek(struct file *, loff_t, int); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); +extern int stream_open(struct inode * inode, struct file * filp); #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(struct bio *bio, struct inode *inode, -- cgit v1.2.3 From 5861381d486601430cccf64849bd0a226154bc0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 21 Mar 2019 23:18:01 +0100 Subject: PM / arch: x86: Rework the MSR_IA32_ENERGY_PERF_BIAS handling The current handling of MSR_IA32_ENERGY_PERF_BIAS in the kernel is problematic, because it may cause changes made by user space to that MSR (with the help of the x86_energy_perf_policy tool, for example) to be lost every time a CPU goes offline and then back online as well as during system-wide power management transitions into sleep states and back into the working state. The first problem is that if the current EPB value for a CPU going online is 0 ('performance'), the kernel will change it to 6 ('normal') regardless of whether or not this is the first bring-up of that CPU. That also happens during system-wide resume from sleep states (including, but not limited to, hibernation). However, the EPB may have been adjusted by user space this way and the kernel should not blindly override that setting. The second problem is that if the platform firmware resets the EPB values for any CPUs during system-wide resume from a sleep state, the kernel will not restore their previous EPB values that may have been set by user space before the preceding system-wide suspend transition. Again, that behavior may at least be confusing from the user space perspective. In order to address these issues, rework the handling of MSR_IA32_ENERGY_PERF_BIAS so that the EPB value is saved on CPU offline and restored on CPU online as well as (for the boot CPU) during the syscore stages of system-wide suspend and resume transitions, respectively. However, retain the policy by which the EPB is set to 6 ('normal') on the first bring-up of each CPU if its initial value is 0, based on the observation that 0 may mean 'not initialized' just as well as 'performance' in that case. While at it, move the MSR_IA32_ENERGY_PERF_BIAS handling code into a separate file and document it in Documentation/admin-guide. Fixes: abe48b108247 (x86, intel, power: Initialize MSR_IA32_ENERGY_PERF_BIAS) Fixes: b51ef52df71c (x86/cpu: Restore MSR_IA32_ENERGY_PERF_BIAS after resume) Reported-by: Thomas Renninger Signed-off-by: Rafael J. Wysocki Reviewed-by: Hannes Reinecke Acked-by: Borislav Petkov Acked-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e78281d07b70..dbfdd0fadbef 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -147,6 +147,7 @@ enum cpuhp_state { CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS, + CPUHP_AP_X86_INTEL_EPB_ONLINE, CPUHP_AP_PERF_ONLINE, CPUHP_AP_PERF_X86_ONLINE, CPUHP_AP_PERF_X86_UNCORE_ONLINE, -- cgit v1.2.3 From 9083e4986124389e2a7c0ffca95630a4983887f0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Mar 2019 12:19:52 +0100 Subject: cpufreq: intel_pstate: Update max frequency on global turbo changes While the cpuinfo.max_freq value doesn't really matter for intel_pstate in the active mode, in the passive mode it is used by governors as the maximum physical frequency of the CPU and the results of governor computations generally depend on it. Also it is made available to user space via sysfs and it should match the current HW configuration. For this reason, make intel_pstate update cpuinfo.max_freq for all CPUs if it detects a global change of turbo frequency settings from "disable" to "enable" or the other way associated with a _PPC change notification from the platform firmware. Note that policy_is_inactive(), cpufreq_cpu_acquire(), cpufreq_cpu_release(), and cpufreq_set_policy() need to be made available to it for this purpose. Link: https://bugzilla.kernel.org/show_bug.cgi?id=200759 Reported-by: Gabriele Mazzotta Tested-by: Gabriele Mazzotta Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- include/linux/cpufreq.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 5005ea40364f..684caf067003 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -178,6 +178,11 @@ static inline struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu) static inline void cpufreq_cpu_put(struct cpufreq_policy *policy) { } #endif +static inline bool policy_is_inactive(struct cpufreq_policy *policy) +{ + return cpumask_empty(policy->cpus); +} + static inline bool policy_is_shared(struct cpufreq_policy *policy) { return cpumask_weight(policy->cpus) > 1; @@ -193,7 +198,12 @@ unsigned int cpufreq_quick_get_max(unsigned int cpu); void disable_cpufreq(void); u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); + +struct cpufreq_policy *cpufreq_cpu_acquire(unsigned int cpu); +void cpufreq_cpu_release(struct cpufreq_policy *policy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); +int cpufreq_set_policy(struct cpufreq_policy *policy, + struct cpufreq_policy *new_policy); void cpufreq_update_policy(unsigned int cpu); void cpufreq_update_limits(unsigned int cpu); bool have_governor_per_policy(void); -- cgit v1.2.3 From 60ca1e5a200cd294a12907fa36dece4241db4ab8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 22 Feb 2019 12:59:59 +0000 Subject: mmiowb: Hook up mmiowb helpers to spinlocks and generic I/O accessors Removing explicit calls to mmiowb() from driver code means that we must now call into the generic mmiowb_spin_{lock,unlock}() functions from the core spinlock code. In order to elide barriers following critical sections without any I/O writes, we also hook into the asm-generic I/O routines. Acked-by: Linus Torvalds Signed-off-by: Will Deacon --- include/linux/spinlock.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index e089157dcf97..ed7c4d6b8235 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -57,6 +57,7 @@ #include #include #include +#include /* @@ -178,6 +179,7 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) { __acquire(lock); arch_spin_lock(&lock->raw_lock); + mmiowb_spin_lock(); } #ifndef arch_spin_lock_flags @@ -189,15 +191,22 @@ do_raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long *flags) __acquires(lo { __acquire(lock); arch_spin_lock_flags(&lock->raw_lock, *flags); + mmiowb_spin_lock(); } static inline int do_raw_spin_trylock(raw_spinlock_t *lock) { - return arch_spin_trylock(&(lock)->raw_lock); + int ret = arch_spin_trylock(&(lock)->raw_lock); + + if (ret) + mmiowb_spin_lock(); + + return ret; } static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) { + mmiowb_spin_unlock(); arch_spin_unlock(&lock->raw_lock); __release(lock); } -- cgit v1.2.3 From fb24ea52f78e0d595852e09e3a55697c8f442189 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 22 Feb 2019 17:14:59 +0000 Subject: drivers: Remove explicit invocations of mmiowb() mmiowb() is now implied by spin_unlock() on architectures that require it, so there is no reason to call it from driver code. This patch was generated using coccinelle: @mmiowb@ @@ - mmiowb(); and invoked as: $ for d in drivers include/linux/qed sound; do \ spatch --include-headers --sp-file mmiowb.cocci --dir $d --in-place; done NOTE: mmiowb() has only ever guaranteed ordering in conjunction with spin_unlock(). However, pairing each mmiowb() removal in this patch with the corresponding call to spin_unlock() is not at all trivial, so there is a small chance that this change may regress any drivers incorrectly relying on mmiowb() to order MMIO writes between CPUs using lock-free synchronisation. If you've ended up bisecting to this commit, you can reintroduce the mmiowb() calls using wmb() instead, which should restore the old behaviour on all architectures other than some esoteric ia64 systems. Acked-by: Linus Torvalds Signed-off-by: Will Deacon --- include/linux/qed/qed_if.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index f6165d304b4d..48841e5dab90 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -1338,7 +1338,6 @@ static inline u16 qed_sb_update_sb_idx(struct qed_sb_info *sb_info) } /* Let SB update */ - mmiowb(); return rc; } @@ -1374,7 +1373,6 @@ static inline void qed_sb_ack(struct qed_sb_info *sb_info, /* Both segments (interrupts & acks) are written to same place address; * Need to guarantee all commands will be received (in-order) by HW. */ - mmiowb(); barrier(); } -- cgit v1.2.3 From 1200e07f3ad4b9d976cf2fff3a0c3d9a1faecb3e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 8 Apr 2019 19:02:38 +0800 Subject: block: don't use for-inside-for in bio_for_each_segment_all Commit 6dc4f100c175 ("block: allow bio_for_each_segment_all() to iterate over multi-page bvec") changes bio_for_each_segment_all() to use for-inside-for. This way breaks all bio_for_each_segment_all() call with error out branch via 'break', since now 'break' can only break from the inner loop. Fixes this issue by implementing bio_for_each_segment_all() via single 'for' loop, and now the logic is very similar with normal bvec iterator. Cc: Qu Wenruo Cc: linux-btrfs@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Cc: Omar Sandoval Reviewed-by: Johannes Thumshirn Reported-and-Tested-by: Qu Wenruo Fixes: 6dc4f100c175 ("block: allow bio_for_each_segment_all() to iterate over multi-page bvec") Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 20 ++++++++++++-------- include/linux/bvec.h | 14 ++++++++++---- 2 files changed, 22 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index bb6090aa165d..e584673c1881 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -120,19 +120,23 @@ static inline bool bio_full(struct bio *bio) return bio->bi_vcnt >= bio->bi_max_vecs; } -#define mp_bvec_for_each_segment(bv, bvl, i, iter_all) \ - for (bv = bvec_init_iter_all(&iter_all); \ - (iter_all.done < (bvl)->bv_len) && \ - (mp_bvec_next_segment((bvl), &iter_all), 1); \ - iter_all.done += bv->bv_len, i += 1) +static inline bool bio_next_segment(const struct bio *bio, + struct bvec_iter_all *iter) +{ + if (iter->idx >= bio->bi_vcnt) + return false; + + bvec_advance(&bio->bi_io_vec[iter->idx], iter); + return true; +} /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it */ -#define bio_for_each_segment_all(bvl, bio, i, iter_all) \ - for (i = 0, iter_all.idx = 0; iter_all.idx < (bio)->bi_vcnt; iter_all.idx++) \ - mp_bvec_for_each_segment(bvl, &((bio)->bi_io_vec[iter_all.idx]), i, iter_all) +#define bio_for_each_segment_all(bvl, bio, i, iter) \ + for (i = 0, bvl = bvec_init_iter_all(&iter); \ + bio_next_segment((bio), &iter); i++) static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, unsigned bytes) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index f6275c4da13a..3bc91879e1e2 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -145,18 +145,18 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all) { - iter_all->bv.bv_page = NULL; iter_all->done = 0; + iter_all->idx = 0; return &iter_all->bv; } -static inline void mp_bvec_next_segment(const struct bio_vec *bvec, - struct bvec_iter_all *iter_all) +static inline void bvec_advance(const struct bio_vec *bvec, + struct bvec_iter_all *iter_all) { struct bio_vec *bv = &iter_all->bv; - if (bv->bv_page) { + if (iter_all->done) { bv->bv_page = nth_page(bv->bv_page, 1); bv->bv_offset = 0; } else { @@ -165,6 +165,12 @@ static inline void mp_bvec_next_segment(const struct bio_vec *bvec, } bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset, bvec->bv_len - iter_all->done); + iter_all->done += bv->bv_len; + + if (iter_all->done == bvec->bv_len) { + iter_all->idx++; + iter_all->done = 0; + } } /* -- cgit v1.2.3 From cf94db21905333e610e479688add629397a4b384 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 8 Apr 2019 14:33:22 +0200 Subject: virtio: Honour 'may_reduce_num' in vring_create_virtqueue vring_create_virtqueue() allows the caller to specify via the may_reduce_num parameter whether the vring code is allowed to allocate a smaller ring than specified. However, the split ring allocation code tries to allocate a smaller ring on allocation failure regardless of what the caller specified. This may cause trouble for e.g. virtio-pci in legacy mode, which does not support ring resizing. (The packed ring code does not resize in any case.) Let's fix this by bailing out immediately in the split ring code if the requested size cannot be allocated and may_reduce_num has not been specified. While at it, fix a typo in the usage instructions. Fixes: 2a2d1382fe9d ("virtio: Add improved queue allocation API") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin Reviewed-by: Halil Pasic Reviewed-by: Jens Freimann --- include/linux/virtio_ring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index fab02133a919..3dc70adfe5f5 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -63,7 +63,7 @@ struct virtqueue; /* * Creates a virtqueue and allocates the descriptor ring. If * may_reduce_num is set, then this may allocate a smaller ring than - * expected. The caller should query virtqueue_get_ring_size to learn + * expected. The caller should query virtqueue_get_vring_size to learn * the actual size of the ring. */ struct virtqueue *vring_create_virtqueue(unsigned int index, -- cgit v1.2.3 From 49a27e279052cf71ba931a26b00194ff46510480 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 27 Mar 2019 15:35:45 +0100 Subject: PM / Domains: Add generic data pointer to struct genpd_power_state Add a data pointer to the genpd_power_state struct, to allow a genpd backend driver to store per-state specific data. To introduce the pointer, change the way genpd deals with freeing of the corresponding allocated data. More precisely, clarify the responsibility of whom that shall free the data, by adding a ->free_states() callback to the generic_pm_domain structure. The one allocating the data will be expected to set the callback, to allow genpd to invoke it from genpd_remove(). Co-developed-by: Lina Iyer Acked-by: Daniel Lezcano Signed-off-by: Ulf Hansson [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 1ed5874bcee0..8e1399231753 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -69,6 +69,7 @@ struct genpd_power_state { s64 residency_ns; struct fwnode_handle *fwnode; ktime_t idle_time; + void *data; }; struct genpd_lock_ops; @@ -110,9 +111,10 @@ struct generic_pm_domain { struct device *dev); unsigned int flags; /* Bit field of configs for genpd */ struct genpd_power_state *states; + void (*free_states)(struct genpd_power_state *states, + unsigned int state_count); unsigned int state_count; /* number of states */ unsigned int state_idx; /* state that genpd will go to when off */ - void *free; /* Free the state that was allocated for default */ ktime_t on_time; ktime_t accounting_time; const struct genpd_lock_ops *lock_ops; -- cgit v1.2.3 From eb594b7325f61835555140922a4cb715264a325c Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 27 Mar 2019 15:35:46 +0100 Subject: PM / Domains: Add support for CPU devices to genpd To enable a CPU device to be attached to a PM domain managed by genpd, make a few changes to it for convenience. To be able to quickly find out what CPUs are attached to a genpd, which typically becomes useful from a genpd governor as subsequent changes are about to show, add a cpumask to struct generic_pm_domain to be updated when a CPU device gets attached to the genpd containing that cpumask. Also, propagate the cpumask changes upwards in the domain hierarchy to the master PM domains. This way, the cpumask for a genpd hierarchically reflects all CPUs attached to the topology below it. Finally, make this an opt-in feature, to avoid having to manage CPUs and the cpumask for a genpd that don't need it. To that end, add a new genpd configuration bit, GENPD_FLAG_CPU_DOMAIN. Co-developed-by: Lina Iyer Acked-by: Daniel Lezcano Signed-off-by: Ulf Hansson [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 8e1399231753..a6e251fe9deb 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -16,6 +16,7 @@ #include #include #include +#include /* * Flags to control the behaviour of a genpd. @@ -42,11 +43,22 @@ * GENPD_FLAG_ACTIVE_WAKEUP: Instructs genpd to keep the PM domain powered * on, in case any of its attached devices is used * in the wakeup path to serve system wakeups. + * + * GENPD_FLAG_CPU_DOMAIN: Instructs genpd that it should expect to get + * devices attached, which may belong to CPUs or + * possibly have subdomains with CPUs attached. + * This flag enables the genpd backend driver to + * deploy idle power management support for CPUs + * and groups of CPUs. Note that, the backend + * driver must then comply with the so called, + * last-man-standing algorithm, for the CPUs in the + * PM domain. */ #define GENPD_FLAG_PM_CLK (1U << 0) #define GENPD_FLAG_IRQ_SAFE (1U << 1) #define GENPD_FLAG_ALWAYS_ON (1U << 2) #define GENPD_FLAG_ACTIVE_WAKEUP (1U << 3) +#define GENPD_FLAG_CPU_DOMAIN (1U << 4) enum gpd_status { GPD_STATE_ACTIVE = 0, /* PM domain is active */ @@ -94,6 +106,7 @@ struct generic_pm_domain { unsigned int suspended_count; /* System suspend device counter */ unsigned int prepared_count; /* Suspend counter of prepared devices */ unsigned int performance_state; /* Aggregated max performance state */ + cpumask_var_t cpus; /* A cpumask of the attached CPUs */ int (*power_off)(struct generic_pm_domain *domain); int (*power_on)(struct generic_pm_domain *domain); struct opp_table *opp_table; /* OPP table of the genpd */ -- cgit v1.2.3 From 6f9b83ac877fb5558d76b9f78590f3afd1bdf421 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 27 Mar 2019 15:35:47 +0100 Subject: cpuidle: Export the next timer expiration for CPUs To be able to predict the sleep duration for a CPU entering idle, it is essential to know the expiration time of the next timer. Both the teo and the menu cpuidle governors already use this information for CPU idle state selection. Moving forward, a similar prediction needs to be made for a group of idle CPUs rather than for a single one and the following changes implement a new genpd governor for that purpose. In order to support that feature, add a new function called tick_nohz_get_next_hrtimer() that will return the next hrtimer expiration time of a given CPU to be invoked after deciding whether or not to stop the scheduler tick on that CPU. Make the cpuidle core call tick_nohz_get_next_hrtimer() right before invoking the ->enter() callback provided by the cpuidle driver for the given state and store its return value in the per-CPU struct cpuidle_device, so as to make it available to code outside of cpuidle. Note that at the point when cpuidle calls tick_nohz_get_next_hrtimer(), the governor's ->select() callback has already returned and indicated whether or not the tick should be stopped, so in fact the value returned by tick_nohz_get_next_hrtimer() always is the next hrtimer expiration time for the given CPU, possibly including the tick (if it hasn't been stopped). Co-developed-by: Lina Iyer Co-developed-by: Daniel Lezcano Acked-by: Daniel Lezcano Signed-off-by: Ulf Hansson [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- include/linux/cpuidle.h | 1 + include/linux/tick.h | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 3b39472324a3..bb9a0db89f1a 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -83,6 +83,7 @@ struct cpuidle_device { unsigned int use_deepest_state:1; unsigned int poll_time_limit:1; unsigned int cpu; + ktime_t next_hrtimer; int last_residency; struct cpuidle_state_usage states_usage[CPUIDLE_STATE_MAX]; diff --git a/include/linux/tick.h b/include/linux/tick.h index 55388ab45fd4..8891b5ac3e40 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -122,6 +122,7 @@ extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern bool tick_nohz_idle_got_tick(void); +extern ktime_t tick_nohz_get_next_hrtimer(void); extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next); extern unsigned long tick_nohz_get_idle_calls(void); extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu); @@ -145,7 +146,11 @@ static inline void tick_nohz_idle_restart_tick(void) { } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } static inline bool tick_nohz_idle_got_tick(void) { return false; } - +static inline ktime_t tick_nohz_get_next_hrtimer(void) +{ + /* Next wake up is the tick period, assume it starts now */ + return ktime_add(ktime_get(), TICK_NSEC); +} static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { *delta_next = TICK_NSEC; -- cgit v1.2.3 From 2f36bde0fc8f1ab79d54bd2caa7c1cf874fd2206 Mon Sep 17 00:00:00 2001 From: "Andrew-sh.Cheng" Date: Fri, 29 Mar 2019 14:46:10 +0800 Subject: OPP: Introduce dev_pm_opp_find_freq_ceil_by_volt() This patch introduces a new helper routine in the OPP core, which returns the OPP with the highest frequency which has voltage less than or equal to the target voltage passed to the helper. Signed-off-by: Andrew-sh.Cheng [ Viresh: Massaged the commit log and renamed the helper with some cleanups. ] Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 24c757a32a7b..b150fe97ce5a 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -102,6 +102,8 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, unsigned long *freq); +struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev, + unsigned long u_volt); struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev, unsigned long *freq); @@ -207,6 +209,12 @@ static inline struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, return ERR_PTR(-ENOTSUPP); } +static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev, + unsigned long u_volt) +{ + return ERR_PTR(-ENOTSUPP); +} + static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev, unsigned long *freq) { -- cgit v1.2.3 From 12a30a7fc142a123c61da9623bd824d95d36c12e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:12 -0400 Subject: locking/rwsem: Move rwsem internal function declarations to rwsem-xadd.h We don't need to expose rwsem internal functions which are not supposed to be called directly from other kernel code. Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Acked-by: Will Deacon Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20190404174320.22416-4-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 0fc41062c649..b44e533235c7 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -46,13 +46,6 @@ struct rw_semaphore { */ #define RWSEM_OWNER_UNKNOWN ((struct task_struct *)-2L) -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - /* In all implementations count != 0 means locked */ static inline int rwsem_is_locked(struct rw_semaphore *sem) { -- cgit v1.2.3 From 364f784f048c984721986db90c95ca8350213c91 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 4 Apr 2019 13:43:20 -0400 Subject: locking/rwsem: Optimize rwsem structure for uncontended lock acquisition For an uncontended rwsem, count and owner are the only fields a task needs to touch when acquiring the rwsem. So they are put next to each other to increase the chance that they will share the same cacheline. On a ThunderX2 99xx (arm64) system with 32K L1 cache and 256K L2 cache, a rwsem locking microbenchmark with one locking thread was run to write-lock and write-unlock an array of rwsems separated 2 cachelines apart in a 1M byte memory block. The locking rates (kops/s) of the microbenchmark when the rwsems are at various "long" (8-byte) offsets from beginning of the cacheline before and after the patch were as follows: Cacheline Offset Pre-patch Post-patch ---------------- --------- ---------- 0 17,449 16,588 1 17,450 16,465 2 17,450 16,460 3 17,453 16,462 4 14,867 16,471 5 14,867 16,470 6 14,853 16,464 7 14,867 13,172 Before the patch, the count and owner are 4 "long"s apart. After the patch, they are only 1 "long" apart. The rwsem data have to be loaded from the L3 cache for each access. It can be seen that the locking rates are more consistent after the patch than before. Note that for this particular system, the performance drop happens whenever the count and owner are at an odd multiples of "long"s apart. No performance drop was observed when only a single rwsem was used (hot cache). So the drop is likely just an idiosyncrasy of the cache architecture of this chip than an inherent problem with the patch. Suggested-by: Linus Torvalds Signed-off-by: Waiman Long Acked-by: Peter Zijlstra Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/20190404174320.22416-12-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index b44e533235c7..2ea18a3def04 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -20,21 +20,30 @@ #include #endif -struct rw_semaphore; - -/* All arch specific implementations share the same struct */ +/* + * For an uncontended rwsem, count and owner are the only fields a task + * needs to touch when acquiring the rwsem. So they are put next to each + * other to increase the chance that they will share the same cacheline. + * + * In a contended rwsem, the owner is likely the most frequently accessed + * field in the structure as the optimistic waiter that holds the osq lock + * will spin on owner. For an embedded rwsem, other hot fields in the + * containing structure should be moved further away from the rwsem to + * reduce the chance that they will share the same cacheline causing + * cacheline bouncing problem. + */ struct rw_semaphore { atomic_long_t count; - struct list_head wait_list; - raw_spinlock_t wait_lock; #ifdef CONFIG_RWSEM_SPIN_ON_OWNER - struct optimistic_spin_queue osq; /* spinner MCS lock */ /* * Write owner. Used as a speculative check to see * if the owner is running on the cpu. */ struct task_struct *owner; + struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif + raw_spinlock_t wait_lock; + struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif -- cgit v1.2.3 From 1b8f21b74c3c9c82fce5a751d7aefb7cc0b8d33d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Apr 2019 06:31:21 +0800 Subject: blk-mq: introduce blk_mq_complete_request_sync() In NVMe's error handler, follows the typical steps of tearing down hardware for recovering controller: 1) stop blk_mq hw queues 2) stop the real hw queues 3) cancel in-flight requests via blk_mq_tagset_busy_iter(tags, cancel_request, ...) cancel_request(): mark the request as abort blk_mq_complete_request(req); 4) destroy real hw queues However, there may be race between #3 and #4, because blk_mq_complete_request() may run q->mq_ops->complete(rq) remotelly and asynchronously, and ->complete(rq) may be run after #4. This patch introduces blk_mq_complete_request_sync() for fixing the above race. Cc: Sagi Grimberg Cc: Bart Van Assche Cc: James Smart Cc: linux-nvme@lists.infradead.org Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index cb2aa7ecafff..db29928de467 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -302,6 +302,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); bool blk_mq_complete_request(struct request *rq); +void blk_mq_complete_request_sync(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio); bool blk_mq_queue_stopped(struct request_queue *q); -- cgit v1.2.3 From 7c2e07130090ae001a97a6b65597830d6815e93e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=BCller?= Date: Mon, 8 Apr 2019 15:33:54 +0200 Subject: clk: x86: Add system specific quirk to mark clocks as critical MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 648e921888ad ("clk: x86: Stop marking clocks as CLK_IS_CRITICAL"), the pmc_plt_clocks of the Bay Trail SoC are unconditionally gated off. Unfortunately this will break systems where these clocks are used for external purposes beyond the kernel's knowledge. Fix it by implementing a system specific quirk to mark the necessary pmc_plt_clks as critical. Fixes: 648e921888ad ("clk: x86: Stop marking clocks as CLK_IS_CRITICAL") Signed-off-by: David Müller Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Signed-off-by: Stephen Boyd --- include/linux/platform_data/x86/clk-pmc-atom.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/clk-pmc-atom.h b/include/linux/platform_data/x86/clk-pmc-atom.h index 3ab892208343..7a37ac27d0fb 100644 --- a/include/linux/platform_data/x86/clk-pmc-atom.h +++ b/include/linux/platform_data/x86/clk-pmc-atom.h @@ -35,10 +35,13 @@ struct pmc_clk { * * @base: PMC clock register base offset * @clks: pointer to set of registered clocks, typically 0..5 + * @critical: flag to indicate if firmware enabled pmc_plt_clks + * should be marked as critial or not */ struct pmc_clk_data { void __iomem *base; const struct pmc_clk *clks; + bool critical; }; #endif /* __PLATFORM_DATA_X86_CLK_PMC_ATOM_H */ -- cgit v1.2.3 From 8065a779f17e94536a1c4dcee4f9d88011672f97 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Mon, 8 Apr 2019 19:45:27 -0400 Subject: failover: allow name change on IFF_UP slave interfaces When a netdev appears through hot plug then gets enslaved by a failover master that is already up and running, the slave will be opened right away after getting enslaved. Today there's a race that userspace (udev) may fail to rename the slave if the kernel (net_failover) opens the slave earlier than when the userspace rename happens. Unlike bond or team, the primary slave of failover can't be renamed by userspace ahead of time, since the kernel initiated auto-enslavement is unable to, or rather, is never meant to be synchronized with the rename request from userspace. As the failover slave interfaces are not designed to be operated directly by userspace apps: IP configuration, filter rules with regard to network traffic passing and etc., should all be done on master interface. In general, userspace apps only care about the name of master interface, while slave names are less important as long as admin users can see reliable names that may carry other information describing the netdev. For e.g., they can infer that "ens3nsby" is a standby slave of "ens3", while for a name like "eth0" they can't tell which master it belongs to. Historically the name of IFF_UP interface can't be changed because there might be admin script or management software that is already relying on such behavior and assumes that the slave name can't be changed once UP. But failover is special: with the in-kernel auto-enslavement mechanism, the userspace expectation for device enumeration and bring-up order is already broken. Previously initramfs and various userspace config tools were modified to bypass failover slaves because of auto-enslavement and duplicate MAC address. Similarly, in case that users care about seeing reliable slave name, the new type of failover slaves needs to be taken care of specifically in userspace anyway. It's less risky to lift up the rename restriction on failover slave which is already UP. Although it's possible this change may potentially break userspace component (most likely configuration scripts or management software) that assumes slave name can't be changed while UP, it's relatively a limited and controllable set among all userspace components, which can be fixed specifically to listen for the rename events on failover slaves. Userspace component interacting with slaves is expected to be changed to operate on failover master interface instead, as the failover slave is dynamic in nature which may come and go at any point. The goal is to make the role of failover slaves less relevant, and userspace components should only deal with failover master in the long run. Fixes: 30c8bd5aa8b2 ("net: Introduce generic failover module") Signed-off-by: Si-Wei Liu Reviewed-by: Liran Alon Acked-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 26f69cf763f4..324e872c91d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1500,6 +1500,7 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device + * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1532,6 +1533,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, + IFF_LIVE_RENAME_OK = 1<<30, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1563,6 +1565,7 @@ enum netdev_priv_flags { #define IFF_FAILOVER IFF_FAILOVER #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER +#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK /** * struct net_device - The DEVICE structure. -- cgit v1.2.3 From d808b7f759b50acf0784ce6230ffa63e12ef465d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 9 Apr 2019 10:03:59 -0600 Subject: nvmet: fix discover log page when offsets are used The nvme target hadn't been taking the Get Log Page offset parameter into consideration, and so has been returning corrupted log pages when offsets are used. Since many tools, including nvme-cli, split the log request to 4k, we've been breaking discovery log responses when more than 3 subsystems exist. Fix the returned data by internally generating the entire discovery log page and copying only the requested bytes into the user buffer. The command log page offset type has been modified to a native __le64 to make it easier to extract the value from a command. Signed-off-by: Keith Busch Tested-by: Minwoo Im Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index baa49e6a23cc..c40720cb59ac 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -967,8 +967,13 @@ struct nvme_get_log_page_command { __le16 numdl; __le16 numdu; __u16 rsvd11; - __le32 lpol; - __le32 lpou; + union { + struct { + __le32 lpol; + __le32 lpou; + }; + __le64 lpo; + }; __u32 rsvd14[2]; }; -- cgit v1.2.3 From af6b61d7ef58099c82d854395a0e002be6bd036c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 11 Apr 2019 15:16:52 -0400 Subject: Revert "SUNRPC: Micro-optimise when the task is known not to be sleeping" This reverts commit 009a82f6437490c262584d65a14094a818bcb747. The ability to optimise here relies on compiler being able to optimise away tail calls to avoid stack overflows. Unfortunately, we are seeing reports of problems, so let's just revert. Reported-by: Daniel Mack Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ec861cd0cfe8..52d41d0c1ae1 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -304,12 +304,4 @@ rpc_clnt_swap_deactivate(struct rpc_clnt *clnt) } #endif /* CONFIG_SUNRPC_SWAP */ -static inline bool -rpc_task_need_resched(const struct rpc_task *task) -{ - if (RPC_IS_QUEUED(task) || task->tk_callback) - return true; - return false; -} - #endif /* _LINUX_SUNRPC_SCHED_H_ */ -- cgit v1.2.3 From e94999688e3aa3c0a8ad5a60352cdc3ca3030434 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 11 Apr 2019 20:17:33 +0200 Subject: PM / Domains: Add genpd governor for CPUs After some preceding changes, PM domains managed by genpd may contain CPU devices, so idle state residency values should be taken into account during the state selection process. [The residency value is the minimum amount of time to be spent by a CPU (or a group of CPUs) in an idle state in order to save more energy than could be saved by picking up a shallower idle state.] For this purpose, add a new genpd governor, pm_domain_cpu_gov, to be used for selecting idle states of PM domains with CPU devices attached either directly or through subdomains. The new governor computes the minimum expected idle duration for all online CPUs attached to a PM domain and its subdomains. Next, it finds the deepest idle state whose target residency is within the expected idle duration and selects it as the target idle state of the domain. It should be noted that the minimum expected idle duration computation is based on the closest timer event information stored in the per-CPU variables cpuidle_devices for all of the CPUs in the domain. That needs to be revisited in future, as obviously there are other reasons why a CPU may be woken up from idle. Co-developed-by: Lina Iyer Acked-by: Daniel Lezcano Signed-off-by: Ulf Hansson [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index a6e251fe9deb..bc82e74560ee 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -118,6 +118,7 @@ struct generic_pm_domain { s64 max_off_time_ns; /* Maximum allowed "suspended" time. */ bool max_off_time_changed; bool cached_power_down_ok; + bool cached_power_down_state_idx; int (*attach_dev)(struct generic_pm_domain *domain, struct device *dev); void (*detach_dev)(struct generic_pm_domain *domain, @@ -202,6 +203,9 @@ int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state); extern struct dev_power_governor simple_qos_governor; extern struct dev_power_governor pm_domain_always_on_gov; +#ifdef CONFIG_CPU_IDLE +extern struct dev_power_governor pm_domain_cpu_gov; +#endif #else static inline struct generic_pm_domain_data *dev_gpd_data(struct device *dev) -- cgit v1.2.3 From 77f1e0a52d26242b6c2dba019f6ebebfb9ff701e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 18 Jan 2019 10:34:16 -0700 Subject: bfq: update internal depth state when queue depth changes A previous commit moved the shallow depth and BFQ depth map calculations to be done at init time, moving it outside of the hotter IO path. This potentially causes hangs if the users changes the depth of the scheduler map, by writing to the 'nr_requests' sysfs file for that device. Add a blk-mq-sched hook that allows blk-mq to inform the scheduler if the depth changes, so that the scheduler can update its internal state. Tested-by: Kai Krakow Reported-by: Paolo Valente Fixes: f0635b8a416e ("bfq: calculate shallow depths at init time") Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 2e9e2763bf47..6e8bc53740f0 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -31,6 +31,7 @@ struct elevator_mq_ops { void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*depth_updated)(struct blk_mq_hw_ctx *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); -- cgit v1.2.3 From f958d7b528b1b40c44cfda5eabe2d82760d868c3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 11 Apr 2019 10:06:20 -0700 Subject: mm: make page ref count overflow check tighter and more explicit We have a VM_BUG_ON() to check that the page reference count doesn't underflow (or get close to overflow) by checking the sign of the count. That's all fine, but we actually want to allow people to use a "get page ref unless it's already very high" helper function, and we want that one to use the sign of the page ref (without triggering this VM_BUG_ON). Change the VM_BUG_ON to only check for small underflows (or _very_ close to overflowing), and ignore overflows which have strayed into negative territory. Acked-by: Matthew Wilcox Cc: Jann Horn Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bb6408fe73..541d99b86aea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -965,6 +965,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page) } #endif /* CONFIG_DEV_PAGEMAP_OPS */ +/* 127: arbitrary random number, small enough to assemble well */ +#define page_ref_zero_or_close_to_overflow(page) \ + ((unsigned int) page_ref_count(page) + 127u <= 127u) + static inline void get_page(struct page *page) { page = compound_head(page); @@ -972,7 +976,7 @@ static inline void get_page(struct page *page) * Getting a normal page or the head of a compound page * requires to already have an elevated page->_refcount. */ - VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); + VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); page_ref_inc(page); } -- cgit v1.2.3 From 88b1a17dfc3ed7728316478fae0f5ad508f50397 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 11 Apr 2019 10:14:59 -0700 Subject: mm: add 'try_get_page()' helper function This is the same as the traditional 'get_page()' function, but instead of unconditionally incrementing the reference count of the page, it only does so if the count was "safe". It returns whether the reference count was incremented (and is marked __must_check, since the caller obviously has to be aware of it). Also like 'get_page()', you can't use this function unless you already had a reference to the page. The intent is that you can use this exactly like get_page(), but in situations where you want to limit the maximum reference count. The code currently does an unconditional WARN_ON_ONCE() if we ever hit the reference count issues (either zero or negative), as a notification that the conditional non-increment actually happened. NOTE! The count access for the "safety" check is inherently racy, but that doesn't matter since the buffer we use is basically half the range of the reference count (ie we look at the sign of the count). Acked-by: Matthew Wilcox Cc: Jann Horn Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 541d99b86aea..7000ddd807e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -980,6 +980,15 @@ static inline void get_page(struct page *page) page_ref_inc(page); } +static inline __must_check bool try_get_page(struct page *page) +{ + page = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) + return false; + page_ref_inc(page); + return true; +} + static inline void put_page(struct page *page) { page = compound_head(page); -- cgit v1.2.3 From 15fab63e1e57be9fdb5eec1bbc5916e9825e9acb Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 5 Apr 2019 14:02:10 -0700 Subject: fs: prevent page refcount overflow in pipe_buf_get Change pipe_buf_get() to return a bool indicating whether it succeeded in raising the refcount of the page (if the thing in the pipe is a page). This removes another mechanism for overflowing the page refcount. All callers converted to handle a failure. Reported-by: Jann Horn Signed-off-by: Matthew Wilcox Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 5a3bb3b7c9ad..3f2a42c11e20 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -108,18 +108,20 @@ struct pipe_buf_operations { /* * Get a reference to the pipe buffer. */ - void (*get)(struct pipe_inode_info *, struct pipe_buffer *); + bool (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; /** * pipe_buf_get - get a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to + * + * Return: %true if the reference was successfully obtained. */ -static inline void pipe_buf_get(struct pipe_inode_info *pipe, +static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - buf->ops->get(pipe, buf); + return buf->ops->get(pipe, buf); } /** @@ -178,7 +180,7 @@ struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ -void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); +bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); -- cgit v1.2.3 From 0082517fa4bce073e7cf542633439f26538a14cc Mon Sep 17 00:00:00 2001 From: Jian-Hong Pan Date: Fri, 12 Apr 2019 16:01:53 +0800 Subject: x86/reboot, efi: Use EFI reboot for Acer TravelMate X514-51T Upon reboot, the Acer TravelMate X514-51T laptop appears to complete the shutdown process, but then it hangs in BIOS POST with a black screen. The problem is intermittent - at some points it has appeared related to Secure Boot settings or different kernel builds, but ultimately we have not been able to identify the exact conditions that trigger the issue to come and go. Besides, the EFI mode cannot be disabled in the BIOS of this model. However, after extensive testing, we observe that using the EFI reboot method reliably avoids the issue in all cases. So add a boot time quirk to use EFI reboot on such systems. Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=203119 Signed-off-by: Jian-Hong Pan Signed-off-by: Daniel Drake Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux@endlessm.com Link: http://lkml.kernel.org/r/20190412080152.3718-1-jian-hong@endlessm.com [ Fix !CONFIG_EFI build failure, clarify the code and the changelog a bit. ] Signed-off-by: Ingo Molnar --- include/linux/efi.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 54357a258b35..6ebc2098cfe1 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1611,7 +1611,12 @@ efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, struct screen_info *si, efi_guid_t *proto, unsigned long size); -bool efi_runtime_disabled(void); +#ifdef CONFIG_EFI +extern bool efi_runtime_disabled(void); +#else +static inline bool efi_runtime_disabled(void) { return true; } +#endif + extern void efi_call_virt_check_flags(unsigned long flags, const char *call); extern unsigned long efi_call_virt_save_flags(void); -- cgit v1.2.3 From c68d224e5ed15605e651e2482c6ffd95915ddf58 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 8 Apr 2019 10:32:51 -0700 Subject: perf/core: Add perf_pmu_resched() as global function This patch add perf_pmu_resched() a global function that can be called to force rescheduling of events for a given PMU. The function locks both cpuctx and task_ctx internally. This will be used by a subsequent patch. Signed-off-by: Stephane Eranian [ Simplified the calling convention. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Cc: nelson.dsouza@intel.com Cc: tonyj@suse.com Link: https://lkml.kernel.org/r/20190408173252.37932-2-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 085a95e2582a..f3864e1c5569 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -888,6 +888,9 @@ extern void perf_sched_cb_dec(struct pmu *pmu); extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); + +extern void perf_pmu_resched(struct pmu *pmu); + extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); -- cgit v1.2.3 From 1d487e9bf8ba66a7174c56a0029c54b1eca8f99c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Apr 2019 11:16:47 +0200 Subject: KVM: fix spectrev1 gadgets These were found with smatch, and then generalized when applicable. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9d55c63db09b..640a03642766 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -513,10 +514,10 @@ static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { - /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu, in case - * the caller has read kvm->online_vcpus before (as is the case - * for kvm_for_each_vcpu, for example). - */ + int num_vcpus = atomic_read(&kvm->online_vcpus); + i = array_index_nospec(i, num_vcpus); + + /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu. */ smp_rmb(); return kvm->vcpus[i]; } @@ -600,6 +601,7 @@ void kvm_put_kvm(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { + as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM); return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); -- cgit v1.2.3 From 98af8452945c55652de68536afdde3b520fec429 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 12 Apr 2019 15:39:28 -0500 Subject: cpu/speculation: Add 'mitigations=' cmdline option Keeping track of the number of mitigations for all the CPU speculation bugs has become overwhelming for many users. It's getting more and more complicated to decide which mitigations are needed for a given architecture. Complicating matters is the fact that each arch tends to have its own custom way to mitigate the same vulnerability. Most users fall into a few basic categories: a) they want all mitigations off; b) they want all reasonable mitigations on, with SMT enabled even if it's vulnerable; or c) they want all reasonable mitigations on, with SMT disabled if vulnerable. Define a set of curated, arch-independent options, each of which is an aggregation of existing options: - mitigations=off: Disable all mitigations. - mitigations=auto: [default] Enable all the default mitigations, but leave SMT enabled, even if it's vulnerable. - mitigations=auto,nosmt: Enable all the default mitigations, disabling SMT if needed by a mitigation. Currently, these options are placeholders which don't actually do anything. They will be fleshed out in upcoming patches. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Jiri Kosina (on x86) Reviewed-by: Jiri Kosina Cc: Borislav Petkov Cc: "H . Peter Anvin" Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Waiman Long Cc: Andrea Arcangeli Cc: Jon Masters Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux-s390@vger.kernel.org Cc: Catalin Marinas Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-arch@vger.kernel.org Cc: Greg Kroah-Hartman Cc: Tyler Hicks Cc: Linus Torvalds Cc: Randy Dunlap Cc: Steven Price Cc: Phil Auld Link: https://lkml.kernel.org/r/b07a8ef9b7c5055c3a4637c87d07c296d5016fe0.1555085500.git.jpoimboe@redhat.com --- include/linux/cpu.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 5041357d0297..2d9c6f4b78f5 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -187,4 +187,28 @@ static inline void cpu_smt_disable(bool force) { } static inline void cpu_smt_check_topology(void) { } #endif +/* + * These are used for a global "mitigations=" cmdline option for toggling + * optional CPU mitigations. + */ +enum cpu_mitigations { + CPU_MITIGATIONS_OFF, + CPU_MITIGATIONS_AUTO, + CPU_MITIGATIONS_AUTO_NOSMT, +}; + +extern enum cpu_mitigations cpu_mitigations; + +/* mitigations=off */ +static inline bool cpu_mitigations_off(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_OFF; +} + +/* mitigations=auto,nosmt */ +static inline bool cpu_mitigations_auto_nosmt(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; +} + #endif /* _LINUX_CPU_H_ */ -- cgit v1.2.3 From 3771b0fe9dfc3801eac0142d1af6ba94dee83c6c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Mar 2019 13:57:57 +0100 Subject: locking/lockdep: Avoid bogus Clang warning When lockdep is enabled, and -Wuninitialized warnings are enabled, Clang produces a silly warning for every file we compile: In file included from kernel/sched/fair.c:23: kernel/sched/sched.h:1094:15: error: variable 'cookie' is uninitialized when used here [-Werror,-Wuninitialized] rf->cookie = lockdep_pin_lock(&rq->lock); ^~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/lockdep.h:474:60: note: expanded from macro 'lockdep_pin_lock' #define lockdep_pin_lock(l) ({ struct pin_cookie cookie; cookie; }) ^~~~~~ kernel/sched/sched.h:1094:15: note: variable 'cookie' is declared here include/linux/lockdep.h:474:34: note: expanded from macro 'lockdep_pin_lock' #define lockdep_pin_lock(l) ({ struct pin_cookie cookie; cookie; }) ^ As the 'struct pin_cookie' structure is empty in this configuration, there is no need to initialize it for correctness, but it also does not hurt to set it to an empty structure, so do that to avoid the warning. Signed-off-by: Arnd Bergmann Acked-by: Will Deacon Cc: Bart Van Assche Cc: Joel Fernandes (Google) Cc: Linus Torvalds Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Waiman Long Cc: clang-built-linux@googlegroups.com Link: http://lkml.kernel.org/r/20190325125807.1437049-1-arnd@arndb.de Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 79c3873d58ac..21725a91442b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -471,7 +471,7 @@ struct pin_cookie { }; #define NIL_COOKIE (struct pin_cookie){ } -#define lockdep_pin_lock(l) ({ struct pin_cookie cookie; cookie; }) +#define lockdep_pin_lock(l) ({ struct pin_cookie cookie = { }; cookie; }) #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) -- cgit v1.2.3 From 0fcc2bdc8aff6e7feb3222930edb78b4b820cd3e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 2 Apr 2019 13:30:37 +0300 Subject: device property: Add fwnode_graph_get_endpoint_by_id() fwnode_graph_get_endpoint_by_id() is intended for obtaining local endpoints by a given local port. fwnode_graph_get_endpoint_by_id() is slightly different from its OF counterpart, of_graph_get_endpoint_by_regs(): instead of using -1 as a value to indicate that a port or an endpoint number does not matter, it uses flags to look for equal or greater endpoint. The port number is always fixed. It also returns only remote endpoints that belong to an available device, a behaviour that can be turned off with a flag. Signed-off-by: Sakari Ailus [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- include/linux/property.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 65d3420dd5d1..a29369c89e6e 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -13,6 +13,7 @@ #ifndef _LINUX_PROPERTY_H_ #define _LINUX_PROPERTY_H_ +#include #include #include @@ -304,6 +305,23 @@ struct fwnode_handle * fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port, u32 endpoint); +/* + * Fwnode lookup flags + * + * @FWNODE_GRAPH_ENDPOINT_NEXT: In the case of no exact match, look for the + * closest endpoint ID greater than the specified + * one. + * @FWNODE_GRAPH_DEVICE_DISABLED: That the device to which the remote + * endpoint of the given endpoint belongs to, + * may be disabled. + */ +#define FWNODE_GRAPH_ENDPOINT_NEXT BIT(0) +#define FWNODE_GRAPH_DEVICE_DISABLED BIT(1) + +struct fwnode_handle * +fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, + u32 port, u32 endpoint, unsigned long flags); + #define fwnode_graph_for_each_endpoint(fwnode, child) \ for (child = NULL; \ (child = fwnode_graph_get_next_endpoint(fwnode, child)); ) -- cgit v1.2.3 From 83b0b15bcb0f700e7c1d070aae2e7841170a4c33 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 Mar 2019 14:47:54 -0500 Subject: rseq: Remove superfluous rseq_len from task_struct The rseq system call, when invoked with flags of "0" or "RSEQ_FLAG_UNREGISTER" values, expects the rseq_len parameter to be equal to sizeof(struct rseq), which is fixed-size and fixed-layout, specified in uapi linux/rseq.h. Expecting a fixed size for rseq_len is a design choice that ensures multiple libraries and application defining __rseq_abi in the same process agree on its exact size. Considering that this size is and will always be the same value, there is no point in saving this value within task_struct rseq_len. Remove this field from task_struct. No change in functionality intended. Signed-off-by: Mathieu Desnoyers Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ben Maurer Cc: Boqun Feng Cc: Catalin Marinas Cc: Chris Lameter Cc: Dave Watson Cc: H. Peter Anvin Cc: Joel Fernandes Cc: Josh Triplett Cc: Linus Torvalds Cc: Michael Kerrisk Cc: Paul E. McKenney Cc: Paul Turner Cc: Peter Zijlstra Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20190305194755.2602-3-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1549584a1538..50606a6e73d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1057,7 +1057,6 @@ struct task_struct { #ifdef CONFIG_RSEQ struct rseq __user *rseq; - u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically @@ -1855,12 +1854,10 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { if (clone_flags & CLONE_THREAD) { t->rseq = NULL; - t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } else { t->rseq = current->rseq; - t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; } @@ -1869,7 +1866,6 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) static inline void rseq_execve(struct task_struct *t) { t->rseq = NULL; - t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } -- cgit v1.2.3 From 3ff9c075cc767b3060bdac12da72fc94dd7da1b8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 24 Feb 2019 01:49:52 +0900 Subject: x86/kprobes: Verify stack frame on kretprobe Verify the stack frame pointer on kretprobe trampoline handler, If the stack frame pointer does not match, it skips the wrong entry and tries to find correct one. This can happen if user puts the kretprobe on the function which can be used in the path of ftrace user-function call. Such functions should not be probed, so this adds a warning message that reports which function should be blacklisted. Tested-by: Andrea Righi Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/155094059185.6137.15527904013362842072.stgit@devbox Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 201f0f2683f2..9a897256e481 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -173,6 +173,7 @@ struct kretprobe_instance { struct kretprobe *rp; kprobe_opcode_t *ret_addr; struct task_struct *task; + void *fp; char data[0]; }; -- cgit v1.2.3 From af53d3e9e04024885de5b4fda51e5fa362ae2bd8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 18 Apr 2019 17:50:13 -0700 Subject: mm: swapoff: shmem_unuse() stop eviction without igrab() The igrab() in shmem_unuse() looks good, but we forgot that it gives no protection against concurrent unmounting: a point made by Konstantin Khlebnikov eight years ago, and then fixed in 2.6.39 by 778dd893ae78 ("tmpfs: fix race between umount and swapoff"). The current 5.1-rc swapoff is liable to hit "VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day..." followed by GPF. Once again, give up on using igrab(); but don't go back to making such heavy-handed use of shmem_swaplist_mutex as last time: that would spoil the new design, and I expect could deadlock inside shmem_swapin_page(). Instead, shmem_unuse() just raise a "stop_eviction" count in the shmem- specific inode, and shmem_evict_inode() wait for that to go down to 0. Call it "stop_eviction" rather than "swapoff_busy" because it can be put to use for others later (huge tmpfs patches expect to use it). That simplifies shmem_unuse(), protecting it from both unlink and unmount; and in practice lets it locate all the swap in its first try. But do not rely on that: there's still a theoretical case, when shmem_writepage() might have been preempted after its get_swap_page(), before making the swap entry visible to swapoff. [hughd@google.com: remove incorrect list_del()] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904091133570.1898@eggly.anvils Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904081259400.1523@eggly.anvils Fixes: b56a2d8af914 ("mm: rid swapoff of quadratic complexity") Signed-off-by: Hugh Dickins Cc: "Alex Xu (Hello71)" Cc: Huang Ying Cc: Kelley Nielsen Cc: Konstantin Khlebnikov Cc: Rik van Riel Cc: Vineeth Pillai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f3fb1edb3526..20d815a33145 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -21,6 +21,7 @@ struct shmem_inode_info { struct list_head swaplist; /* chain of maybes on swap */ struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ + atomic_t stop_eviction; /* hold when working on inode */ struct inode vfs_inode; }; -- cgit v1.2.3 From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 18 Apr 2019 17:50:52 -0700 Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping The core dumping code has always run without holding the mmap_sem for writing, despite that is the only way to ensure that the entire vma layout will not change from under it. Only using some signal serialization on the processes belonging to the mm is not nearly enough. This was pointed out earlier. For example in Hugh's post from Jul 2017: https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils "Not strictly relevant here, but a related note: I was very surprised to discover, only quite recently, how handle_mm_fault() may be called without down_read(mmap_sem) - when core dumping. That seems a misguided optimization to me, which would also be nice to correct" In particular because the growsdown and growsup can move the vm_start/vm_end the various loops the core dump does around the vma will not be consistent if page faults can happen concurrently. Pretty much all users calling mmget_not_zero()/get_task_mm() and then taking the mmap_sem had the potential to introduce unexpected side effects in the core dumping code. Adding mmap_sem for writing around the ->core_dump invocation is a viable long term fix, but it requires removing all copy user and page faults and to replace them with get_dump_page() for all binary formats which is not suitable as a short term fix. For the time being this solution manually covers the places that can confuse the core dump either by altering the vma layout or the vma flags while it runs. Once ->core_dump runs under mmap_sem for writing the function mmget_still_valid() can be dropped. Allowing mmap_sem protected sections to run in parallel with the coredump provides some minor parallelism advantage to the swapoff code (which seems to be safe enough by never mangling any vma field and can keep doing swapins in parallel to the core dumping) and to some other corner case. In order to facilitate the backporting I added "Fixes: 86039bd3b4e6" however the side effect of this same race condition in /proc/pid/mem should be reproducible since before 2.6.12-rc2 so I couldn't add any other "Fixes:" because there's no hash beyond the git genesis commit. Because find_extend_vma() is the only location outside of the process context that could modify the "mm" structures under mmap_sem for reading, by adding the mmget_still_valid() check to it, all other cases that take the mmap_sem for reading don't need the new check after mmget_not_zero()/get_task_mm(). The expand_stack() in page fault context also doesn't need the new check, because all tasks under core dumping are frozen. Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization") Signed-off-by: Andrea Arcangeli Reported-by: Jann Horn Suggested-by: Oleg Nesterov Acked-by: Peter Xu Reviewed-by: Mike Rapoport Reviewed-by: Oleg Nesterov Reviewed-by: Jann Horn Acked-by: Jason Gunthorpe Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0cd9f10423fb..a3fda9f024c3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,27 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +/* + * This has to be called after a get_task_mm()/mmget_not_zero() + * followed by taking the mmap_sem for writing before modifying the + * vmas or anything the coredump pretends not to change from under it. + * + * NOTE: find_extend_vma() called from GUP context is the only place + * that can modify the "mm" (notably the vm_start/end) under mmap_sem + * for reading and outside the context of the process, so it is also + * the only case that holds the mmap_sem for reading that must call + * this function. Generally if the mmap_sem is hold for reading + * there's no need of this check after get_task_mm()/mmget_not_zero(). + * + * This function can be obsoleted and the check can be removed, after + * the coredump code will hold the mmap_sem for writing before + * invoking the ->core_dump methods. + */ +static inline bool mmget_still_valid(struct mm_struct *mm) +{ + return likely(!mm->core_state); +} + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. -- cgit v1.2.3 From b40fabc05ea047f6af5933d26a5483873340b0d4 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 19 Apr 2019 10:31:27 +0800 Subject: block: kill all_q_node in request_queue all_q_node has not been used since commit 4b855ad37194 ("blk-mq: Create hctx for each present CPU"), so remove it. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Signed-off-by: Hou Tao Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c58a3b2bf00..317ab30d2904 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -548,7 +548,6 @@ struct request_queue { struct rcu_head rcu_head; wait_queue_head_t mq_freeze_wq; struct percpu_ref q_usage_counter; - struct list_head all_q_node; struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; -- cgit v1.2.3 From 6bedf00e55e5dd0a4ed1ad3f06131edd6fb56ec8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 17 Apr 2019 09:11:26 +0800 Subject: block: make sure that bvec length can't be overflow bvec->bv_offset may be bigger than PAGE_SIZE sometimes, such as, when one bio is splitted in the middle of one bvec via bio_split(), and bi_iter.bi_bvec_done is used to build offset of the 1st bvec of remained bio. And the remained bio's bvec may be re-submitted to fs layer via ITER_IBVEC, such as loop and nvme-loop. So we have to make sure that every bvec's offset is less than PAGE_SIZE from bio_for_each_segment_all() because some drivers(loop, nvme-loop) passes the splitted bvec to fs layer via ITER_BVEC. This patch fixes this issue reported by Zhang Yi When running nvme/011. Cc: Christoph Hellwig Cc: Yi Zhang Reported-by: Yi Zhang Reviewed-by: Christoph Hellwig Fixes: 6dc4f100c175 ("block: allow bio_for_each_segment_all() to iterate over multi-page bvec") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bvec.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 3bc91879e1e2..ff13cbc1887d 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -160,8 +160,9 @@ static inline void bvec_advance(const struct bio_vec *bvec, bv->bv_page = nth_page(bv->bv_page, 1); bv->bv_offset = 0; } else { - bv->bv_page = bvec->bv_page; - bv->bv_offset = bvec->bv_offset; + bv->bv_page = bvec_nth_page(bvec->bv_page, bvec->bv_offset / + PAGE_SIZE); + bv->bv_offset = bvec->bv_offset & ~PAGE_MASK; } bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset, bvec->bv_len - iter_all->done); -- cgit v1.2.3 From c2b71462d294cf517a0bc6e4fd6424d7cee5596f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 19 Apr 2019 13:52:38 -0400 Subject: USB: core: Fix bug caused by duplicate interface PM usage counter The syzkaller fuzzer reported a bug in the USB hub driver which turned out to be caused by a negative runtime-PM usage counter. This allowed a hub to be runtime suspended at a time when the driver did not expect it. The symptom is a WARNING issued because the hub's status URB is submitted while it is already active: URB 0000000031fb463e submitted while active WARNING: CPU: 0 PID: 2917 at drivers/usb/core/urb.c:363 The negative runtime-PM usage count was caused by an unfortunate design decision made when runtime PM was first implemented for USB. At that time, USB class drivers were allowed to unbind from their interfaces without balancing the usage counter (i.e., leaving it with a positive count). The core code would take care of setting the counter back to 0 before allowing another driver to bind to the interface. Later on when runtime PM was implemented for the entire kernel, the opposite decision was made: Drivers were required to balance their runtime-PM get and put calls. In order to maintain backward compatibility, however, the USB subsystem adapted to the new implementation by keeping an independent usage counter for each interface and using it to automatically adjust the normal usage counter back to 0 whenever a driver was unbound. This approach involves duplicating information, but what is worse, it doesn't work properly in cases where a USB class driver delays decrementing the usage counter until after the driver's disconnect() routine has returned and the counter has been adjusted back to 0. Doing so would cause the usage counter to become negative. There's even a warning about this in the USB power management documentation! As it happens, this is exactly what the hub driver does. The kick_hub_wq() routine increments the runtime-PM usage counter, and the corresponding decrement is carried out by hub_event() in the context of the hub_wq work-queue thread. This work routine may sometimes run after the driver has been unbound from its interface, and when it does it causes the usage counter to go negative. It is not possible for hub_disconnect() to wait for a pending hub_event() call to finish, because hub_disconnect() is called with the device lock held and hub_event() acquires that lock. The only feasible fix is to reverse the original design decision: remove the duplicate interface-specific usage counter and require USB drivers to balance their runtime PM gets and puts. As far as I know, all existing drivers currently do this. Signed-off-by: Alan Stern Reported-and-tested-by: syzbot+7634edaea4d0b341c625@syzkaller.appspotmail.com CC: Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 5e49e82c4368..ff010d1fd1c7 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -200,7 +200,6 @@ usb_find_last_int_out_endpoint(struct usb_host_interface *alt, * @dev: driver model's view of this device * @usb_dev: if an interface is bound to the USB major, this will point * to the sysfs representation for that device. - * @pm_usage_cnt: PM usage counter for this interface * @reset_ws: Used for scheduling resets from atomic context. * @resetting_device: USB core reset the device, so use alt setting 0 as * current; needs bandwidth alloc after reset. @@ -257,7 +256,6 @@ struct usb_interface { struct device dev; /* interface specific device info */ struct device *usb_dev; - atomic_t pm_usage_cnt; /* usage counter for autosuspend */ struct work_struct reset_ws; /* for resets in atomic context */ }; #define to_usb_interface(d) container_of(d, struct usb_interface, dev) -- cgit v1.2.3 From fe066621c7966fe47fa17c6be6fd81adb3c0509f Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 12 Apr 2019 23:19:11 +0800 Subject: gpio: merrifield: Fix build err without CONFIG_ACPI When building CONFIG_ACPI is not set gcc warn this: drivers/gpio/gpio-merrifield.c: In function mrfld_gpio_get_pinctrl_dev_name: drivers/gpio/gpio-merrifield.c:388:19: error: dereferencing pointer to incomplete type struct acpi_device put_device(&adev->dev); ^~ Reported-by: Hulk Robot Fixes: d00d2109c367 ("gpio: merrifield: Convert to use acpi_dev_get_first_match_dev()") Suggested-by: Andy Shevchenko Signed-off-by: YueHaibing Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 392413075cc0..ca55ae00f8c9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -675,6 +675,8 @@ acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv) return NULL; } +static inline void acpi_dev_put(struct acpi_device *adev) {} + static inline bool is_acpi_node(struct fwnode_handle *fwnode) { return false; -- cgit v1.2.3 From 1c5c12ee308aacf635c8819cd4baa3bd58f8a8b7 Mon Sep 17 00:00:00 2001 From: Tao Ren Date: Wed, 24 Apr 2019 01:43:32 +0000 Subject: net/ncsi: handle overflow when incrementing mac address Previously BMC's MAC address is calculated by simply adding 1 to the last byte of network controller's MAC address, and it produces incorrect result when network controller's MAC address ends with 0xFF. The problem can be fixed by calling eth_addr_inc() function to increment MAC address; besides, the MAC address is also validated before assigning to BMC. Fixes: cb10c7c0dfd9 ("net/ncsi: Add NCSI Broadcom OEM command") Signed-off-by: Tao Ren Acked-by: Jakub Kicinski Acked-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index e2f3b21cd72a..aa8bfd6f738c 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -448,6 +448,18 @@ static inline void eth_addr_dec(u8 *addr) u64_to_ether_addr(u, addr); } +/** + * eth_addr_inc() - Increment the given MAC address. + * @addr: Pointer to a six-byte array containing Ethernet address to increment. + */ +static inline void eth_addr_inc(u8 *addr) +{ + u64 u = ether_addr_to_u64(addr); + + u++; + u64_to_ether_addr(u, addr); +} + /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure -- cgit v1.2.3 From d4645d30b50d1691c26ff0f8fa4e718b08f8d3bb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 Apr 2019 10:52:53 +0200 Subject: smpboot: Place the __percpu annotation correctly The test robot reported a wrong assignment of a per-CPU variable which it detected by using sparse and sent a report. The assignment itself is correct. The annotation for sparse was wrong and hence the report. The first pointer is a "normal" pointer and points to the per-CPU memory area. That means that the __percpu annotation has to be moved. Move the __percpu annotation to pointer which points to the per-CPU area. This change affects only the sparse tool (and is ignored by the compiler). Reported-by: kbuild test robot Signed-off-by: Sebastian Andrzej Siewior Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f97f8f06a49fe ("smpboot: Provide infrastructure for percpu hotplug threads") Link: http://lkml.kernel.org/r/20190424085253.12178-1-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/smpboot.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index d0884b525001..9d1bc65d226c 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -29,7 +29,7 @@ struct smpboot_thread_data; * @thread_comm: The base name of the thread */ struct smp_hotplug_thread { - struct task_struct __percpu **store; + struct task_struct * __percpu *store; struct list_head list; int (*thread_should_run)(unsigned int cpu); void (*thread_fn)(unsigned int cpu); -- cgit v1.2.3 From b88c9f4129dcec941e5a26508e991c08051ed1ac Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 25 Apr 2019 16:28:37 +0300 Subject: clk: Add missing stubs for a few functions Compilation fails if any of undeclared clk_set_*() functions are in use and CONFIG_HAVE_CLK=n. Reported-by: kbuild test robot Signed-off-by: Dmitry Osipenko Signed-off-by: Stephen Boyd --- include/linux/clk.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index d8bc1a856b39..f689fc58d7be 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -811,6 +811,22 @@ static inline bool clk_has_parent(struct clk *clk, struct clk *parent) return true; } +static inline int clk_set_rate_range(struct clk *clk, unsigned long min, + unsigned long max) +{ + return 0; +} + +static inline int clk_set_min_rate(struct clk *clk, unsigned long rate) +{ + return 0; +} + +static inline int clk_set_max_rate(struct clk *clk, unsigned long rate) +{ + return 0; +} + static inline int clk_set_parent(struct clk *clk, struct clk *parent) { return 0; -- cgit v1.2.3 From f9ccd7c3a1d87cea3a6f9ed6c946dee9e7456b2e Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 25 Apr 2019 11:04:13 +0200 Subject: PM / Domains: Allow to attach a CPU via genpd_dev_pm_attach_by_id|name() Attaching a device via genpd_dev_pm_attach_by_id|name() makes genpd allocate a virtual device that it attaches instead. This leads to a problem in case when the base device belongs to a CPU. More precisely, it means genpd_get_cpu() compares against the virtual device, thus it fails to find a matching CPU device. Address this limitation by passing the base device to genpd_get_cpu() rather than the virtual device. Moreover, to deal with detach correctly from genpd_remove_device(), store the CPU number in struct generic_pm_domain_data, so as to be able to clear the corresponding bit in the cpumask for the genpd. Signed-off-by: Ulf Hansson Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index bc82e74560ee..0e8e356bed6a 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -175,6 +175,7 @@ struct generic_pm_domain_data { struct pm_domain_data base; struct gpd_timing_data td; struct notifier_block nb; + int cpu; unsigned int performance_state; void *data; }; -- cgit v1.2.3 From 0edd6b64d1939e9e9168ff27947995bb7751db5d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Apr 2019 21:55:59 +0200 Subject: bpf: Fix preempt_enable_no_resched() abuse Unless the very next line is schedule(), or implies it, one must not use preempt_enable_no_resched(). It can cause a preemption to go missing and thereby cause arbitrary delays, breaking the PREEMPT=y invariant. Cc: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f02367faa58d..944ccc310201 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -510,7 +510,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, } \ _out: \ rcu_read_unlock(); \ - preempt_enable_no_resched(); \ + preempt_enable(); \ _ret; \ }) -- cgit v1.2.3 From b987222654f84f7b4ca95b3a55eca784cb30235b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Apr 2019 23:59:25 +0200 Subject: tracing: Fix buffer_ref pipe ops This fixes multiple issues in buffer_pipe_buf_ops: - The ->steal() handler must not return zero unless the pipe buffer has the only reference to the page. But generic_pipe_buf_steal() assumes that every reference to the pipe is tracked by the page's refcount, which isn't true for these buffers - buffer_pipe_buf_get(), which duplicates a buffer, doesn't touch the page's refcount. Fix it by using generic_pipe_buf_nosteal(), which refuses every attempted theft. It should be easy to actually support ->steal, but the only current users of pipe_buf_steal() are the virtio console and FUSE, and they also only use it as an optimization. So it's probably not worth the effort. - The ->get() and ->release() handlers can be invoked concurrently on pipe buffers backed by the same struct buffer_ref. Make them safe against concurrency by using refcount_t. - The pointers stored in ->private were only zeroed out when the last reference to the buffer_ref was dropped. As far as I know, this shouldn't be necessary anyway, but if we do it, let's always do it. Link: http://lkml.kernel.org/r/20190404215925.253531-1-jannh@google.com Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Al Viro Cc: stable@vger.kernel.org Fixes: 73a757e63114d ("ring-buffer: Return reader page back into existing ring buffer") Signed-off-by: Jann Horn Signed-off-by: Steven Rostedt (VMware) --- include/linux/pipe_fs_i.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 787d224ff43e..a830e9a00eb9 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -174,6 +174,7 @@ void free_pipe_info(struct pipe_inode_info *); void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); +int generic_pipe_buf_nosteal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); void pipe_buf_mark_unmergeable(struct pipe_buffer *buf); -- cgit v1.2.3 From d15d356887e770c5f2dcf963b52c7cb510c9e42d Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 23 Apr 2019 00:26:52 +0800 Subject: perf/x86: Make perf callchains work without CONFIG_FRAME_POINTER Currently perf callchain doesn't work well with ORC unwinder when sampling from trace point. We'll get useless in kernel callchain like this: perf 6429 [000] 22.498450: kmem:mm_page_alloc: page=0x176a17 pfn=1534487 order=0 migratetype=0 gfp_flags=GFP_KERNEL ffffffffbe23e32e __alloc_pages_nodemask+0x22e (/lib/modules/5.1.0-rc3+/build/vmlinux) 7efdf7f7d3e8 __poll+0x18 (/usr/lib64/libc-2.28.so) 5651468729c1 [unknown] (/usr/bin/perf) 5651467ee82a main+0x69a (/usr/bin/perf) 7efdf7eaf413 __libc_start_main+0xf3 (/usr/lib64/libc-2.28.so) 5541f689495641d7 [unknown] ([unknown]) The root cause is that, for trace point events, it doesn't provide a real snapshot of the hardware registers. Instead perf tries to get required caller's registers and compose a fake register snapshot which suppose to contain enough information for start a unwinding. However without CONFIG_FRAME_POINTER, if failed to get caller's BP as the frame pointer, so current frame pointer is returned instead. We get a invalid register combination which confuse the unwinder, and end the stacktrace early. So in such case just don't try dump BP, and let the unwinder start directly when the register is not a real snapshot. Use SP as the skip mark, unwinder will skip all the frames until it meet the frame of the trace point caller. Tested with frame pointer unwinder and ORC unwinder, this makes perf callchain get the full kernel space stacktrace again like this: perf 6503 [000] 1567.570191: kmem:mm_page_alloc: page=0x16c904 pfn=1493252 order=0 migratetype=0 gfp_flags=GFP_KERNEL ffffffffb523e2ae __alloc_pages_nodemask+0x22e (/lib/modules/5.1.0-rc3+/build/vmlinux) ffffffffb52383bd __get_free_pages+0xd (/lib/modules/5.1.0-rc3+/build/vmlinux) ffffffffb52fd28a __pollwait+0x8a (/lib/modules/5.1.0-rc3+/build/vmlinux) ffffffffb521426f perf_poll+0x2f (/lib/modules/5.1.0-rc3+/build/vmlinux) ffffffffb52fe3e2 do_sys_poll+0x252 (/lib/modules/5.1.0-rc3+/build/vmlinux) ffffffffb52ff027 __x64_sys_poll+0x37 (/lib/modules/5.1.0-rc3+/build/vmlinux) ffffffffb500418b do_syscall_64+0x5b (/lib/modules/5.1.0-rc3+/build/vmlinux) ffffffffb5a0008c entry_SYSCALL_64_after_hwframe+0x44 (/lib/modules/5.1.0-rc3+/build/vmlinux) 7f71e92d03e8 __poll+0x18 (/usr/lib64/libc-2.28.so) 55a22960d9c1 [unknown] (/usr/bin/perf) 55a22958982a main+0x69a (/usr/bin/perf) 7f71e9202413 __libc_start_main+0xf3 (/usr/lib64/libc-2.28.so) 5541f689495641d7 [unknown] ([unknown]) Co-developed-by: Josh Poimboeuf Signed-off-by: Kairui Song Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dave Young Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190422162652.15483-1-kasong@redhat.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f3864e1c5569..cf023db0e8a2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1058,12 +1058,18 @@ static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned lo #endif /* - * Take a snapshot of the regs. Skip ip and frame pointer to - * the nth caller. We only need a few of the regs: + * When generating a perf sample in-line, instead of from an interrupt / + * exception, we lack a pt_regs. This is typically used from software events + * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. + * + * We typically don't need a full set, but (for x86) do require: * - ip for PERF_SAMPLE_IP * - cs for user_mode() tests - * - bp for callchains - * - eflags, for future purposes, just in case + * - sp for PERF_SAMPLE_CALLCHAIN + * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) + * + * NOTE: assumes @regs is otherwise already 0 filled; this is important for + * things like PERF_SAMPLE_REGS_INTR. */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { -- cgit v1.2.3 From ad282a8117d5048398f506f20b092c14b3b3c43f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Mar 2019 17:08:52 -0700 Subject: locking/static_key: Add support for deferred static branches Add deferred static branches. We can't unfortunately use the nice trick of encapsulating the entire structure in true/false variants, because the inside has to be either struct static_key_true or struct static_key_false. Use defines to pass the appropriate members to the helpers separately. Signed-off-by: Jakub Kicinski Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Simon Horman Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: alexei.starovoitov@gmail.com Cc: ard.biesheuvel@linaro.org Cc: oss-drivers@netronome.com Cc: yamada.masahiro@socionext.com Link: https://lkml.kernel.org/r/20190330000854.30142-2-jakub.kicinski@netronome.com Signed-off-by: Ingo Molnar --- include/linux/jump_label_ratelimit.h | 64 ++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h index a49f2b45b3f0..42710d5949ba 100644 --- a/include/linux/jump_label_ratelimit.h +++ b/include/linux/jump_label_ratelimit.h @@ -12,21 +12,79 @@ struct static_key_deferred { struct delayed_work work; }; -extern void static_key_slow_dec_deferred(struct static_key_deferred *key); -extern void static_key_deferred_flush(struct static_key_deferred *key); +struct static_key_true_deferred { + struct static_key_true key; + unsigned long timeout; + struct delayed_work work; +}; + +struct static_key_false_deferred { + struct static_key_false key; + unsigned long timeout; + struct delayed_work work; +}; + +#define static_key_slow_dec_deferred(x) \ + __static_key_slow_dec_deferred(&(x)->key, &(x)->work, (x)->timeout) +#define static_branch_slow_dec_deferred(x) \ + __static_key_slow_dec_deferred(&(x)->key.key, &(x)->work, (x)->timeout) + +#define static_key_deferred_flush(x) \ + __static_key_deferred_flush((x), &(x)->work) + +extern void +__static_key_slow_dec_deferred(struct static_key *key, + struct delayed_work *work, + unsigned long timeout); +extern void __static_key_deferred_flush(void *key, struct delayed_work *work); extern void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); +extern void jump_label_update_timeout(struct work_struct *work); + +#define DEFINE_STATIC_KEY_DEFERRED_TRUE(name, rl) \ + struct static_key_true_deferred name = { \ + .key = { STATIC_KEY_INIT_TRUE }, \ + .timeout = (rl), \ + .work = __DELAYED_WORK_INITIALIZER((name).work, \ + jump_label_update_timeout, \ + 0), \ + } + +#define DEFINE_STATIC_KEY_DEFERRED_FALSE(name, rl) \ + struct static_key_false_deferred name = { \ + .key = { STATIC_KEY_INIT_FALSE }, \ + .timeout = (rl), \ + .work = __DELAYED_WORK_INITIALIZER((name).work, \ + jump_label_update_timeout, \ + 0), \ + } + +#define static_branch_deferred_inc(x) static_branch_inc(&(x)->key) + #else /* !CONFIG_JUMP_LABEL */ struct static_key_deferred { struct static_key key; }; +struct static_key_true_deferred { + struct static_key_true key; +}; +struct static_key_false_deferred { + struct static_key_false key; +}; +#define DEFINE_STATIC_KEY_DEFERRED_TRUE(name, rl) \ + struct static_key_true_deferred name = { STATIC_KEY_TRUE_INIT } +#define DEFINE_STATIC_KEY_DEFERRED_FALSE(name, rl) \ + struct static_key_false_deferred name = { STATIC_KEY_FALSE_INIT } + +#define static_branch_slow_dec_deferred(x) static_branch_dec(&(x)->key) + static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) { STATIC_KEY_CHECK_USE(key); static_key_slow_dec(&key->key); } -static inline void static_key_deferred_flush(struct static_key_deferred *key) +static inline void static_key_deferred_flush(void *key) { STATIC_KEY_CHECK_USE(key); } -- cgit v1.2.3 From 268a78404973594d1a7ec3a2b6a2474e0543a435 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Tue, 26 Mar 2019 15:45:53 +0100 Subject: s390/kexec_file: Disable kexec_load when IPLed secure A kernel loaded via kexec_load cannot be verified. Thus disable kexec_load systemcall in kernels which where IPLed securely. Use the IMA mechanism to do so. Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- include/linux/ima.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ima.h b/include/linux/ima.h index dc12fbcf484c..fd9f7cf4cdf5 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -31,7 +31,7 @@ extern void ima_post_path_mknod(struct dentry *dentry); extern void ima_add_kexec_buffer(struct kimage *image); #endif -#if defined(CONFIG_X86) && defined(CONFIG_EFI) +#if (defined(CONFIG_X86) && defined(CONFIG_EFI)) || defined(CONFIG_S390) extern bool arch_ima_get_secureboot(void); extern const char * const *arch_get_ima_policy(void); #else -- cgit v1.2.3 From 3d9a8072915366b5932beeed97f158f8d4955768 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:44:54 +0200 Subject: tracing: Cleanup stack trace code - Remove the extra array member of stack_dump_trace[] along with the ARRAY_SIZE - 1 initialization for struct stack_trace :: max_entries. Both are historical leftovers of no value. The stack tracer never exceeds the array and there is no extra storage requirement either. - Make variables which are only used in trace_stack.c static. - Simplify the enable/disable logic. - Rename stack_trace_print() as it's using the stack_trace_ namespace. Free the name up for stack trace related functions. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094801.230654524@linutronix.de --- include/linux/ftrace.h | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 730876187344..20899919ead8 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -241,21 +241,11 @@ static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { #ifdef CONFIG_STACK_TRACER -#define STACK_TRACE_ENTRIES 500 - -struct stack_trace; - -extern unsigned stack_trace_index[]; -extern struct stack_trace stack_trace_max; -extern unsigned long stack_trace_max_size; -extern arch_spinlock_t stack_trace_max_lock; - extern int stack_tracer_enabled; -void stack_trace_print(void); -int -stack_trace_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + +int stack_trace_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); /* DO NOT MODIFY THIS VARIABLE DIRECTLY! */ DECLARE_PER_CPU(int, disable_stack_tracer); -- cgit v1.2.3 From e9b98e162aa53cbea7c8b0d6c9d5dc6e0f822b9c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:44:55 +0200 Subject: stacktrace: Provide helpers for common stack trace operations All operations with stack traces are based on struct stack_trace. That's a horrible construct as the struct is a kitchen sink for input and output. Quite some usage sites embed it into their own data structures which creates weird indirections. There is absolutely no point in doing so. For all use cases a storage array and the number of valid stack trace entries in the array is sufficient. Provide helper functions which avoid the struct stack_trace indirection so the usage sites can be cleaned up. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094801.324810708@linutronix.de --- include/linux/stacktrace.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index ba29a0613e66..a24340b3e9e1 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -3,11 +3,26 @@ #define __LINUX_STACKTRACE_H #include +#include struct task_struct; struct pt_regs; #ifdef CONFIG_STACKTRACE +void stack_trace_print(unsigned long *trace, unsigned int nr_entries, + int spaces); +int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, + unsigned int nr_entries, int spaces); +unsigned int stack_trace_save(unsigned long *store, unsigned int size, + unsigned int skipnr); +unsigned int stack_trace_save_tsk(struct task_struct *task, + unsigned long *store, unsigned int size, + unsigned int skipnr); +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, + unsigned int size, unsigned int skipnr); +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size); + +/* Internal interfaces. Do not use in generic code */ struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; @@ -41,4 +56,16 @@ extern void save_stack_trace_user(struct stack_trace *trace); # define save_stack_trace_tsk_reliable(tsk, trace) ({ -ENOSYS; }) #endif /* CONFIG_STACKTRACE */ +#if defined(CONFIG_STACKTRACE) && defined(CONFIG_HAVE_RELIABLE_STACKTRACE) +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, + unsigned int size); +#else +static inline int stack_trace_save_tsk_reliable(struct task_struct *tsk, + unsigned long *store, + unsigned int size) +{ + return -ENOSYS; +} +#endif + #endif /* __LINUX_STACKTRACE_H */ -- cgit v1.2.3 From c0cfc337264c5e02e0bc79de6b62857999588879 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:44:56 +0200 Subject: lib/stackdepot: Provide functions which operate on plain storage arrays The struct stack_trace indirection in the stack depot functions is a truly pointless excercise which requires horrible code at the callsites. Provide interfaces based on plain storage arrays. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Alexander Potapenko Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094801.414574828@linutronix.de --- include/linux/stackdepot.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 7978b3e2c1e1..4297c6d2991d 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -26,7 +26,11 @@ typedef u32 depot_stack_handle_t; struct stack_trace; depot_stack_handle_t depot_save_stack(struct stack_trace *trace, gfp_t flags); +depot_stack_handle_t stack_depot_save(unsigned long *entries, + unsigned int nr_entries, gfp_t gfp_flags); void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace); +unsigned int stack_depot_fetch(depot_stack_handle_t handle, + unsigned long **entries); #endif -- cgit v1.2.3 From c120bce78065cbea460a58b1572c215db9c148ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:12 +0200 Subject: lockdep: Simplify stack trace handling Replace the indirection through struct stack_trace by using the storage array based interfaces and storing the information is a small lockdep specific data structure. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094802.891724020@linutronix.de --- include/linux/lockdep.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 79c3873d58ac..6f165d625320 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -66,6 +66,11 @@ struct lock_class_key { extern struct lock_class_key __lockdep_no_validate__; +struct lock_trace { + unsigned int nr_entries; + unsigned int offset; +}; + #define LOCKSTAT_POINTS 4 /* @@ -100,7 +105,7 @@ struct lock_class { * IRQ/softirq usage tracking bits: */ unsigned long usage_mask; - struct stack_trace usage_traces[XXX_LOCK_USAGE_STATES]; + struct lock_trace usage_traces[XXX_LOCK_USAGE_STATES]; /* * Generation counter, when doing certain classes of graph walking, @@ -188,7 +193,7 @@ struct lock_list { struct list_head entry; struct lock_class *class; struct lock_class *links_to; - struct stack_trace trace; + struct lock_trace trace; int distance; /* -- cgit v1.2.3 From 988ec8841ca1e22b2978fce0134d8267e838770e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:19 +0200 Subject: stacktrace: Remove obsolete functions No more users of the struct stack_trace based interfaces. Remove them. Remove the macro stubs for !CONFIG_STACKTRACE as well as they are pointless because the storage on the call sites is conditional on CONFIG_STACKTRACE already. No point to be 'smart'. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.524796783@linutronix.de --- include/linux/stacktrace.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index a24340b3e9e1..40decfbb9a24 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -36,24 +36,7 @@ extern void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace); extern int save_stack_trace_tsk_reliable(struct task_struct *tsk, struct stack_trace *trace); - -extern void print_stack_trace(struct stack_trace *trace, int spaces); -extern int snprint_stack_trace(char *buf, size_t size, - struct stack_trace *trace, int spaces); - -#ifdef CONFIG_USER_STACKTRACE_SUPPORT extern void save_stack_trace_user(struct stack_trace *trace); -#else -# define save_stack_trace_user(trace) do { } while (0) -#endif - -#else /* !CONFIG_STACKTRACE */ -# define save_stack_trace(trace) do { } while (0) -# define save_stack_trace_tsk(tsk, trace) do { } while (0) -# define save_stack_trace_user(trace) do { } while (0) -# define print_stack_trace(trace, spaces) do { } while (0) -# define snprint_stack_trace(buf, size, trace, spaces) do { } while (0) -# define save_stack_trace_tsk_reliable(tsk, trace) ({ -ENOSYS; }) #endif /* CONFIG_STACKTRACE */ #if defined(CONFIG_STACKTRACE) && defined(CONFIG_HAVE_RELIABLE_STACKTRACE) -- cgit v1.2.3 From 56d8f079c51afc8b564b9fb0252d48e7b437c1e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:20 +0200 Subject: lib/stackdepot: Remove obsolete functions No more users of the struct stack_trace based interfaces. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Alexander Potapenko Cc: Andy Lutomirski Cc: Steven Rostedt Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Cc: linux-arch@vger.kernel.org Link: https://lkml.kernel.org/r/20190425094803.617937448@linutronix.de --- include/linux/stackdepot.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 4297c6d2991d..0805dee1b6b8 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -23,13 +23,9 @@ typedef u32 depot_stack_handle_t; -struct stack_trace; - -depot_stack_handle_t depot_save_stack(struct stack_trace *trace, gfp_t flags); depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); -void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace); unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); -- cgit v1.2.3 From 214d8ca6ee854f696f75e75511fe66b409e656db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Apr 2019 11:45:21 +0200 Subject: stacktrace: Provide common infrastructure All architectures which support stacktrace carry duplicated code and do the stack storage and filtering at the architecture side. Provide a consolidated interface with a callback function for consuming the stack entries provided by the architecture specific stack walker. This removes lots of duplicated code and allows to implement better filtering than 'skip number of entries' in the future without touching any architecture specific code. Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: linux-arch@vger.kernel.org Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: linux-mm@kvack.org Cc: David Rientjes Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: kasan-dev@googlegroups.com Cc: Mike Rapoport Cc: Akinobu Mita Cc: Christoph Hellwig Cc: iommu@lists.linux-foundation.org Cc: Robin Murphy Cc: Marek Szyprowski Cc: Johannes Thumshirn Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: linux-btrfs@vger.kernel.org Cc: dm-devel@redhat.com Cc: Mike Snitzer Cc: Alasdair Kergon Cc: Daniel Vetter Cc: intel-gfx@lists.freedesktop.org Cc: Joonas Lahtinen Cc: Maarten Lankhorst Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Tom Zanussi Cc: Miroslav Benes Link: https://lkml.kernel.org/r/20190425094803.713568606@linutronix.de --- include/linux/stacktrace.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 40decfbb9a24..f0cfd12cb45e 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -23,6 +23,44 @@ unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, unsigned int stack_trace_save_user(unsigned long *store, unsigned int size); /* Internal interfaces. Do not use in generic code */ +#ifdef CONFIG_ARCH_STACKWALK + +/** + * stack_trace_consume_fn - Callback for arch_stack_walk() + * @cookie: Caller supplied pointer handed back by arch_stack_walk() + * @addr: The stack entry address to consume + * @reliable: True when the stack entry is reliable. Required by + * some printk based consumers. + * + * Return: True, if the entry was consumed or skipped + * False, if there is no space left to store + */ +typedef bool (*stack_trace_consume_fn)(void *cookie, unsigned long addr, + bool reliable); +/** + * arch_stack_walk - Architecture specific function to walk the stack + * @consume_entry: Callback which is invoked by the architecture code for + * each entry. + * @cookie: Caller supplied pointer which is handed back to + * @consume_entry + * @task: Pointer to a task struct, can be NULL + * @regs: Pointer to registers, can be NULL + * + * ============ ======= ============================================ + * task regs + * ============ ======= ============================================ + * task NULL Stack trace from task (can be current) + * current regs Stack trace starting on regs->stackpointer + * ============ ======= ============================================ + */ +void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task, struct pt_regs *regs); +int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task); +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs); + +#else /* CONFIG_ARCH_STACKWALK */ struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; @@ -37,6 +75,7 @@ extern void save_stack_trace_tsk(struct task_struct *tsk, extern int save_stack_trace_tsk_reliable(struct task_struct *tsk, struct stack_trace *trace); extern void save_stack_trace_user(struct stack_trace *trace); +#endif /* !CONFIG_ARCH_STACKWALK */ #endif /* CONFIG_STACKTRACE */ #if defined(CONFIG_STACKTRACE) && defined(CONFIG_HAVE_RELIABLE_STACKTRACE) -- cgit v1.2.3 From aad42dd44db086c79ca3f470ad563d2ac4ac218d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 26 Apr 2019 16:22:44 -0700 Subject: uprobes: Initialize uprobes earlier In order to have a separate address space for text poking, we need to duplicate init_mm early during start_kernel(). This, however, introduces a problem since uprobes functions are called from dup_mmap(), but uprobes is still not initialized in this early stage. Since uprobes initialization is necassary for fork, and since all the dependant initialization has been done when fork is initialized (percpu and vmalloc), move uprobes initialization to fork_init(). It does not seem uprobes introduces any security problem for the poking_mm. Crash and burn if uprobes initialization fails, similarly to other early initializations. Change the init_probes() name to probes_init() to match other early initialization functions name convention. Reported-by: kernel test robot Signed-off-by: Nadav Amit Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rick Edgecombe Cc: Rik van Riel Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: ard.biesheuvel@linaro.org Cc: deneen.t.dock@intel.com Cc: kernel-hardening@lists.openwall.com Cc: kristen@linux.intel.com Cc: linux_dti@icloud.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190426232303.28381-6-nadav.amit@gmail.com Signed-off-by: Ingo Molnar --- include/linux/uprobes.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 103a48a48872..12bf0b68ed92 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -115,6 +115,7 @@ struct uprobes_state { struct xol_area *xol_area; }; +extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern bool is_swbp_insn(uprobe_opcode_t *insn); @@ -154,6 +155,10 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, struct uprobes_state { }; +static inline void uprobes_init(void) +{ +} + #define uprobe_get_trap_addr(regs) instruction_pointer(regs) static inline int -- cgit v1.2.3 From 13585fa0668c724efab9635aaeef6ec390217415 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Apr 2019 17:11:25 -0700 Subject: fork: Provide a function for copying init_mm Provide a function for copying init_mm. This function will be later used for setting a temporary mm. Tested-by: Masami Hiramatsu Signed-off-by: Nadav Amit Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-6-namit@vmware.com Signed-off-by: Ingo Molnar --- include/linux/sched/task.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 2e97a2227045..f1227f2c38a4 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -76,6 +76,7 @@ extern void exit_itimers(struct signal_struct *); extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); +struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); -- cgit v1.2.3 From f2c65fb3221adc6b73b0549fc7ba892022db9797 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 25 Apr 2019 17:11:31 -0700 Subject: x86/modules: Avoid breaking W^X while loading modules When modules and BPF filters are loaded, there is a time window in which some memory is both writable and executable. An attacker that has already found another vulnerability (e.g., a dangling pointer) might be able to exploit this behavior to overwrite kernel code. Prevent having writable executable PTEs in this stage. In addition, avoiding having W+X mappings can also slightly simplify the patching of modules code on initialization (e.g., by alternatives and static-key), as would be done in the next patch. This was actually the main motivation for this patch. To avoid having W+X mappings, set them initially as RW (NX) and after they are set as RO set them as X as well. Setting them as executable is done as a separate step to avoid one core in which the old PTE is cached (hence writable), and another which sees the updated PTE (executable), which would break the W^X protection. Suggested-by: Thomas Gleixner Suggested-by: Andy Lutomirski Signed-off-by: Nadav Amit Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Jessica Yu Cc: Kees Cook Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Rik van Riel Link: https://lkml.kernel.org/r/20190426001143.4983-12-namit@vmware.com Signed-off-by: Ingo Molnar --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6074aa064b54..14ec3bdad9a9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -746,6 +746,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_memory_ro((unsigned long)hdr, hdr->pages); + set_memory_x((unsigned long)hdr, hdr->pages); } static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) -- cgit v1.2.3 From d253ca0c3865a8d9a8c01143cf20425e0be4d0ce Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:34 -0700 Subject: x86/mm/cpa: Add set_direct_map_*() functions Add two new functions set_direct_map_default_noflush() and set_direct_map_invalid_noflush() for setting the direct map alias for the page to its default valid permissions and to an invalid state that cannot be cached in a TLB, respectively. These functions do not flush the TLB. Note, __kernel_map_pages() does something similar but flushes the TLB and doesn't reset the permission bits to default on all architectures. Also add an ARCH config ARCH_HAS_SET_DIRECT_MAP for specifying whether these have an actual implementation or a default empty one. Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nadav Amit Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-15-namit@vmware.com Signed-off-by: Ingo Molnar --- include/linux/set_memory.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 2a986d282a97..b5071497b8cb 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -17,6 +17,17 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif +#ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP +static inline int set_direct_map_invalid_noflush(struct page *page) +{ + return 0; +} +static inline int set_direct_map_default_noflush(struct page *page) +{ + return 0; +} +#endif + #ifndef set_mce_nospec static inline int set_mce_nospec(unsigned long pfn) { -- cgit v1.2.3 From d63326928611600ad65baff54a70f53b02b3cdfe Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:35 -0700 Subject: mm/hibernation: Make hibernation handle unmapped pages Make hibernate handle unmapped pages on the direct map when CONFIG_ARCH_HAS_SET_ALIAS=y is set. These functions allow for setting pages to invalid configurations, so now hibernate should check if the pages have valid mappings and handle if they are unmapped when doing a hibernate save operation. Previously this checking was already done when CONFIG_DEBUG_PAGEALLOC=y was configured. It does not appear to have a big hibernating performance impact. The speed of the saving operation before this change was measured as 819.02 MB/s, and after was measured at 813.32 MB/s. Before: [ 4.670938] PM: Wrote 171996 kbytes in 0.21 seconds (819.02 MB/s) After: [ 4.504714] PM: Wrote 178932 kbytes in 0.22 seconds (813.32 MB/s) Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Acked-by: Pavel Machek Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nadav Amit Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-16-namit@vmware.com Signed-off-by: Ingo Molnar --- include/linux/mm.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b10c21630f5..083d7b4863ed 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2610,37 +2610,31 @@ static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } #endif -#ifdef CONFIG_DEBUG_PAGEALLOC extern bool _debug_pagealloc_enabled; -extern void __kernel_map_pages(struct page *page, int numpages, int enable); static inline bool debug_pagealloc_enabled(void) { - return _debug_pagealloc_enabled; + return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled; } +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) +extern void __kernel_map_pages(struct page *page, int numpages, int enable); + static inline void kernel_map_pages(struct page *page, int numpages, int enable) { - if (!debug_pagealloc_enabled()) - return; - __kernel_map_pages(page, numpages, enable); } #ifdef CONFIG_HIBERNATION extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ -#else /* CONFIG_DEBUG_PAGEALLOC */ +#else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ static inline void kernel_map_pages(struct page *page, int numpages, int enable) {} #ifdef CONFIG_HIBERNATION static inline bool kernel_page_present(struct page *page) { return true; } #endif /* CONFIG_HIBERNATION */ -static inline bool debug_pagealloc_enabled(void) -{ - return false; -} -#endif /* CONFIG_DEBUG_PAGEALLOC */ +#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); -- cgit v1.2.3 From 868b104d7379e28013e9d48bdd2db25e0bdcf751 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:36 -0700 Subject: mm/vmalloc: Add flag for freeing of special permsissions Add a new flag VM_FLUSH_RESET_PERMS, for enabling vfree operations to immediately clear executable TLB entries before freeing pages, and handle resetting permissions on the directmap. This flag is useful for any kind of memory with elevated permissions, or where there can be related permissions changes on the directmap. Today this is RO+X and RO memory. Although this enables directly vfreeing non-writeable memory now, non-writable memory cannot be freed in an interrupt because the allocation itself is used as a node on deferred free list. So when RO memory needs to be freed in an interrupt the code doing the vfree needs to have its own work queue, as was the case before the deferred vfree list was added to vmalloc. For architectures with set_direct_map_ implementations this whole operation can be done with one TLB flush when centralized like this. For others with directmap permissions, currently only arm64, a backup method using set_memory functions is used to reset the directmap. When arm64 adds set_direct_map_ functions, this backup can be removed. When the TLB is flushed to both remove TLB entries for the vmalloc range mapping and the direct map permissions, the lazy purge operation could be done to try to save a TLB flush later. However today vm_unmap_aliases could flush a TLB range that does not include the directmap. So a helper is added with extra parameters that can allow both the vmalloc address and the direct mapping to be flushed during this operation. The behavior of the normal vm_unmap_aliases function is unchanged. Suggested-by: Dave Hansen Suggested-by: Andy Lutomirski Suggested-by: Will Deacon Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nadav Amit Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-17-namit@vmware.com Signed-off-by: Ingo Molnar --- include/linux/vmalloc.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 398e9c95cd61..c6eebb839552 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -21,6 +21,11 @@ struct notifier_block; /* in notifier.h */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ +/* + * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with + * vfree_atomic(). + */ +#define VM_FLUSH_RESET_PERMS 0x00000100 /* Reset direct map and flush TLB on unmap */ /* bits [20..32] reserved for arch specific ioremap internals */ /* @@ -142,6 +147,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); +static inline void set_vm_flush_reset_perms(void *addr) +{ + struct vm_struct *vm = find_vm_area(addr); + + if (vm) + vm->flags |= VM_FLUSH_RESET_PERMS; +} #else static inline int map_kernel_range_noflush(unsigned long start, unsigned long size, @@ -157,6 +169,9 @@ static inline void unmap_kernel_range(unsigned long addr, unsigned long size) { } +static inline void set_vm_flush_reset_perms(void *addr) +{ +} #endif /* Allocate/destroy a 'vmalloc' VM area. */ -- cgit v1.2.3 From d53d2f78ceadba081fc7785570798c3c8d50a718 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 25 Apr 2019 17:11:38 -0700 Subject: bpf: Use vmalloc special flag Use new flag VM_FLUSH_RESET_PERMS for handling freeing of special permissioned memory in vmalloc and remove places where memory was set RW before freeing which is no longer needed. Don't track if the memory is RO anymore because it is now tracked in vmalloc. Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Cc: Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Daniel Borkmann Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nadav Amit Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190426001143.4983-19-namit@vmware.com Signed-off-by: Ingo Molnar --- include/linux/filter.h | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 14ec3bdad9a9..7d3abde3f183 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -503,7 +504,6 @@ struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ jit_requested:1,/* archs need to JIT the prog */ - undo_set_mem:1, /* Passed set_memory_ro() checkpoint */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ @@ -733,27 +733,17 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { - fp->undo_set_mem = 1; + set_vm_flush_reset_perms(fp); set_memory_ro((unsigned long)fp, fp->pages); } -static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) -{ - if (fp->undo_set_mem) - set_memory_rw((unsigned long)fp, fp->pages); -} - static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { + set_vm_flush_reset_perms(hdr); set_memory_ro((unsigned long)hdr, hdr->pages); set_memory_x((unsigned long)hdr, hdr->pages); } -static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) -{ - set_memory_rw((unsigned long)hdr, hdr->pages); -} - static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) { @@ -789,7 +779,6 @@ void __bpf_prog_free(struct bpf_prog *fp); static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { - bpf_prog_unlock_ro(fp); __bpf_prog_free(fp); } -- cgit v1.2.3 From f5eb4d3b92a6a1096ef3480b54782a9409281300 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 26 Apr 2019 18:45:21 +0800 Subject: iov_iter: fix iov_iter_type Commit 875f1d0769cd ("iov_iter: add ITER_BVEC_FLAG_NO_REF flag") introduces one extra flag of ITER_BVEC_FLAG_NO_REF, and this flag is stored into iter->type. However, iov_iter_type() doesn't consider the new added flag, fix it by masking this flag in iov_iter_type(). Fixes: 875f1d0769cd ("iov_iter: add ITER_BVEC_FLAG_NO_REF flag") Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/uio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index f184af1999a8..2d0131ad4604 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -60,7 +60,7 @@ struct iov_iter { static inline enum iter_type iov_iter_type(const struct iov_iter *i) { - return i->type & ~(READ | WRITE); + return i->type & ~(READ | WRITE | ITER_BVEC_FLAG_NO_REF); } static inline bool iter_is_iovec(const struct iov_iter *i) -- cgit v1.2.3 From 72e830f68428ab9ea9eca65d160795f4e02cecfc Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 3 May 2019 11:55:36 +0300 Subject: perf/x86/intel/pt: Remove software double buffering PMU capability Now that all AUX allocations are high-order by default, the software double buffering PMU capability doesn't make sense any more, get rid of it. In case some PMUs choose to opt out, we can re-introduce it. Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: adrian.hunter@intel.com Link: http://lkml.kernel.org/r/20190503085536.24119-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e47ef764f613..1f678f023850 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -240,7 +240,6 @@ struct perf_event; #define PERF_PMU_CAP_NO_INTERRUPT 0x01 #define PERF_PMU_CAP_NO_NMI 0x02 #define PERF_PMU_CAP_AUX_NO_SG 0x04 -#define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 #define PERF_PMU_CAP_EXCLUSIVE 0x10 #define PERF_PMU_CAP_ITRACE 0x20 #define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x40 -- cgit v1.2.3 From 2f1a6fbbef7781382850c3104ecb658f21b5d460 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:45 +1000 Subject: power/suspend: Add function to disable secondaries for suspend This adds a function to disable secondary CPUs for suspend that are not necessarily non-zero / non-boot CPUs. Platforms will be able to use this to suspend using non-zero CPUs. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-3-npiggin@gmail.com Signed-off-by: Ingo Molnar --- include/linux/cpu.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 5041357d0297..7ab2f09c0a14 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -137,9 +137,21 @@ static inline int disable_nonboot_cpus(void) return freeze_secondary_cpus(0); } extern void enable_nonboot_cpus(void); + +static inline int suspend_disable_secondary_cpus(void) +{ + return freeze_secondary_cpus(0); +} +static inline void suspend_enable_secondary_cpus(void) +{ + return enable_nonboot_cpus(); +} + #else /* !CONFIG_PM_SLEEP_SMP */ static inline int disable_nonboot_cpus(void) { return 0; } static inline void enable_nonboot_cpus(void) {} +static inline int suspend_disable_secondary_cpus(void) { return 0; } +static inline void suspend_enable_secondary_cpus(void) { } #endif /* !CONFIG_PM_SLEEP_SMP */ void cpu_startup_entry(enum cpuhp_state state); -- cgit v1.2.3 From 9ca12ac04bb7d7cfb28aa549dcd3d15761f15543 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Apr 2019 13:34:46 +1000 Subject: kernel/cpu: Allow non-zero CPU to be primary for suspend / kexec freeze This patch provides an arch option, ARCH_SUSPEND_NONZERO_CPU, to opt-in to allowing suspend to occur on one of the housekeeping CPUs rather than hardcoded CPU0. This will allow CPU0 to be a nohz_full CPU with a later change. It may be possible for platforms with hardware/firmware restrictions on suspend/wake effectively support this by handing off the final stage to CPU0 when kernel housekeeping is no longer required. Another option is to make housekeeping / nohz_full mask dynamic at runtime, but the complexity could not be justified at this time. Signed-off-by: Nicholas Piggin Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org Link: https://lkml.kernel.org/r/20190411033448.20842-4-npiggin@gmail.com Signed-off-by: Ingo Molnar --- include/linux/cpu.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 7ab2f09c0a14..73baab8535c1 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -140,7 +140,12 @@ extern void enable_nonboot_cpus(void); static inline int suspend_disable_secondary_cpus(void) { - return freeze_secondary_cpus(0); + int cpu = 0; + + if (IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) + cpu = -1; + + return freeze_secondary_cpus(cpu); } static inline void suspend_enable_secondary_cpus(void) { -- cgit v1.2.3