From 6549c46af8551b346bcc0b9043f93848319acd5c Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Sun, 27 Jun 2021 16:04:18 +0800 Subject: regulator: rt5033: Fix n_voltages settings for BUCK and LDO For linear regulators, the n_voltages should be (max - min) / step + 1. Buck voltage from 1v to 3V, per step 100mV, and vout mask is 0x1f. If value is from 20 to 31, the voltage will all be fixed to 3V. And LDO also, just vout range is different from 1.2v to 3v, step is the same. If value is from 18 to 31, the voltage will also be fixed to 3v. Signed-off-by: Axel Lin Reviewed-by: ChiYuan Huang Link: https://lore.kernel.org/r/20210627080418.1718127-1-axel.lin@ingics.com Signed-off-by: Mark Brown --- include/linux/mfd/rt5033-private.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/rt5033-private.h b/include/linux/mfd/rt5033-private.h index 2d1895c3efbf..40a0c2dfb80f 100644 --- a/include/linux/mfd/rt5033-private.h +++ b/include/linux/mfd/rt5033-private.h @@ -200,13 +200,13 @@ enum rt5033_reg { #define RT5033_REGULATOR_BUCK_VOLTAGE_MIN 1000000U #define RT5033_REGULATOR_BUCK_VOLTAGE_MAX 3000000U #define RT5033_REGULATOR_BUCK_VOLTAGE_STEP 100000U -#define RT5033_REGULATOR_BUCK_VOLTAGE_STEP_NUM 32 +#define RT5033_REGULATOR_BUCK_VOLTAGE_STEP_NUM 21 /* RT5033 regulator LDO output voltage uV */ #define RT5033_REGULATOR_LDO_VOLTAGE_MIN 1200000U #define RT5033_REGULATOR_LDO_VOLTAGE_MAX 3000000U #define RT5033_REGULATOR_LDO_VOLTAGE_STEP 100000U -#define RT5033_REGULATOR_LDO_VOLTAGE_STEP_NUM 32 +#define RT5033_REGULATOR_LDO_VOLTAGE_STEP_NUM 19 /* RT5033 regulator SAFE LDO output voltage uV */ #define RT5033_REGULATOR_SAFE_LDO_VOLTAGE 4900000U -- cgit v1.2.3 From 5d43f951b1ac797450bb4d230fdc960b739bea04 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 30 Jun 2021 16:11:52 +0800 Subject: ptp: add ptp virtual clock driver framework This patch is to add ptp virtual clock driver framework utilizing timecounter/cyclecounter. The patch just exports two essential APIs for PTP driver. - ptp_vclock_register() - ptp_vclock_unregister() Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index aba237c0b3a2..b6fb771ee524 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -11,7 +11,9 @@ #include #include #include +#include +#define PTP_CLOCK_NAME_LEN 32 /** * struct ptp_clock_request - request PTP clock event * @@ -134,7 +136,7 @@ struct ptp_system_timestamp { struct ptp_clock_info { struct module *owner; - char name[16]; + char name[PTP_CLOCK_NAME_LEN]; s32 max_adj; int n_alarm; int n_ext_ts; -- cgit v1.2.3 From acb288e8047b7569fbc9af6fa6e9405315345103 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 30 Jun 2021 16:11:55 +0800 Subject: ptp: add kernel API ptp_get_vclocks_index() Add kernel API ptp_get_vclocks_index() to get all ptp vclocks index on pclock. This is preparation for supporting ptp vclocks info query through ethtool. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index b6fb771ee524..300a984fec87 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -306,6 +306,18 @@ int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay); */ void ptp_cancel_worker_sync(struct ptp_clock *ptp); +/** + * ptp_get_vclocks_index() - get all vclocks index on pclock, and + * caller is responsible to free memory + * of vclock_index + * + * @pclock_index: phc index of ptp pclock. + * @vclock_index: pointer to pointer of vclock index. + * + * return number of vclocks. + */ +int ptp_get_vclocks_index(int pclock_index, int **vclock_index); + #else static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, struct device *parent) @@ -325,6 +337,8 @@ static inline int ptp_schedule_worker(struct ptp_clock *ptp, { return -EOPNOTSUPP; } static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp) { } +static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index) +{ return 0; } #endif -- cgit v1.2.3 From c156174a67070042d51d2c866146d3c934d5468c Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 30 Jun 2021 16:11:56 +0800 Subject: ethtool: add a new command for getting PHC virtual clocks Add an interface for getting PHC (PTP Hardware Clock) virtual clocks, which are based on PHC physical clock providing hardware timestamp to network packets. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- include/linux/ethtool.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 29dbb603bc91..232daaec56e4 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -757,6 +757,16 @@ void ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings, enum ethtool_link_mode_bit_indices link_mode); +/** + * ethtool_get_phc_vclocks - Derive phc vclocks information, and caller + * is responsible to free memory of vclock_index + * @dev: pointer to net_device structure + * @vclock_index: pointer to pointer of vclock index + * + * Return number of phc vclocks + */ +int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index); + /** * ethtool_sprintf - Write formatted string to ethtool string data * @data: Pointer to start of string to update -- cgit v1.2.3 From 895487a3a10fb3a177e20dcde875515d46ccd4df Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 30 Jun 2021 16:11:57 +0800 Subject: ptp: add kernel API ptp_convert_timestamp() Add kernel API ptp_convert_timestamp() to convert raw hardware timestamp to a specified ptp vclock time. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 300a984fec87..71fac9237725 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -12,6 +12,7 @@ #include #include #include +#include #define PTP_CLOCK_NAME_LEN 32 /** @@ -318,6 +319,15 @@ void ptp_cancel_worker_sync(struct ptp_clock *ptp); */ int ptp_get_vclocks_index(int pclock_index, int **vclock_index); +/** + * ptp_convert_timestamp() - convert timestamp to a ptp vclock time + * + * @hwtstamps: skb_shared_hwtstamps structure pointer + * @vclock_index: phc index of ptp vclock. + */ +void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps, + int vclock_index); + #else static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, struct device *parent) @@ -339,6 +349,9 @@ static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp) { } static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { return 0; } +static inline void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps, + int vclock_index) +{ } #endif -- cgit v1.2.3 From b2aae654a4794ef898ad33a179f341eb610f6b85 Mon Sep 17 00:00:00 2001 From: Xiaoliang Yang Date: Mon, 5 Jul 2021 18:26:54 +0800 Subject: net: stmmac: add mutex lock to protect est parameters Add a mutex lock to protect est structure parameters so that the EST parameters can be updated by other threads. Signed-off-by: Xiaoliang Yang Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index d5ae621d66ba..09157b8a5810 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -115,6 +115,7 @@ struct stmmac_axi { #define EST_GCL 1024 struct stmmac_est { + struct mutex lock; int enable; u32 btr_offset[2]; u32 btr[2]; -- cgit v1.2.3 From e9e3720002f61cd637a49ecafae77cac230eefae Mon Sep 17 00:00:00 2001 From: Xiaoliang Yang Date: Mon, 5 Jul 2021 18:26:55 +0800 Subject: net: stmmac: ptp: update tas basetime after ptp adjust After adjusting the ptp time, the Qbv base time may be the past time of the new current time. dwmac5 hardware limited the base time cannot be set as past time. This patch add a btr_reserve to store the base time get from qopt, then calculate the base time and reset the Qbv configuration after ptp time adjust. Signed-off-by: Xiaoliang Yang Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 09157b8a5810..a6f03b36fc4f 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -117,6 +117,7 @@ struct stmmac_axi { struct stmmac_est { struct mutex lock; int enable; + u32 btr_reserve[2]; u32 btr_offset[2]; u32 btr[2]; u32 ctr[2]; -- cgit v1.2.3 From f263a81451c12da5a342d90572e317e611846f2c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 7 Jul 2021 15:38:47 -0700 Subject: bpf: Track subprog poke descriptors correctly and fix use-after-free Subprograms are calling map_poke_track(), but on program release there is no hook to call map_poke_untrack(). However, on program release, the aux memory (and poke descriptor table) is freed even though we still have a reference to it in the element list of the map aux data. When we run map_poke_run(), we then end up accessing free'd memory, triggering KASAN in prog_array_map_poke_run(): [...] [ 402.824689] BUG: KASAN: use-after-free in prog_array_map_poke_run+0xc2/0x34e [ 402.824698] Read of size 4 at addr ffff8881905a7940 by task hubble-fgs/4337 [ 402.824705] CPU: 1 PID: 4337 Comm: hubble-fgs Tainted: G I 5.12.0+ #399 [ 402.824715] Call Trace: [ 402.824719] dump_stack+0x93/0xc2 [ 402.824727] print_address_description.constprop.0+0x1a/0x140 [ 402.824736] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824740] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824744] kasan_report.cold+0x7c/0xd8 [ 402.824752] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824757] prog_array_map_poke_run+0xc2/0x34e [ 402.824765] bpf_fd_array_map_update_elem+0x124/0x1a0 [...] The elements concerned are walked as follows: for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; [...] The access to size_poke_tab is a 4 byte read, verified by checking offsets in the KASAN dump: [ 402.825004] The buggy address belongs to the object at ffff8881905a7800 which belongs to the cache kmalloc-1k of size 1024 [ 402.825008] The buggy address is located 320 bytes inside of 1024-byte region [ffff8881905a7800, ffff8881905a7c00) The pahole output of bpf_prog_aux: struct bpf_prog_aux { [...] /* --- cacheline 5 boundary (320 bytes) --- */ u32 size_poke_tab; /* 320 4 */ [...] In general, subprograms do not necessarily manage their own data structures. For example, BTF func_info and linfo are just pointers to the main program structure. This allows reference counting and cleanup to be done on the latter which simplifies their management a bit. The aux->poke_tab struct, however, did not follow this logic. The initial proposed fix for this use-after-free bug further embedded poke data tracking into the subprogram with proper reference counting. However, Daniel and Alexei questioned why we were treating these objects special; I agree, its unnecessary. The fix here removes the per subprogram poke table allocation and map tracking and instead simply points the aux->poke_tab pointer at the main programs poke table. This way, map tracking is simplified to the main program and we do not need to manage them per subprogram. This also means, bpf_prog_free_deferred(), which unwinds the program reference counting and kfrees objects, needs to ensure that we don't try to double free the poke_tab when free'ing the subprog structures. This is easily solved by NULL'ing the poke_tab pointer. The second detail is to ensure that per subprogram JIT logic only does fixups on poke_tab[] entries it owns. To do this, we add a pointer in the poke structure to point at the subprogram value so JITs can easily check while walking the poke_tab structure if the current entry belongs to the current program. The aux pointer is stable and therefore suitable for such comparison. On the jit_subprogs() error path, we omit cleaning up the poke->aux field because these are only ever referenced from the JIT side, but on error we will never make it to the JIT, so its fine to leave them dangling. Removing these pointers would complicate the error path for no reason. However, we do need to untrack all poke descriptors from the main program as otherwise they could race with the freeing of JIT memory from the subprograms. Lastly, a748c6975dea3 ("bpf: propagate poke descriptors to subprograms") had an off-by-one on the subprogram instruction index range check as it was testing 'insn_idx >= subprog_start && insn_idx <= subprog_end'. However, subprog_end is the next subprogram's start instruction. Fixes: a748c6975dea3 ("bpf: propagate poke descriptors to subprograms") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210707223848.14580-2-john.fastabend@gmail.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f309fc1509f2..e8e2b0393ca9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -780,6 +780,7 @@ struct bpf_jit_poke_descriptor { void *tailcall_target; void *tailcall_bypass; void *bypass_addr; + void *aux; union { struct { struct bpf_map *map; -- cgit v1.2.3 From a5de4be0aaaa66a2fa98e8a33bdbed3bd0682804 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Sun, 11 Jul 2021 18:38:15 +0200 Subject: net: phy: marvell10g: fix differentiation of 88X3310 from 88X3340 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems that we cannot differentiate 88X3310 from 88X3340 by simply looking at bit 3 of revision ID. This only works on revisions A0 and A1. On revision B0, this bit is always 1. Instead use the 3.d00d register for differentiation, since this register contains information about number of ports on the device. Fixes: 9885d016ffa9 ("net: phy: marvell10g: add separate structure for 88X3340") Signed-off-by: Marek Behún Reported-by: Matteo Croce Tested-by: Matteo Croce Signed-off-by: David S. Miller --- include/linux/marvell_phy.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index acee44b9db26..0f06c2287b52 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -22,14 +22,10 @@ #define MARVELL_PHY_ID_88E1545 0x01410ea0 #define MARVELL_PHY_ID_88E1548P 0x01410ec0 #define MARVELL_PHY_ID_88E3016 0x01410e60 +#define MARVELL_PHY_ID_88X3310 0x002b09a0 #define MARVELL_PHY_ID_88E2110 0x002b09b0 #define MARVELL_PHY_ID_88X2222 0x01410f10 -/* PHY IDs and mask for Alaska 10G PHYs */ -#define MARVELL_PHY_ID_88X33X0_MASK 0xfffffff8 -#define MARVELL_PHY_ID_88X3310 0x002b09a0 -#define MARVELL_PHY_ID_88X3340 0x002b09a8 - /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */ #define MARVELL_PHY_ID_88E1111_FINISAR 0x01ff0cc0 -- cgit v1.2.3 From 79789db03fdd77510cfb35cb4b3bd52b6c50c901 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 12 Jul 2021 16:32:07 +0100 Subject: mm: Make copy_huge_page() always available Rewrite copy_huge_page() and move it into mm/util.c so it's always available. Fixes an exposure of uninitialised memory on configurations with HUGETLB and UFFD enabled and MIGRATION disabled. Fixes: 8cc5fcbb5be8 ("mm, hugetlb: fix racy resv_huge_pages underflow on UFFDIO_COPY") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 ----- include/linux/mm.h | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9b7b7cd3bae9..23dadf7aeba8 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -51,7 +51,6 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); -extern void copy_huge_page(struct page *dst, struct page *src); #else static inline void putback_movable_pages(struct list_head *l) {} @@ -77,10 +76,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, { return -ENOSYS; } - -static inline void copy_huge_page(struct page *dst, struct page *src) -{ -} #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_COMPACTION diff --git a/include/linux/mm.h b/include/linux/mm.h index 57453dba41b9..7ca22e6e694a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -906,6 +906,7 @@ void __put_page(struct page *page); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); +void copy_huge_page(struct page *dst, struct page *src); /* * Compound pages have a destructor function. Provide a -- cgit v1.2.3 From 52f83955aaf91b22f46765b007b4404ce85b2133 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 12 Jul 2021 14:08:01 +0100 Subject: firmware: arm_scmi: Fix kernel doc warnings Kernel doc validation script is unhappy and complains with the below set of warnings. | Function parameter or member 'fast_switch_possible' not described in 'scmi_perf_proto_ops' | Function parameter or member 'power_scale_mw_get' not described in 'scmi_perf_proto_ops' | cannot understand function prototype: 'struct scmi_sensor_reading ' | cannot understand function prototype: 'struct scmi_range_attrs ' | cannot understand function prototype: 'struct scmi_sensor_axis_info ' | cannot understand function prototype: 'struct scmi_sensor_intervals_info ' Fix them adding appropriate documents or missing keywords. Link: https://lore.kernel.org/r/20210712130801.2436492-2-sudeep.holla@arm.com Reviewed-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 79d0a1237e6c..80e781c51ddc 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -101,6 +101,10 @@ struct scmi_clk_proto_ops { * to sustained performance level mapping * @est_power_get: gets the estimated power cost for a given performance domain * at a given frequency + * @fast_switch_possible: indicates if fast DVFS switching is possible or not + * for a given device + * @power_scale_mw_get: indicates if the power values provided are in milliWatts + * or in some other (abstract) scale */ struct scmi_perf_proto_ops { int (*limits_set)(const struct scmi_protocol_handle *ph, u32 domain, @@ -153,7 +157,7 @@ struct scmi_power_proto_ops { }; /** - * scmi_sensor_reading - represent a timestamped read + * struct scmi_sensor_reading - represent a timestamped read * * Used by @reading_get_timestamped method. * @@ -167,7 +171,7 @@ struct scmi_sensor_reading { }; /** - * scmi_range_attrs - specifies a sensor or axis values' range + * struct scmi_range_attrs - specifies a sensor or axis values' range * @min_range: The minimum value which can be represented by the sensor/axis. * @max_range: The maximum value which can be represented by the sensor/axis. */ @@ -177,7 +181,7 @@ struct scmi_range_attrs { }; /** - * scmi_sensor_axis_info - describes one sensor axes + * struct scmi_sensor_axis_info - describes one sensor axes * @id: The axes ID. * @type: Axes type. Chosen amongst one of @enum scmi_sensor_class. * @scale: Power-of-10 multiplier applied to the axis unit. @@ -205,8 +209,8 @@ struct scmi_sensor_axis_info { }; /** - * scmi_sensor_intervals_info - describes number and type of available update - * intervals + * struct scmi_sensor_intervals_info - describes number and type of available + * update intervals * @segmented: Flag for segmented intervals' representation. When True there * will be exactly 3 intervals in @desc, with each entry * representing a member of a segment in this order: -- cgit v1.2.3 From 5ff6319d46cee22c9cd6f39a377e32c444f9a7b0 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 12 Jul 2021 11:27:48 +0100 Subject: firmware: arm_scpi: Fix kernel doc warnings Kernel doc validation script is unhappy and complains with the below set of warnings. | Function parameter or member 'device_domain_id' not described in 'scpi_ops' | Function parameter or member 'get_transition_latency' not described in 'scpi_ops' | Function parameter or member 'add_opps_to_device' not described in 'scpi_ops' | Function parameter or member 'sensor_get_capability' not described in 'scpi_ops' | Function parameter or member 'sensor_get_info' not described in 'scpi_ops' | Function parameter or member 'sensor_get_value' not described in 'scpi_ops' | Function parameter or member 'device_get_power_state' not described in 'scpi_ops' | Function parameter or member 'device_set_power_state' not described in 'scpi_ops' Fix them adding appropriate documents or missing keywords. Link: https://lore.kernel.org/r/20210712130801.2436492-1-sudeep.holla@arm.com Reviewed-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scpi_protocol.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scpi_protocol.h b/include/linux/scpi_protocol.h index afbf8037d8db..d2176a56828a 100644 --- a/include/linux/scpi_protocol.h +++ b/include/linux/scpi_protocol.h @@ -51,6 +51,14 @@ struct scpi_sensor_info { * OPP is an index to the list return by @dvfs_get_info * @dvfs_get_info: returns the DVFS capabilities of the given power * domain. It includes the OPP list and the latency information + * @device_domain_id: gets the scpi domain id for a given device + * @get_transition_latency: gets the DVFS transition latency for a given device + * @add_opps_to_device: adds all the OPPs for a given device + * @sensor_get_capability: get the list of capabilities for the sensors + * @sensor_get_info: get the information of the specified sensor + * @sensor_get_value: gets the current value of the sensor + * @device_get_power_state: gets the power state of a power domain + * @device_set_power_state: sets the power state of a power domain */ struct scpi_ops { u32 (*get_version)(void); -- cgit v1.2.3 From d1d488d813703618f0dd93f0e4c4a05928114aa8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 14 Jul 2021 15:47:50 +0200 Subject: fs: add vfs_parse_fs_param_source() helper Add a simple helper that filesystems can use in their parameter parser to parse the "source" parameter. A few places open-coded this function and that already caused a bug in the cgroup v1 parser that we fixed. Let's make it harder to get this wrong by introducing a helper which performs all necessary checks. Link: https://syzkaller.appspot.com/bug?id=6312526aba5beae046fdae8f00399f87aab48b12 Cc: Christoph Hellwig Cc: Alexander Viro Cc: Dmitry Vyukov Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds --- include/linux/fs_context.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 37e1e8f7f08d..e2bc16300c82 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -139,6 +139,8 @@ extern int vfs_parse_fs_string(struct fs_context *fc, const char *key, extern int generic_parse_monolithic(struct fs_context *fc, void *data); extern int vfs_get_tree(struct fs_context *fc); extern void put_fs_context(struct fs_context *fc); +extern int vfs_parse_fs_param_source(struct fs_context *fc, + struct fs_parameter *param); /* * sget() wrappers to be called from the ->get_tree() op. -- cgit v1.2.3 From 2db710cc846d3321a4dc0977fa13769bddba2351 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 14 Jul 2021 21:26:40 -0700 Subject: kasan: fix build by including kernel.h The header relies on _RET_IP_ being defined, and had been receiving that definition via inclusion of bug.h which includes kernel.h. However, since f39650de687e ("kernel.h: split out panic and oops helpers") that is no longer the case and get the following build error when building CONFIG_KASAN_HW_TAGS on arm64: In file included from arch/arm64/mm/kasan_init.c:10: include/linux/kasan.h: In function 'kasan_slab_free': include/linux/kasan.h:230:39: error: '_RET_IP_' undeclared (first use in this function) 230 | return __kasan_slab_free(s, object, _RET_IP_, init); Fix it by including kernel.h from kasan.h. Link: https://lkml.kernel.org/r/20210705072716.2125074-1-elver@google.com Fixes: f39650de687e ("kernel.h: split out panic and oops helpers") Signed-off-by: Marco Elver Reviewed-by: Andy Shevchenko Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Peter Collingbourne Cc: Catalin Marinas Cc: Vincenzo Frascino Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5310e217bd74..dd874a1ee862 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -3,6 +3,7 @@ #define _LINUX_KASAN_H #include +#include #include #include -- cgit v1.2.3 From ab7965de1725cd8514f0edbced5c2fb793846078 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jul 2021 21:26:55 -0700 Subject: mm: fix the try_to_unmap prototype for !CONFIG_MMU Adjust the nommu stub of try_to_unmap to match the changed protype for the full version. Turn it into an inline instead of a macro to generally improve the type checking. Link: https://lkml.kernel.org/r/20210705053944.885828-1-hch@lst.de Fixes: 1fb08ac63bee ("mm: rmap: make try_to_unmap() void function") Signed-off-by: Christoph Hellwig Reviewed-by: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 83fb86133fe1..c976cc6de257 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -291,7 +291,9 @@ static inline int page_referenced(struct page *page, int is_locked, return 0; } -#define try_to_unmap(page, refs) false +static inline void try_to_unmap(struct page *page, enum ttu_flags flags) +{ +} static inline int page_mkclean(struct page *page) { -- cgit v1.2.3 From e48bf29cf9d6d60d810e2af71e54b71a324094e0 Mon Sep 17 00:00:00 2001 From: Ye Xiang Date: Sun, 13 Jun 2021 11:25:07 +0800 Subject: HID: intel-ish-hid: use async resume function ISH IPC driver uses asynchronous workqueue to do resume now, but there is a potential timing issue: when child devices resume before bus driver, it will cause child devices resume failed and cannot be recovered until reboot. The current implementation in this case do wait for IPC to resume but fail to accommodate for a case when there is no ISH reboot and soft resume is taking time. This issue is apparent on Tiger Lake platform with 5.11.13 kernel when doing suspend to idle then resume(s0ix) test. To resolve this issue, we change ISHTP HID client to use asynchronous resume callback too. In the asynchronous resume callback, it waits for the ISHTP resume done event, and then notify ISHTP HID client link ready. Signed-off-by: Ye Xiang Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- include/linux/intel-ish-client-if.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index 25e2b4e80502..aee8ff4739b1 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -81,6 +81,8 @@ int ishtp_register_event_cb(struct ishtp_cl_device *device, /* Get the device * from ishtp device instance */ struct device *ishtp_device(struct ishtp_cl_device *cl_device); +/* wait for IPC resume */ +bool ishtp_wait_resume(struct ishtp_device *dev); /* Trace interface for clients */ ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device); /* Get device pointer of PCI device for DMA acces */ -- cgit v1.2.3 From e042aa532c84d18ff13291d00620502ce7a38dda Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 16 Jul 2021 09:18:21 +0000 Subject: bpf: Fix pointer arithmetic mask tightening under state pruning In 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask") we narrowed the offset mask for unprivileged pointer arithmetic in order to mitigate a corner case where in the speculative domain it is possible to advance, for example, the map value pointer by up to value_size-1 out-of- bounds in order to leak kernel memory via side-channel to user space. The verifier's state pruning for scalars leaves one corner case open where in the first verification path R_x holds an unknown scalar with an aux->alu_limit of e.g. 7, and in a second verification path that same register R_x, here denoted as R_x', holds an unknown scalar which has tighter bounds and would thus satisfy range_within(R_x, R_x') as well as tnum_in(R_x, R_x') for state pruning, yielding an aux->alu_limit of 3: Given the second path fits the register constraints for pruning, the final generated mask from aux->alu_limit will remain at 7. While technically not wrong for the non-speculative domain, it would however be possible to craft similar cases where the mask would be too wide as in 7fedb63a8307. One way to fix it is to detect the presence of unknown scalar map pointer arithmetic and force a deeper search on unknown scalars to ensure that we do not run into a masking mismatch. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e774ecc1cd1f..7ba7e800d472 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -414,6 +414,7 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ + bool explore_alu_limits; bool allow_ptr_leaks; bool allow_uninit_stack; bool allow_ptr_to_map_access; -- cgit v1.2.3 From ec645dc96699ea6c37b6de86c84d7288ea9a4ddf Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko Date: Sat, 17 Jul 2021 14:33:28 +0200 Subject: block: increase BLKCG_MAX_POLS After mq-deadline learned to deal with cgroups, the BLKCG_MAX_POLS value became too small for all the elevators to be registered properly. The following issue is seen: ``` calling bfq_init+0x0/0x8b @ 1 blkcg_policy_register: BLKCG_MAX_POLS too small initcall bfq_init+0x0/0x8b returned -28 after 507 usecs ``` which renders BFQ non-functional. Increase BLKCG_MAX_POLS to allow enough space for everyone. Fixes: 08a9ad8bf607 ("block/mq-deadline: Add cgroup support") Link: https://lore.kernel.org/lkml/8988303.mDXGIdCtx8@natalenko.name/ Signed-off-by: Oleksandr Natalenko Link: https://lore.kernel.org/r/20210717123328.945810-1-oleksandr@natalenko.name Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c454fb446fd0..2e12320cb121 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -57,7 +57,7 @@ struct blk_keyslot_manager; * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. */ -#define BLKCG_MAX_POLS 5 +#define BLKCG_MAX_POLS 6 typedef void (rq_end_io_fn)(struct request *, blk_status_t); -- cgit v1.2.3 From d6371c76e20d7d3f61b05fd67b596af4d14a8886 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 19 Jul 2021 09:51:34 +0100 Subject: bpf: Fix OOB read when printing XDP link fdinfo We got the following UBSAN report on one of our testing machines: ================================================================================ UBSAN: array-index-out-of-bounds in kernel/bpf/syscall.c:2389:24 index 6 is out of range for type 'char *[6]' CPU: 43 PID: 930921 Comm: systemd-coredum Tainted: G O 5.10.48-cloudflare-kasan-2021.7.0 #1 Hardware name: Call Trace: dump_stack+0x7d/0xa3 ubsan_epilogue+0x5/0x40 __ubsan_handle_out_of_bounds.cold+0x43/0x48 ? seq_printf+0x17d/0x250 bpf_link_show_fdinfo+0x329/0x380 ? bpf_map_value_size+0xe0/0xe0 ? put_files_struct+0x20/0x2d0 ? __kasan_kmalloc.constprop.0+0xc2/0xd0 seq_show+0x3f7/0x540 seq_read_iter+0x3f8/0x1040 seq_read+0x329/0x500 ? seq_read_iter+0x1040/0x1040 ? __fsnotify_parent+0x80/0x820 ? __fsnotify_update_child_dentry_flags+0x380/0x380 vfs_read+0x123/0x460 ksys_read+0xed/0x1c0 ? __x64_sys_pwrite64+0x1f0/0x1f0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ================================================================================ ================================================================================ UBSAN: object-size-mismatch in kernel/bpf/syscall.c:2384:2 From the report, we can infer that some array access in bpf_link_show_fdinfo at index 6 is out of bounds. The obvious candidate is bpf_link_type_strs[BPF_LINK_TYPE_XDP] with BPF_LINK_TYPE_XDP == 6. It turns out that BPF_LINK_TYPE_XDP is missing from bpf_types.h and therefore doesn't have an entry in bpf_link_type_strs: pos: 0 flags: 02000000 mnt_id: 13 link_type: (null) link_id: 4 prog_tag: bcf7977d3b93787c prog_id: 4 ifindex: 1 Fixes: aa8d3a716b59 ("bpf, xdp: Add bpf_link-based XDP attachment API") Signed-off-by: Lorenz Bauer Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210719085134.43325-2-lmb@cloudflare.com --- include/linux/bpf_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a9db1eae6796..ae3ac3a2018c 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -134,4 +134,5 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) #ifdef CONFIG_NET BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) +BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) #endif -- cgit v1.2.3 From dc7019b7d0e188d4093b34bd0747ed0d668c63bf Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Mon, 14 Jun 2021 17:33:14 -0500 Subject: tee: add tee_shm_alloc_kernel_buf() Adds a new function tee_shm_alloc_kernel_buf() to allocate shared memory from a kernel driver. This function can later be made more lightweight by unnecessary dma-buf export. Cc: stable@vger.kernel.org Reviewed-by: Tyler Hicks Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- include/linux/tee_drv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 54269e47ac9a..8990f7628387 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -332,6 +332,7 @@ void *tee_get_drvdata(struct tee_device *teedev); * @returns a pointer to 'struct tee_shm' */ struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags); +struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size); /** * tee_shm_register() - Register shared memory buffer -- cgit v1.2.3 From 376e4199e327a5cf29b8ec8fb0f64f3d8b429819 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 14 Jun 2021 17:33:15 -0500 Subject: tee: Correct inappropriate usage of TEE_SHM_DMA_BUF flag Currently TEE_SHM_DMA_BUF flag has been inappropriately used to not register shared memory allocated for private usage by underlying TEE driver: OP-TEE in this case. So rather add a new flag as TEE_SHM_PRIV that can be utilized by underlying TEE drivers for private allocation and usage of shared memory. With this corrected, allow tee_shm_alloc_kernel_buf() to allocate a shared memory region without the backing of dma-buf. Cc: stable@vger.kernel.org Signed-off-by: Sumit Garg Co-developed-by: Tyler Hicks Signed-off-by: Tyler Hicks Reviewed-by: Jens Wiklander Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- include/linux/tee_drv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 8990f7628387..3ebfea0781f1 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -27,6 +27,7 @@ #define TEE_SHM_USER_MAPPED BIT(4) /* Memory mapped in user space */ #define TEE_SHM_POOL BIT(5) /* Memory allocated from pool */ #define TEE_SHM_KERNEL_MAPPED BIT(6) /* Memory mapped in kernel space */ +#define TEE_SHM_PRIV BIT(7) /* Memory private to TEE driver */ struct device; struct tee_device; -- cgit v1.2.3 From d8a719059b9dc963aa190598778ac804ff3e6a87 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Wed, 21 Jul 2021 17:02:13 +1000 Subject: Revert "mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge" This reverts commit c742199a014de23ee92055c2473d91fe5561ffdf. c742199a014d ("mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge") breaks arm64 in at least two ways for configurations where PUD or PMD folding occur: 1. We no longer install huge-vmap mappings and silently fall back to page-granular entries, despite being able to install block entries at what is effectively the PGD level. 2. If the linear map is backed with block mappings, these will now silently fail to be created in alloc_init_pud(), causing a panic early during boot. The pgtable selftests caught this, although a fix has not been forthcoming and Christophe is AWOL at the moment, so just revert the change for now to get a working -rc3 on which we can queue patches for 5.15. A simple revert breaks the build for 32-bit PowerPC 8xx machines, which rely on the default function definitions when the corresponding page-table levels are folded, since commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge pages on VMAP and VMALLOC"), eg: powerpc64-linux-ld: mm/vmalloc.o: in function `vunmap_pud_range': linux/mm/vmalloc.c:362: undefined reference to `pud_clear_huge' To avoid that, add stubs for pud_clear_huge() and pmd_clear_huge() in arch/powerpc/mm/nohash/8xx.c as suggested by Christophe. Cc: Christophe Leroy Cc: Catalin Marinas Cc: Andrew Morton Cc: Nicholas Piggin Cc: Mike Rapoport Cc: Mark Rutland Cc: Geert Uytterhoeven Fixes: c742199a014d ("mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge") Signed-off-by: Jonathan Marek Reviewed-by: Ard Biesheuvel Acked-by: Marc Zyngier [mpe: Fold in 8xx.c changes from Christophe and mention in change log] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/linux-arm-kernel/CAMuHMdXShORDox-xxaeUfDW3wx2PeggFSqhVSHVZNKCGK-y_vQ@mail.gmail.com/ Link: https://lore.kernel.org/r/20210717160118.9855-1-jonathan@marek.ca Link: https://lore.kernel.org/r/87r1fs1762.fsf@mpe.ellerman.id.au Signed-off-by: Will Deacon --- include/linux/pgtable.h | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d147480cdefc..e24d2c992b11 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1397,34 +1397,10 @@ static inline int p4d_clear_huge(p4d_t *p4d) } #endif /* !__PAGETABLE_P4D_FOLDED */ -#ifndef __PAGETABLE_PUD_FOLDED int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); -int pud_clear_huge(pud_t *pud); -#else -static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) -{ - return 0; -} -static inline int pud_clear_huge(pud_t *pud) -{ - return 0; -} -#endif /* !__PAGETABLE_PUD_FOLDED */ - -#ifndef __PAGETABLE_PMD_FOLDED int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); +int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); -#else -static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) -{ - return 0; -} -static inline int pmd_clear_huge(pmd_t *pmd) -{ - return 0; -} -#endif /* !__PAGETABLE_PMD_FOLDED */ - int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); -- cgit v1.2.3 From 853a9ae29e978d37f5dfa72622a68c9ae3d7fa89 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 14 Jul 2021 10:04:27 +0200 Subject: serial: 8250: fix handle_irq locking The 8250 handle_irq callback is not just called from the interrupt handler but also from a timer callback when polling (e.g. for ports without an interrupt line). Consequently the callback must explicitly disable interrupts to avoid a potential deadlock with another interrupt in polled mode. Add back an irqrestore-version of the sysrq port-unlock helper and use it in the 8250 callbacks that need it. Fixes: 75f4e830fa9c ("serial: do not restore interrupt state in sysrq helper") Cc: stable@vger.kernel.org # 5.13 Cc: Joel Stanley Cc: Andrew Jeffery Reported-by: kernel test robot Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210714080427.28164-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 52d7fb92a69d..c58cc142d23f 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -518,6 +518,25 @@ static inline void uart_unlock_and_check_sysrq(struct uart_port *port) if (sysrq_ch) handle_sysrq(sysrq_ch); } + +static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port, + unsigned long flags) +{ + int sysrq_ch; + + if (!port->has_sysrq) { + spin_unlock_irqrestore(&port->lock, flags); + return; + } + + sysrq_ch = port->sysrq_ch; + port->sysrq_ch = 0; + + spin_unlock_irqrestore(&port->lock, flags); + + if (sysrq_ch) + handle_sysrq(sysrq_ch); +} #else /* CONFIG_MAGIC_SYSRQ_SERIAL */ static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) { @@ -531,6 +550,11 @@ static inline void uart_unlock_and_check_sysrq(struct uart_port *port) { spin_unlock(&port->lock); } +static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port, + unsigned long flags) +{ + spin_unlock_irqrestore(&port->lock, flags); +} #endif /* CONFIG_MAGIC_SYSRQ_SERIAL */ /* -- cgit v1.2.3 From 1e7107c5ef44431bc1ebbd4c353f1d7c22e5f2ec Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 16 Jun 2021 08:51:57 -0400 Subject: cgroup1: fix leaked context root causing sporadic NULL deref in LTP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Richard reported sporadic (roughly one in 10 or so) null dereferences and other strange behaviour for a set of automated LTP tests. Things like: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 1516 Comm: umount Not tainted 5.10.0-yocto-standard #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-48-gd9c812dda519-prebuilt.qemu.org 04/01/2014 RIP: 0010:kernfs_sop_show_path+0x1b/0x60 ...or these others: RIP: 0010:do_mkdirat+0x6a/0xf0 RIP: 0010:d_alloc_parallel+0x98/0x510 RIP: 0010:do_readlinkat+0x86/0x120 There were other less common instances of some kind of a general scribble but the common theme was mount and cgroup and a dubious dentry triggering the NULL dereference. I was only able to reproduce it under qemu by replicating Richard's setup as closely as possible - I never did get it to happen on bare metal, even while keeping everything else the same. In commit 71d883c37e8d ("cgroup_do_mount(): massage calling conventions") we see this as a part of the overall change: -------------- struct cgroup_subsys *ss; - struct dentry *dentry; [...] - dentry = cgroup_do_mount(&cgroup_fs_type, fc->sb_flags, root, - CGROUP_SUPER_MAGIC, ns); [...] - if (percpu_ref_is_dying(&root->cgrp.self.refcnt)) { - struct super_block *sb = dentry->d_sb; - dput(dentry); + ret = cgroup_do_mount(fc, CGROUP_SUPER_MAGIC, ns); + if (!ret && percpu_ref_is_dying(&root->cgrp.self.refcnt)) { + struct super_block *sb = fc->root->d_sb; + dput(fc->root); deactivate_locked_super(sb); msleep(10); return restart_syscall(); } -------------- In changing from the local "*dentry" variable to using fc->root, we now export/leave that dentry pointer in the file context after doing the dput() in the unlikely "is_dying" case. With LTP doing a crazy amount of back to back mount/unmount [testcases/bin/cgroup_regression_5_1.sh] the unlikely becomes slightly likely and then bad things happen. A fix would be to not leave the stale reference in fc->root as follows: --------------                 dput(fc->root); + fc->root = NULL;                 deactivate_locked_super(sb); -------------- ...but then we are just open-coding a duplicate of fc_drop_locked() so we simply use that instead. Cc: Al Viro Cc: Tejun Heo Cc: Zefan Li Cc: Johannes Weiner Cc: stable@vger.kernel.org # v5.1+ Reported-by: Richard Purdie Fixes: 71d883c37e8d ("cgroup_do_mount(): massage calling conventions") Signed-off-by: Paul Gortmaker Signed-off-by: Tejun Heo --- include/linux/fs_context.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index e2bc16300c82..6b54982fc5f3 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -141,6 +141,7 @@ extern int vfs_get_tree(struct fs_context *fc); extern void put_fs_context(struct fs_context *fc); extern int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param); +extern void fc_drop_locked(struct fs_context *fc); /* * sget() wrappers to be called from the ->get_tree() op. -- cgit v1.2.3 From 8dad53a11f8d94dceb540a5f8f153484f42be84b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Jul 2021 15:50:17 -0700 Subject: mm: call flush_dcache_page() in memcpy_to_page() and memzero_page() memcpy_to_page and memzero_page can write to arbitrary pages, which could be in the page cache or in high memory, so call flush_kernel_dcache_pages to flush the dcache. This is a problem when using these helpers on dcache challeneged architectures. Right now there are just a few users, chances are no one used the PC floppy driver, the aha1542 driver for an ISA SCSI HBA, and a few advanced and optional btrfs and ext4 features on those platforms yet since the conversion. Link: https://lkml.kernel.org/r/20210713055231.137602-2-hch@lst.de Fixes: bb90d4bc7b6a ("mm/highmem: Lift memcpy_[to|from]_page to core") Fixes: 28961998f858 ("iov_iter: lift memzero_page() to highmem.h") Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Cc: Chaitanya Kulkarni Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 8c6e8e996c87..8e7e50a53a12 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -318,6 +318,7 @@ static inline void memcpy_to_page(struct page *page, size_t offset, VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to + offset, from, len); + flush_dcache_page(page); kunmap_local(to); } @@ -325,6 +326,7 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_atomic(page); memset(addr + offset, 0, len); + flush_dcache_page(page); kunmap_atomic(addr); } -- cgit v1.2.3 From d9a42b53bdf7b0329dc09a59fc1b092640b6da19 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Jul 2021 15:50:20 -0700 Subject: mm: use kmap_local_page in memzero_page The commit message introducing the global memzero_page explicitly mentions switching to kmap_local_page in the commit log but doesn't actually do that. Link: https://lkml.kernel.org/r/20210713055231.137602-3-hch@lst.de Fixes: 28961998f858 ("iov_iter: lift memzero_page() to highmem.h") Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ira Weiny Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 8e7e50a53a12..d9a606a9fc64 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -324,10 +324,10 @@ static inline void memcpy_to_page(struct page *page, size_t offset, static inline void memzero_page(struct page *page, size_t offset, size_t len) { - char *addr = kmap_atomic(page); + char *addr = kmap_local_page(page); memset(addr + offset, 0, len); flush_dcache_page(page); - kunmap_atomic(addr); + kunmap_local(addr); } #endif /* _LINUX_HIGHMEM_H */ -- cgit v1.2.3 From 79e482e9c3ae86e849c701c846592e72baddda5a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 23 Jul 2021 15:50:26 -0700 Subject: memblock: make for_each_mem_range() traverse MEMBLOCK_HOTPLUG regions Commit b10d6bca8720 ("arch, drivers: replace for_each_membock() with for_each_mem_range()") didn't take into account that when there is movable_node parameter in the kernel command line, for_each_mem_range() would skip ranges marked with MEMBLOCK_HOTPLUG. The page table setup code in POWER uses for_each_mem_range() to create the linear mapping of the physical memory and since the regions marked as MEMORY_HOTPLUG are skipped, they never make it to the linear map. A later access to the memory in those ranges will fail: BUG: Unable to handle kernel data access on write at 0xc000000400000000 Faulting instruction address: 0xc00000000008a3c0 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 0 PID: 53 Comm: kworker/u2:0 Not tainted 5.13.0 #7 NIP: c00000000008a3c0 LR: c0000000003c1ed8 CTR: 0000000000000040 REGS: c000000008a57770 TRAP: 0300 Not tainted (5.13.0) MSR: 8000000002009033 CR: 84222202 XER: 20040000 CFAR: c0000000003c1ed4 DAR: c000000400000000 DSISR: 42000000 IRQMASK: 0 GPR00: c0000000003c1ed8 c000000008a57a10 c0000000019da700 c000000400000000 GPR04: 0000000000000280 0000000000000180 0000000000000400 0000000000000200 GPR08: 0000000000000100 0000000000000080 0000000000000040 0000000000000300 GPR12: 0000000000000380 c000000001bc0000 c0000000001660c8 c000000006337e00 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000040000000 0000000020000000 c000000001a81990 c000000008c30000 GPR24: c000000008c20000 c000000001a81998 000fffffffff0000 c000000001a819a0 GPR28: c000000001a81908 c00c000001000000 c000000008c40000 c000000008a64680 NIP clear_user_page+0x50/0x80 LR __handle_mm_fault+0xc88/0x1910 Call Trace: __handle_mm_fault+0xc44/0x1910 (unreliable) handle_mm_fault+0x130/0x2a0 __get_user_pages+0x248/0x610 __get_user_pages_remote+0x12c/0x3e0 get_arg_page+0x54/0xf0 copy_string_kernel+0x11c/0x210 kernel_execve+0x16c/0x220 call_usermodehelper_exec_async+0x1b0/0x2f0 ret_from_kernel_thread+0x5c/0x70 Instruction dump: 79280fa4 79271764 79261f24 794ae8e2 7ca94214 7d683a14 7c893a14 7d893050 7d4903a6 60000000 60000000 60000000 <7c001fec> 7c091fec 7c081fec 7c051fec ---[ end trace 490b8c67e6075e09 ]--- Making for_each_mem_range() include MEMBLOCK_HOTPLUG regions in the traversal fixes this issue. Link: https://bugzilla.redhat.com/show_bug.cgi?id=1976100 Link: https://lkml.kernel.org/r/20210712071132.20902-1-rppt@kernel.org Fixes: b10d6bca8720 ("arch, drivers: replace for_each_membock() with for_each_mem_range()") Signed-off-by: Mike Rapoport Tested-by: Greg Kurz Reviewed-by: David Hildenbrand Cc: [5.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index cbf46f56d105..4a53c3ca86bd 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -209,7 +209,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, */ #define for_each_mem_range(i, p_start, p_end) \ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, \ - MEMBLOCK_NONE, p_start, p_end, NULL) + MEMBLOCK_HOTPLUG, p_start, p_end, NULL) /** * for_each_mem_range_rev - reverse iterate through memblock areas from @@ -220,7 +220,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, */ #define for_each_mem_range_rev(i, p_start, p_end) \ __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \ - MEMBLOCK_NONE, p_start, p_end, NULL) + MEMBLOCK_HOTPLUG, p_start, p_end, NULL) /** * for_each_reserved_mem_range - iterate over all reserved memblock areas -- cgit v1.2.3 From bf88fef0b6f1488abeca594d377991171c00e52a Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Sat, 17 Jul 2021 21:21:27 +0300 Subject: usb: otg-fsm: Fix hrtimer list corruption The HNP work can be re-scheduled while it's still in-fly. This results in re-initialization of the busy work, resetting the hrtimer's list node of the work and crashing kernel with null dereference within kernel/timer once work's timer is expired. It's very easy to trigger this problem by re-plugging USB cable quickly. Initialize HNP work only once to fix this trouble. Unable to handle kernel NULL pointer dereference at virtual address 00000126) ... PC is at __run_timers.part.0+0x150/0x228 LR is at __next_timer_interrupt+0x51/0x9c ... (__run_timers.part.0) from [] (run_timer_softirq+0x2f/0x50) (run_timer_softirq) from [] (__do_softirq+0xd5/0x2f0) (__do_softirq) from [] (irq_exit+0xab/0xb8) (irq_exit) from [] (handle_domain_irq+0x45/0x60) (handle_domain_irq) from [] (gic_handle_irq+0x6b/0x7c) (gic_handle_irq) from [] (__irq_svc+0x65/0xac) Cc: stable@vger.kernel.org Acked-by: Peter Chen Signed-off-by: Dmitry Osipenko Link: https://lore.kernel.org/r/20210717182134.30262-6-digetx@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/otg-fsm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h index 3aee78dda16d..784659d4dc99 100644 --- a/include/linux/usb/otg-fsm.h +++ b/include/linux/usb/otg-fsm.h @@ -196,6 +196,7 @@ struct otg_fsm { struct mutex lock; u8 *host_req_flag; struct delayed_work hnp_polling_work; + bool hnp_work_inited; bool state_changed; }; -- cgit v1.2.3 From 9635720b7c88592214562cb72605bdab6708006c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 27 Jul 2021 09:05:00 -0700 Subject: bpf, sockmap: Fix memleak on ingress msg enqueue If backlog handler is running during a tear down operation we may enqueue data on the ingress msg queue while tear down is trying to free it. sk_psock_backlog() sk_psock_handle_skb() skb_psock_skb_ingress() sk_psock_skb_ingress_enqueue() sk_psock_queue_msg(psock,msg) spin_lock(ingress_lock) sk_psock_zap_ingress() _sk_psock_purge_ingerss_msg() _sk_psock_purge_ingress_msg() -- free ingress_msg list -- spin_unlock(ingress_lock) spin_lock(ingress_lock) list_add_tail(msg,ingress_msg) <- entry on list with no one left to free it. spin_unlock(ingress_lock) To fix we only enqueue from backlog if the ENABLED bit is set. The tear down logic clears the bit with ingress_lock set so we wont enqueue the msg in the last step. Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Signed-off-by: John Fastabend Signed-off-by: Andrii Nakryiko Acked-by: Jakub Sitnicki Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210727160500.1713554-4-john.fastabend@gmail.com --- include/linux/skmsg.h | 54 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 96f319099744..14ab0c0bc924 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -285,11 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +static inline void sk_psock_set_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + set_bit(bit, &psock->state); +} + +static inline void sk_psock_clear_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + clear_bit(bit, &psock->state); +} + +static inline bool sk_psock_test_state(const struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + return test_bit(bit, &psock->state); +} + +static inline void sock_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + kfree_skb(skb); +} + +static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg) +{ + if (msg->skb) + sock_drop(psock->sk, msg->skb); + kfree(msg); +} + static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); - list_add_tail(&msg->list, &psock->ingress_msg); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + list_add_tail(&msg->list, &psock->ingress_msg); + else + drop_sk_msg(psock, msg); spin_unlock_bh(&psock->ingress_lock); } @@ -406,24 +440,6 @@ static inline void sk_psock_restore_proto(struct sock *sk, psock->psock_update_sk_prot(sk, psock, true); } -static inline void sk_psock_set_state(struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - set_bit(bit, &psock->state); -} - -static inline void sk_psock_clear_state(struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - clear_bit(bit, &psock->state); -} - -static inline bool sk_psock_test_state(const struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - return test_bit(bit, &psock->state); -} - static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; -- cgit v1.2.3 From f5e81d1117501546b7be050c5fbafa6efd2c722c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 13 Jul 2021 08:18:31 +0000 Subject: bpf: Introduce BPF nospec instruction for mitigating Spectre v4 In case of JITs, each of the JIT backends compiles the BPF nospec instruction /either/ to a machine instruction which emits a speculation barrier /or/ to /no/ machine instruction in case the underlying architecture is not affected by Speculative Store Bypass or has different mitigations in place already. This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence' instruction for mitigation. In case of arm64, we rely on the firmware mitigation as controlled via the ssbd kernel parameter. Whenever the mitigation is enabled, it works for all of the kernel code with no need to provide any additional instructions here (hence only comment in arm64 JIT). Other archs can follow as needed. The BPF nospec instruction is specifically targeting Spectre v4 since i) we don't use a serialization barrier for the Spectre v1 case, and ii) mitigation instructions for v1 and v4 might be different on some archs. The BPF nospec is required for a future commit, where the BPF verifier does annotate intermediate BPF programs with speculation barriers. Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov --- include/linux/filter.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 472f97074da0..83b896044e79 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -73,6 +73,11 @@ struct ctl_table_header; /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 +/* unused opcode to mark speculation barrier for mitigating + * Speculative Store Bypass + */ +#define BPF_NOSPEC 0xc0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. @@ -390,6 +395,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = 0, \ .imm = 0 }) +/* Speculation barrier */ + +#define BPF_ST_NOSPEC() \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_NOSPEC, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ -- cgit v1.2.3 From 2039f26f3aca5b0e419b98f65dd36481337b86ee Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 13 Jul 2021 08:18:31 +0000 Subject: bpf: Fix leakage due to insufficient speculative store bypass mitigation Spectre v4 gadgets make use of memory disambiguation, which is a set of techniques that execute memory access instructions, that is, loads and stores, out of program order; Intel's optimization manual, section 2.4.4.5: A load instruction micro-op may depend on a preceding store. Many microarchitectures block loads until all preceding store addresses are known. The memory disambiguator predicts which loads will not depend on any previous stores. When the disambiguator predicts that a load does not have such a dependency, the load takes its data from the L1 data cache. Eventually, the prediction is verified. If an actual conflict is detected, the load and all succeeding instructions are re-executed. af86ca4e3088 ("bpf: Prevent memory disambiguation attack") tried to mitigate this attack by sanitizing the memory locations through preemptive "fast" (low latency) stores of zero prior to the actual "slow" (high latency) store of a pointer value such that upon dependency misprediction the CPU then speculatively executes the load of the pointer value and retrieves the zero value instead of the attacker controlled scalar value previously stored at that location, meaning, subsequent access in the speculative domain is then redirected to the "zero page". The sanitized preemptive store of zero prior to the actual "slow" store is done through a simple ST instruction based on r10 (frame pointer) with relative offset to the stack location that the verifier has been tracking on the original used register for STX, which does not have to be r10. Thus, there are no memory dependencies for this store, since it's only using r10 and immediate constant of zero; hence af86ca4e3088 /assumed/ a low latency operation. However, a recent attack demonstrated that this mitigation is not sufficient since the preemptive store of zero could also be turned into a "slow" store and is thus bypassed as well: [...] // r2 = oob address (e.g. scalar) // r7 = pointer to map value 31: (7b) *(u64 *)(r10 -16) = r2 // r9 will remain "fast" register, r10 will become "slow" register below 32: (bf) r9 = r10 // JIT maps BPF reg to x86 reg: // r9 -> r15 (callee saved) // r10 -> rbp // train store forward prediction to break dependency link between both r9 // and r10 by evicting them from the predictor's LRU table. 33: (61) r0 = *(u32 *)(r7 +24576) 34: (63) *(u32 *)(r7 +29696) = r0 35: (61) r0 = *(u32 *)(r7 +24580) 36: (63) *(u32 *)(r7 +29700) = r0 37: (61) r0 = *(u32 *)(r7 +24584) 38: (63) *(u32 *)(r7 +29704) = r0 39: (61) r0 = *(u32 *)(r7 +24588) 40: (63) *(u32 *)(r7 +29708) = r0 [...] 543: (61) r0 = *(u32 *)(r7 +25596) 544: (63) *(u32 *)(r7 +30716) = r0 // prepare call to bpf_ringbuf_output() helper. the latter will cause rbp // to spill to stack memory while r13/r14/r15 (all callee saved regs) remain // in hardware registers. rbp becomes slow due to push/pop latency. below is // disasm of bpf_ringbuf_output() helper for better visual context: // // ffffffff8117ee20: 41 54 push r12 // ffffffff8117ee22: 55 push rbp // ffffffff8117ee23: 53 push rbx // ffffffff8117ee24: 48 f7 c1 fc ff ff ff test rcx,0xfffffffffffffffc // ffffffff8117ee2b: 0f 85 af 00 00 00 jne ffffffff8117eee0 <-- jump taken // [...] // ffffffff8117eee0: 49 c7 c4 ea ff ff ff mov r12,0xffffffffffffffea // ffffffff8117eee7: 5b pop rbx // ffffffff8117eee8: 5d pop rbp // ffffffff8117eee9: 4c 89 e0 mov rax,r12 // ffffffff8117eeec: 41 5c pop r12 // ffffffff8117eeee: c3 ret 545: (18) r1 = map[id:4] 547: (bf) r2 = r7 548: (b7) r3 = 0 549: (b7) r4 = 4 550: (85) call bpf_ringbuf_output#194288 // instruction 551 inserted by verifier \ 551: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here // storing map value pointer r7 at fp-16 | since value of r10 is "slow". 552: (7b) *(u64 *)(r10 -16) = r7 / // following "fast" read to the same memory location, but due to dependency // misprediction it will speculatively execute before insn 551/552 completes. 553: (79) r2 = *(u64 *)(r9 -16) // in speculative domain contains attacker controlled r2. in non-speculative // domain this contains r7, and thus accesses r7 +0 below. 554: (71) r3 = *(u8 *)(r2 +0) // leak r3 As can be seen, the current speculative store bypass mitigation which the verifier inserts at line 551 is insufficient since /both/, the write of the zero sanitation as well as the map value pointer are a high latency instruction due to prior memory access via push/pop of r10 (rbp) in contrast to the low latency read in line 553 as r9 (r15) which stays in hardware registers. Thus, architecturally, fp-16 is r7, however, microarchitecturally, fp-16 can still be r2. Initial thoughts to address this issue was to track spilled pointer loads from stack and enforce their load via LDX through r10 as well so that /both/ the preemptive store of zero /as well as/ the load use the /same/ register such that a dependency is created between the store and load. However, this option is not sufficient either since it can be bypassed as well under speculation. An updated attack with pointer spill/fills now _all_ based on r10 would look as follows: [...] // r2 = oob address (e.g. scalar) // r7 = pointer to map value [...] // longer store forward prediction training sequence than before. 2062: (61) r0 = *(u32 *)(r7 +25588) 2063: (63) *(u32 *)(r7 +30708) = r0 2064: (61) r0 = *(u32 *)(r7 +25592) 2065: (63) *(u32 *)(r7 +30712) = r0 2066: (61) r0 = *(u32 *)(r7 +25596) 2067: (63) *(u32 *)(r7 +30716) = r0 // store the speculative load address (scalar) this time after the store // forward prediction training. 2068: (7b) *(u64 *)(r10 -16) = r2 // preoccupy the CPU store port by running sequence of dummy stores. 2069: (63) *(u32 *)(r7 +29696) = r0 2070: (63) *(u32 *)(r7 +29700) = r0 2071: (63) *(u32 *)(r7 +29704) = r0 2072: (63) *(u32 *)(r7 +29708) = r0 2073: (63) *(u32 *)(r7 +29712) = r0 2074: (63) *(u32 *)(r7 +29716) = r0 2075: (63) *(u32 *)(r7 +29720) = r0 2076: (63) *(u32 *)(r7 +29724) = r0 2077: (63) *(u32 *)(r7 +29728) = r0 2078: (63) *(u32 *)(r7 +29732) = r0 2079: (63) *(u32 *)(r7 +29736) = r0 2080: (63) *(u32 *)(r7 +29740) = r0 2081: (63) *(u32 *)(r7 +29744) = r0 2082: (63) *(u32 *)(r7 +29748) = r0 2083: (63) *(u32 *)(r7 +29752) = r0 2084: (63) *(u32 *)(r7 +29756) = r0 2085: (63) *(u32 *)(r7 +29760) = r0 2086: (63) *(u32 *)(r7 +29764) = r0 2087: (63) *(u32 *)(r7 +29768) = r0 2088: (63) *(u32 *)(r7 +29772) = r0 2089: (63) *(u32 *)(r7 +29776) = r0 2090: (63) *(u32 *)(r7 +29780) = r0 2091: (63) *(u32 *)(r7 +29784) = r0 2092: (63) *(u32 *)(r7 +29788) = r0 2093: (63) *(u32 *)(r7 +29792) = r0 2094: (63) *(u32 *)(r7 +29796) = r0 2095: (63) *(u32 *)(r7 +29800) = r0 2096: (63) *(u32 *)(r7 +29804) = r0 2097: (63) *(u32 *)(r7 +29808) = r0 2098: (63) *(u32 *)(r7 +29812) = r0 // overwrite scalar with dummy pointer; same as before, also including the // sanitation store with 0 from the current mitigation by the verifier. 2099: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here 2100: (7b) *(u64 *)(r10 -16) = r7 | since store unit is still busy. // load from stack intended to bypass stores. 2101: (79) r2 = *(u64 *)(r10 -16) 2102: (71) r3 = *(u8 *)(r2 +0) // leak r3 [...] Looking at the CPU microarchitecture, the scheduler might issue loads (such as seen in line 2101) before stores (line 2099,2100) because the load execution units become available while the store execution unit is still busy with the sequence of dummy stores (line 2069-2098). And so the load may use the prior stored scalar from r2 at address r10 -16 for speculation. The updated attack may work less reliable on CPU microarchitectures where loads and stores share execution resources. This concludes that the sanitizing with zero stores from af86ca4e3088 ("bpf: Prevent memory disambiguation attack") is insufficient. Moreover, the detection of stack reuse from af86ca4e3088 where previously data (STACK_MISC) has been written to a given stack slot where a pointer value is now to be stored does not have sufficient coverage as precondition for the mitigation either; for several reasons outlined as follows: 1) Stack content from prior program runs could still be preserved and is therefore not "random", best example is to split a speculative store bypass attack between tail calls, program A would prepare and store the oob address at a given stack slot and then tail call into program B which does the "slow" store of a pointer to the stack with subsequent "fast" read. From program B PoV such stack slot type is STACK_INVALID, and therefore also must be subject to mitigation. 2) The STACK_SPILL must not be coupled to register_is_const(&stack->spilled_ptr) condition, for example, the previous content of that memory location could also be a pointer to map or map value. Without the fix, a speculative store bypass is not mitigated in such precondition and can then lead to a type confusion in the speculative domain leaking kernel memory near these pointer types. While brainstorming on various alternative mitigation possibilities, we also stumbled upon a retrospective from Chrome developers [0]: [...] For variant 4, we implemented a mitigation to zero the unused memory of the heap prior to allocation, which cost about 1% when done concurrently and 4% for scavenging. Variant 4 defeats everything we could think of. We explored more mitigations for variant 4 but the threat proved to be more pervasive and dangerous than we anticipated. For example, stack slots used by the register allocator in the optimizing compiler could be subject to type confusion, leading to pointer crafting. Mitigating type confusion for stack slots alone would have required a complete redesign of the backend of the optimizing compiler, perhaps man years of work, without a guarantee of completeness. [...] From BPF side, the problem space is reduced, however, options are rather limited. One idea that has been explored was to xor-obfuscate pointer spills to the BPF stack: [...] // preoccupy the CPU store port by running sequence of dummy stores. [...] 2106: (63) *(u32 *)(r7 +29796) = r0 2107: (63) *(u32 *)(r7 +29800) = r0 2108: (63) *(u32 *)(r7 +29804) = r0 2109: (63) *(u32 *)(r7 +29808) = r0 2110: (63) *(u32 *)(r7 +29812) = r0 // overwrite scalar with dummy pointer; xored with random 'secret' value // of 943576462 before store ... 2111: (b4) w11 = 943576462 2112: (af) r11 ^= r7 2113: (7b) *(u64 *)(r10 -16) = r11 2114: (79) r11 = *(u64 *)(r10 -16) 2115: (b4) w2 = 943576462 2116: (af) r2 ^= r11 // ... and restored with the same 'secret' value with the help of AX reg. 2117: (71) r3 = *(u8 *)(r2 +0) [...] While the above would not prevent speculation, it would make data leakage infeasible by directing it to random locations. In order to be effective and prevent type confusion under speculation, such random secret would have to be regenerated for each store. The additional complexity involved for a tracking mechanism that prevents jumps such that restoring spilled pointers would not get corrupted is not worth the gain for unprivileged. Hence, the fix in here eventually opted for emitting a non-public BPF_ST | BPF_NOSPEC instruction which the x86 JIT translates into a lfence opcode. Inserting the latter in between the store and load instruction is one of the mitigations options [1]. The x86 instruction manual notes: [...] An LFENCE that follows an instruction that stores to memory might complete before the data being stored have become globally visible. [...] The latter meaning that the preceding store instruction finished execution and the store is at minimum guaranteed to be in the CPU's store queue, but it's not guaranteed to be in that CPU's L1 cache at that point (globally visible). The latter would only be guaranteed via sfence. So the load which is guaranteed to execute after the lfence for that local CPU would have to rely on store-to-load forwarding. [2], in section 2.3 on store buffers says: [...] For every store operation that is added to the ROB, an entry is allocated in the store buffer. This entry requires both the virtual and physical address of the target. Only if there is no free entry in the store buffer, the frontend stalls until there is an empty slot available in the store buffer again. Otherwise, the CPU can immediately continue adding subsequent instructions to the ROB and execute them out of order. On Intel CPUs, the store buffer has up to 56 entries. [...] One small upside on the fix is that it lifts constraints from af86ca4e3088 where the sanitize_stack_off relative to r10 must be the same when coming from different paths. The BPF_ST | BPF_NOSPEC gets emitted after a BPF_STX or BPF_ST instruction. This happens either when we store a pointer or data value to the BPF stack for the first time, or upon later pointer spills. The former needs to be enforced since otherwise stale stack data could be leaked under speculation as outlined earlier. For non-x86 JITs the BPF_ST | BPF_NOSPEC mapping is currently optimized away, but others could emit a speculation barrier as well if necessary. For real-world unprivileged programs e.g. generated by LLVM, pointer spill/fill is only generated upon register pressure and LLVM only tries to do that for pointers which are not used often. The program main impact will be the initial BPF_ST | BPF_NOSPEC sanitation for the STACK_INVALID case when the first write to a stack slot occurs e.g. upon map lookup. In future we might refine ways to mitigate the latter cost. [0] https://arxiv.org/pdf/1902.05178.pdf [1] https://msrc-blog.microsoft.com/2018/05/21/analysis-and-mitigation-of-speculative-store-bypass-cve-2018-3639/ [2] https://arxiv.org/pdf/1905.05725.pdf Fixes: af86ca4e3088 ("bpf: Prevent memory disambiguation attack") Fixes: f7cf25b2026d ("bpf: track spill/fill of constants") Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7ba7e800d472..828d08afeee0 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -340,8 +340,8 @@ struct bpf_insn_aux_data { }; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ - int sanitize_stack_off; /* stack slot to be cleared */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ + bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ u8 alu_state; /* used in combination with alu_limit */ -- cgit v1.2.3 From 40e159403896f7d55c98f858d0b20fee1d941fa4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Aug 2021 12:21:30 +0100 Subject: mhi: Fix networking tree build. Signed-off-by: David S. Miller --- include/linux/mhi.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 944aa3aa3035..5e08468854db 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -719,8 +719,13 @@ void mhi_device_put(struct mhi_device *mhi_dev); * host and device execution environments match and * channels are in a DISABLED state. * @mhi_dev: Device associated with the channels + * @flags: MHI channel flags */ -int mhi_prepare_for_transfer(struct mhi_device *mhi_dev); +int mhi_prepare_for_transfer(struct mhi_device *mhi_dev, + unsigned int flags); + +/* Automatically allocate and queue inbound buffers */ +#define MHI_CH_INBOUND_ALLOC_BUFS BIT(0) /** * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer. -- cgit v1.2.3 From 1c69d7cf4a8b6b6cfd920a1e809f1cd33ae4369c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Aug 2021 07:30:29 -0700 Subject: Revert "mhi: Fix networking tree build." This reverts commit 40e159403896f7d55c98f858d0b20fee1d941fa4. Looks like this commit breaks the build for me. Signed-off-by: Jakub Kicinski --- include/linux/mhi.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 5e08468854db..944aa3aa3035 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -719,13 +719,8 @@ void mhi_device_put(struct mhi_device *mhi_dev); * host and device execution environments match and * channels are in a DISABLED state. * @mhi_dev: Device associated with the channels - * @flags: MHI channel flags */ -int mhi_prepare_for_transfer(struct mhi_device *mhi_dev, - unsigned int flags); - -/* Automatically allocate and queue inbound buffers */ -#define MHI_CH_INBOUND_ALLOC_BUFS BIT(0) +int mhi_prepare_for_transfer(struct mhi_device *mhi_dev); /** * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer. -- cgit v1.2.3 From ce78ffa3ef1681065ba451cfd545da6126f5ca88 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 3 Aug 2021 11:14:03 +0100 Subject: net: really fix the build... Signed-off-by: David S. Miller --- include/linux/mhi.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 944aa3aa3035..5e08468854db 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -719,8 +719,13 @@ void mhi_device_put(struct mhi_device *mhi_dev); * host and device execution environments match and * channels are in a DISABLED state. * @mhi_dev: Device associated with the channels + * @flags: MHI channel flags */ -int mhi_prepare_for_transfer(struct mhi_device *mhi_dev); +int mhi_prepare_for_transfer(struct mhi_device *mhi_dev, + unsigned int flags); + +/* Automatically allocate and queue inbound buffers */ +#define MHI_CH_INBOUND_ALLOC_BUFS BIT(0) /** * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer. -- cgit v1.2.3 From 5f7b51bf09baca8e4f80cbe879536842bafb5f31 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Wed, 28 Jul 2021 17:01:15 +0200 Subject: netfilter: ipset: Limit the maximal range of consecutive elements to add/delete The range size of consecutive elements were not limited. Thus one could define a huge range which may result soft lockup errors due to the long execution time. Now the range size is limited to 2^20 entries. Reported-by: Brad Spengler Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 10279c4830ac..ada1296c87d5 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -196,6 +196,9 @@ struct ip_set_region { u32 elements; /* Number of elements vs timeout */ }; +/* Max range where every element is added/deleted in one step */ +#define IPSET_MAX_RANGE (1<<20) + /* The max revision number supported by any set type + 1 */ #define IPSET_REVISION_MAX 9 -- cgit v1.2.3 From 1027b96ec9d34f9abab69bc1a4dc5b1ad8ab1349 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 6 Aug 2021 16:21:24 +0800 Subject: once: Fix panic when module unload DO_ONCE DEFINE_STATIC_KEY_TRUE(___once_key); __do_once_done once_disable_jump(once_key); INIT_WORK(&w->work, once_deferred); struct once_work *w; w->key = key; schedule_work(&w->work); module unload //*the key is destroy* process_one_work once_deferred BUG_ON(!static_key_enabled(work->key)); static_key_count((struct static_key *)x) //*access key, crash* When module uses DO_ONCE mechanism, it could crash due to the above concurrency problem, we could reproduce it with link[1]. Fix it by add/put module refcount in the once work process. [1] https://lore.kernel.org/netdev/eaa6c371-465e-57eb-6be9-f4b16b9d7cbf@huawei.com/ Cc: Hannes Frederic Sowa Cc: Daniel Borkmann Cc: David S. Miller Cc: Eric Dumazet Reported-by: Minmin chen Signed-off-by: Kefeng Wang Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/once.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index 9225ee6d96c7..ae6f4eb41cbe 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -7,7 +7,7 @@ bool __do_once_start(bool *done, unsigned long *flags); void __do_once_done(bool *done, struct static_key_true *once_key, - unsigned long *flags); + unsigned long *flags, struct module *mod); /* Call a function exactly once. The idea of DO_ONCE() is to perform * a function call such as initialization of random seeds, etc, only @@ -46,7 +46,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key, if (unlikely(___ret)) { \ func(__VA_ARGS__); \ __do_once_done(&___done, &___once_key, \ - &___flags); \ + &___flags, THIS_MODULE); \ } \ } \ ___ret; \ -- cgit v1.2.3 From 71330842ff93ae67a066c1fa68d75672527312fa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 9 Aug 2021 21:45:32 +0200 Subject: bpf: Add _kernel suffix to internal lockdown_bpf_read Rename LOCKDOWN_BPF_READ into LOCKDOWN_BPF_READ_KERNEL so we have naming more consistent with a LOCKDOWN_BPF_WRITE_USER option that we are adding. Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko --- include/linux/security.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 24eda04221e9..724d7a4a0c91 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -123,7 +123,7 @@ enum lockdown_reason { LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, - LOCKDOWN_BPF_READ, + LOCKDOWN_BPF_READ_KERNEL, LOCKDOWN_PERF, LOCKDOWN_TRACEFS, LOCKDOWN_XMON_RW, -- cgit v1.2.3 From 563476ae0c5e48a028cbfa38fa9d2fc0418eb88f Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 11 Apr 2021 15:32:55 +0300 Subject: net/mlx5: Synchronize correct IRQ when destroying CQ The CQ destroy is performed based on the IRQ number that is stored in cq->irqn. That number wasn't set explicitly during CQ creation and as expected some of the API users of mlx5_core_create_cq() forgot to update it. This caused to wrong synchronization call of the wrong IRQ with a number 0 instead of the real one. As a fix, set the IRQ number directly in the mlx5_core_create_cq() and update all users accordingly. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Fixes: ef1659ade359 ("IB/mlx5: Add DEVX support for CQ events") Signed-off-by: Shay Drory Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1efe37466969..25a8be58d289 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1044,8 +1044,7 @@ void mlx5_unregister_debugfs(void); void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas); void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm); void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas); -int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, - unsigned int *irqn); +int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn); int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); -- cgit v1.2.3 From 51e1bb9eeaf7868db56e58f47848e364ab4c4129 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 9 Aug 2021 12:43:17 +0200 Subject: bpf: Add lockdown check for probe_write_user helper Back then, commit 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") added the bpf_probe_write_user() helper in order to allow to override user space memory. Its original goal was to have a facility to "debug, divert, and manipulate execution of semi-cooperative processes" under CAP_SYS_ADMIN. Write to kernel was explicitly disallowed since it would otherwise tamper with its integrity. One use case was shown in cf9b1199de27 ("samples/bpf: Add test/example of using bpf_probe_write_user bpf helper") where the program DNATs traffic at the time of connect(2) syscall, meaning, it rewrites the arguments to a syscall while they're still in userspace, and before the syscall has a chance to copy the argument into kernel space. These days we have better mechanisms in BPF for achieving the same (e.g. for load-balancers), but without having to write to userspace memory. Of course the bpf_probe_write_user() helper can also be used to abuse many other things for both good or bad purpose. Outside of BPF, there is a similar mechanism for ptrace(2) such as PTRACE_PEEK{TEXT,DATA} and PTRACE_POKE{TEXT,DATA}, but would likely require some more effort. Commit 96ae52279594 explicitly dedicated the helper for experimentation purpose only. Thus, move the helper's availability behind a newly added LOCKDOWN_BPF_WRITE_USER lockdown knob so that the helper is disabled under the "integrity" mode. More fine-grained control can be implemented also from LSM side with this change. Fixes: 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko --- include/linux/security.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 724d7a4a0c91..5b7288521300 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -120,6 +120,7 @@ enum lockdown_reason { LOCKDOWN_MMIOTRACE, LOCKDOWN_DEBUGFS, LOCKDOWN_XMON_WR, + LOCKDOWN_BPF_WRITE_USER, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, -- cgit v1.2.3 From a2baf4e8bb0f306fbed7b5e6197c02896a638ab5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 9 Aug 2021 18:04:13 -0700 Subject: bpf: Fix potentially incorrect results with bpf_get_local_storage() Commit b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed a bug for bpf_get_local_storage() helper so different tasks won't mess up with each other's percpu local storage. The percpu data contains 8 slots so it can hold up to 8 contexts (same or different tasks), for 8 different program runs, at the same time. This in general is sufficient. But our internal testing showed the following warning multiple times: [...] warning: WARNING: CPU: 13 PID: 41661 at include/linux/bpf-cgroup.h:193 __cgroup_bpf_run_filter_sock_ops+0x13e/0x180 RIP: 0010:__cgroup_bpf_run_filter_sock_ops+0x13e/0x180 tcp_call_bpf.constprop.99+0x93/0xc0 tcp_conn_request+0x41e/0xa50 ? tcp_rcv_state_process+0x203/0xe00 tcp_rcv_state_process+0x203/0xe00 ? sk_filter_trim_cap+0xbc/0x210 ? tcp_v6_inbound_md5_hash.constprop.41+0x44/0x160 tcp_v6_do_rcv+0x181/0x3e0 tcp_v6_rcv+0xc65/0xcb0 ip6_protocol_deliver_rcu+0xbd/0x450 ip6_input_finish+0x11/0x20 ip6_input+0xb5/0xc0 ip6_sublist_rcv_finish+0x37/0x50 ip6_sublist_rcv+0x1dc/0x270 ipv6_list_rcv+0x113/0x140 __netif_receive_skb_list_core+0x1a0/0x210 netif_receive_skb_list_internal+0x186/0x2a0 gro_normal_list.part.170+0x19/0x40 napi_complete_done+0x65/0x150 mlx5e_napi_poll+0x1ae/0x680 __napi_poll+0x25/0x120 net_rx_action+0x11e/0x280 __do_softirq+0xbb/0x271 irq_exit_rcu+0x97/0xa0 common_interrupt+0x7f/0xa0 asm_common_interrupt+0x1e/0x40 RIP: 0010:bpf_prog_1835a9241238291a_tw_egress+0x5/0xbac ? __cgroup_bpf_run_filter_skb+0x378/0x4e0 ? do_softirq+0x34/0x70 ? ip6_finish_output2+0x266/0x590 ? ip6_finish_output+0x66/0xa0 ? ip6_output+0x6c/0x130 ? ip6_xmit+0x279/0x550 ? ip6_dst_check+0x61/0xd0 [...] Using drgn [0] to dump the percpu buffer contents showed that on this CPU slot 0 is still available, but slots 1-7 are occupied and those tasks in slots 1-7 mostly don't exist any more. So we might have issues in bpf_cgroup_storage_unset(). Further debugging confirmed that there is a bug in bpf_cgroup_storage_unset(). Currently, it tries to unset "current" slot with searching from the start. So the following sequence is possible: 1. A task is running and claims slot 0 2. Running BPF program is done, and it checked slot 0 has the "task" and ready to reset it to NULL (not yet). 3. An interrupt happens, another BPF program runs and it claims slot 1 with the *same* task. 4. The unset() in interrupt context releases slot 0 since it matches "task". 5. Interrupt is done, the task in process context reset slot 0. At the end, slot 1 is not reset and the same process can continue to occupy slots 2-7 and finally, when the above step 1-5 is repeated again, step 3 BPF program won't be able to claim an empty slot and a warning will be issued. To fix the issue, for unset() function, we should traverse from the last slot to the first. This way, the above issue can be avoided. The same reverse traversal should also be done in bpf_get_local_storage() helper itself. Otherwise, incorrect local storage may be returned to BPF program. [0] https://github.com/osandov/drgn Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210810010413.1976277-1-yhs@fb.com --- include/linux/bpf-cgroup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8b77d08d4b47..6c9b10d82c80 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -201,8 +201,8 @@ static inline void bpf_cgroup_storage_unset(void) { int i; - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) + for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { + if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) continue; this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); -- cgit v1.2.3 From 77e89afc25f30abd56e76a809ee2884d7c1b63ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:47 +0200 Subject: PCI/MSI: Protect msi_desc::masked for multi-MSI Multi-MSI uses a single MSI descriptor and there is a single mask register when the device supports per vector masking. To avoid reading back the mask register the value is cached in the MSI descriptor and updates are done by clearing and setting bits in the cache and writing it to the device. But nothing protects msi_desc::masked and the mask register from being modified concurrently on two different CPUs for two different Linux interrupts which belong to the same multi-MSI descriptor. Add a lock to struct device and protect any operation on the mask and the mask register with it. This makes the update of msi_desc::masked unconditional, but there is no place which requires a modification of the hardware register without updating the masked cache. msi_mask_irq() is now an empty wrapper which will be cleaned up in follow up changes. The problem goes way back to the initial support of multi-MSI, but picking the commit which introduced the mask cache is a valid cut off point (2.6.30). Fixes: f2440d9acbe8 ("PCI MSI: Refactor interrupt masking code") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.726833414@linutronix.de --- include/linux/device.h | 1 + include/linux/msi.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 59940f1744c1..e53aa5065f58 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -506,6 +506,7 @@ struct device { struct dev_pin_info *pins; #endif #ifdef CONFIG_GENERIC_MSI_IRQ + raw_spinlock_t msi_lock; struct list_head msi_list; #endif #ifdef CONFIG_DMA_OPS diff --git a/include/linux/msi.h b/include/linux/msi.h index 6aff469e511d..e8bdcb83172b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -233,7 +233,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag); -u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); +void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); -- cgit v1.2.3 From 826da771291fc25a428e871f9e7fb465e390f852 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:48 +0200 Subject: genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP X86 IO/APIC and MSI interrupts (when used without interrupts remapping) require that the affinity setup on startup is done before the interrupt is enabled for the first time as the non-remapped operation mode cannot safely migrate enabled interrupts from arbitrary contexts. Provide a new irq chip flag which allows affected hardware to request this. This has to be opt-in because there have been reports in the past that some interrupt chips cannot handle affinity setting before startup. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.779791738@linutronix.de --- include/linux/irq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 8e9a9ae471a6..c8293c817646 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -569,6 +569,7 @@ struct irq_chip { * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs * in the suspend path if they are in disabled state + * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -581,6 +582,7 @@ enum { IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_SUPPORTS_NMI = (1 << 8), IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9), + IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), }; #include -- cgit v1.2.3 From b69dd5b3780a7298bd893816a09da751bc0636f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Aug 2021 12:57:15 -0700 Subject: net: igmp: increase size of mr_ifc_count Some arches support cmpxchg() on 4-byte and 8-byte only. Increase mr_ifc_count width to 32bit to fix this problem. Fixes: 4a2b285e7e10 ("net: igmp: fix data-race in igmp_ifc_timer_expire()") Signed-off-by: Eric Dumazet Reported-by: Guenter Roeck Link: https://lore.kernel.org/r/20210811195715.3684218-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 53aa0343bf69..aaf4f1b4c277 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -41,7 +41,7 @@ struct in_device { unsigned long mr_qri; /* Query Response Interval */ unsigned char mr_qrv; /* Query Robustness Variable */ unsigned char mr_gq_running; - unsigned char mr_ifc_count; + u32 mr_ifc_count; struct timer_list mr_gq_timer; /* general query timer */ struct timer_list mr_ifc_timer; /* interface change timer */ -- cgit v1.2.3 From 7a3dc4f35bf8e1a07e5c3f8ecc8ac923f48493fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Aug 2021 12:36:14 +0200 Subject: driver core: Add missing kernel doc for device::msi_lock Fixes: 77e89afc25f3 ("PCI/MSI: Protect msi_desc::masked for multi-MSI") Reported-by: Stephen Rothwell Signed-off-by: Thomas Gleixner --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index e53aa5065f58..65d84b67b024 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -407,6 +407,7 @@ struct dev_links_info { * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pin-control.rst for details. + * @msi_lock: Lock to protect MSI mask cache and mask register * @msi_list: Hosts MSI descriptors * @msi_domain: The generic MSI domain this device is using. * @numa_node: NUMA node this device is close to. -- cgit v1.2.3 From 90e7a6de62781c27d6a111fccfb19b807f9b6887 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 24 Aug 2021 17:25:29 +0300 Subject: lib/scatterlist: Provide a dedicated function to support table append RDMA is the only in-kernel user that uses __sg_alloc_table_from_pages to append pages dynamically. In the next patch. That mode will be extended and that function will get more parameters. So separate it into a unique function to make such change more clear. Link: https://lore.kernel.org/r/20210824142531.3877007-2-maorg@nvidia.com Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/scatterlist.h | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index ecf87484814f..5c700f2a0d18 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -285,14 +285,45 @@ void sg_free_table(struct sg_table *); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); -struct scatterlist *__sg_alloc_table_from_pages(struct sg_table *sgt, +struct scatterlist *sg_alloc_append_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, struct scatterlist *prv, unsigned int left_pages, gfp_t gfp_mask); -int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, - unsigned int n_pages, unsigned int offset, - unsigned long size, gfp_t gfp_mask); +int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, + unsigned int n_pages, unsigned int offset, + unsigned long size, + unsigned int max_segment, gfp_t gfp_mask); + +/** + * sg_alloc_table_from_pages - Allocate and initialize an sg table from + * an array of pages + * @sgt: The sg table header to use + * @pages: Pointer to an array of page pointers + * @n_pages: Number of pages in the pages array + * @offset: Offset from start of the first page to the start of a buffer + * @size: Number of valid bytes in the buffer (after offset) + * @gfp_mask: GFP allocation mask + * + * Description: + * Allocate and initialize an sg table from a list of pages. Contiguous + * ranges of the pages are squashed into a single scatterlist node. A user + * may provide an offset at a start and a size of valid data in a buffer + * specified by the page array. The returned sg table is released by + * sg_free_table. + * + * Returns: + * 0 on success, negative error on failure + */ +static inline int sg_alloc_table_from_pages(struct sg_table *sgt, + struct page **pages, + unsigned int n_pages, + unsigned int offset, + unsigned long size, gfp_t gfp_mask) +{ + return sg_alloc_table_from_pages_segment(sgt, pages, n_pages, offset, + size, UINT_MAX, gfp_mask); +} #ifdef CONFIG_SGL_ALLOC struct scatterlist *sgl_alloc_order(unsigned long long length, -- cgit v1.2.3 From 3e302dbc6774a27edaea39a1d5107f0c12e35cf2 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 24 Aug 2021 17:25:30 +0300 Subject: lib/scatterlist: Fix wrong update of orig_nents orig_nents should represent the number of entries with pages, but __sg_alloc_table_from_pages sets orig_nents as the number of total entries in the table. This is wrong when the API is used for dynamic allocation where not all the table entries are mapped with pages. It wasn't observed until now, since RDMA umem who uses this API in the dynamic form doesn't use orig_nents implicit or explicit by the scatterlist APIs. Fix it by changing the append API to track the SG append table state and have an API to free the append table according to the total number of entries in the table. Now all APIs set orig_nents as number of enries with pages. Fixes: 07da1223ec93 ("lib/scatterlist: Add support in dynamic allocation of SG table from pages") Link: https://lore.kernel.org/r/20210824142531.3877007-3-maorg@nvidia.com Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/scatterlist.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 5c700f2a0d18..266754a55327 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -39,6 +39,12 @@ struct sg_table { unsigned int orig_nents; /* original size of list */ }; +struct sg_append_table { + struct sg_table sgt; /* The scatter list table */ + struct scatterlist *prv; /* last populated sge in the table */ + unsigned int total_nents; /* Total entries in the table */ +}; + /* * Notes on SG table design. * @@ -280,16 +286,17 @@ typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t); typedef void (sg_free_fn)(struct scatterlist *, unsigned int); void __sg_free_table(struct sg_table *, unsigned int, unsigned int, - sg_free_fn *); + sg_free_fn *, unsigned int); void sg_free_table(struct sg_table *); +void sg_free_append_table(struct sg_append_table *sgt); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); -struct scatterlist *sg_alloc_append_table_from_pages(struct sg_table *sgt, - struct page **pages, unsigned int n_pages, unsigned int offset, - unsigned long size, unsigned int max_segment, - struct scatterlist *prv, unsigned int left_pages, - gfp_t gfp_mask); +int sg_alloc_append_table_from_pages(struct sg_append_table *sgt, + struct page **pages, unsigned int n_pages, + unsigned int offset, unsigned long size, + unsigned int max_segment, + unsigned int left_pages, gfp_t gfp_mask); int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, -- cgit v1.2.3