From d92aabca4df182763cd541d342f2d55f8c0a827c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 25 Jul 2020 21:15:20 -0700 Subject: firmware: bcm47xx_sprom: Fix -Wmissing-prototypes warnings bcm47xx_sprom.h did not include a prototype for bcm47xx_fill_sprom() therefore add one, and make sure we do include that header to fix -Wmissing-prototypes warnings. Reported-by: kernel test robot Signed-off-by: Florian Fainelli Signed-off-by: Thomas Bogendoerfer --- include/linux/bcm47xx_sprom.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcm47xx_sprom.h b/include/linux/bcm47xx_sprom.h index b0f4424f34fc..f8254fd53e15 100644 --- a/include/linux/bcm47xx_sprom.h +++ b/include/linux/bcm47xx_sprom.h @@ -9,9 +9,19 @@ #include #include +struct ssb_sprom; + #ifdef CONFIG_BCM47XX_SPROM +void bcm47xx_fill_sprom(struct ssb_sprom *sprom, const char *prefix, + bool fallback); int bcm47xx_sprom_register_fallbacks(void); #else +static inline void bcm47xx_fill_sprom(struct ssb_sprom *sprom, + const char *prefix, + bool fallback) +{ +} + static inline int bcm47xx_sprom_register_fallbacks(void) { return -ENOTSUPP; -- cgit v1.2.3 From c2fe8ebb332eefb3d0543b248e28dd2992c04793 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 4 Aug 2020 21:26:42 +0200 Subject: clk: samsung: s3c64xx: declare s3c64xx_clk_init() in shared header The s3c64xx_clk_init() is defined and used by the clk-s3c64xx driver and also used in the mach-s3c64xx machine code. Move the declaration to a header to fix W=1 build warning: drivers/clk/samsung/clk-s3c64xx.c:391:13: warning: no previous prototype for 's3c64xx_clk_init' [-Wmissing-prototypes] 391 | void __init s3c64xx_clk_init(struct device_node *np, unsigned long xtal_f, Signed-off-by: Krzysztof Kozlowski Reviewed-by: Tomasz Figa Acked-by: Chanwoo Choi Reviewed-by: Stephen Boyd --- include/linux/clk/samsung.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 include/linux/clk/samsung.h (limited to 'include/linux') diff --git a/include/linux/clk/samsung.h b/include/linux/clk/samsung.h new file mode 100644 index 000000000000..7a0824b22eed --- /dev/null +++ b/include/linux/clk/samsung.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020 Krzysztof Kozlowski + */ + +#ifndef __LINUX_CLK_SAMSUNG_H_ +#define __LINUX_CLK_SAMSUNG_H_ + +#include + +struct device_node; + +#ifdef CONFIG_ARCH_S3C64XX +void s3c64xx_clk_init(struct device_node *np, unsigned long xtal_f, + unsigned long xusbxti_f, bool s3c6400, + void __iomem *base); +#else +static inline void s3c64xx_clk_init(struct device_node *np, + unsigned long xtal_f, + unsigned long xusbxti_f, + bool s3c6400, void __iomem *base) { } +#endif /* CONFIG_ARCH_S3C64XX */ + +#endif /* __LINUX_CLK_SAMSUNG_H_ */ -- cgit v1.2.3 From 16b17fcf77f2145b98cabbca6bfe6ea13c90bb08 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 4 Aug 2020 21:26:43 +0200 Subject: clk: samsung: s3c24xx: declare s3c24xx_common_clk_init() in shared header The s3c2410_common_clk_init() and others are defined and used by the clk-s3c24xx driver and also used in the mach-s3c24xx machine code. 
Move the declaration to a header to fix W=1 build warnings: drivers/clk/samsung/clk-s3c2410.c:320:13: warning: no previous prototype for 's3c2410_common_clk_init' [-Wmissing-prototypes] 320 | void __init s3c2410_common_clk_init(struct device_node *np, unsigned long xti_f, drivers/clk/samsung/clk-s3c2412.c:205:13: warning: no previous prototype for 's3c2412_common_clk_init' [-Wmissing-prototypes] 205 | void __init s3c2412_common_clk_init(struct device_node *np, unsigned long xti_f, drivers/clk/samsung/clk-s3c2443.c:341:13: warning: no previous prototype for 's3c2443_common_clk_init' [-Wmissing-prototypes] 341 | void __init s3c2443_common_clk_init(struct device_node *np, unsigned long xti_f, Signed-off-by: Krzysztof Kozlowski Acked-by: Chanwoo Choi Reviewed-by: Stephen Boyd --- include/linux/clk/samsung.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk/samsung.h b/include/linux/clk/samsung.h index 7a0824b22eed..79097e365f7f 100644 --- a/include/linux/clk/samsung.h +++ b/include/linux/clk/samsung.h @@ -21,4 +21,36 @@ static inline void s3c64xx_clk_init(struct device_node *np, bool s3c6400, void __iomem *base) { } #endif /* CONFIG_ARCH_S3C64XX */ +#ifdef CONFIG_S3C2410_COMMON_CLK +void s3c2410_common_clk_init(struct device_node *np, unsigned long xti_f, + int current_soc, + void __iomem *reg_base); +#else +static inline void s3c2410_common_clk_init(struct device_node *np, + unsigned long xti_f, + int current_soc, + void __iomem *reg_base) { } +#endif /* CONFIG_S3C2410_COMMON_CLK */ + +#ifdef CONFIG_S3C2412_COMMON_CLK +void s3c2412_common_clk_init(struct device_node *np, unsigned long xti_f, + unsigned long ext_f, void __iomem *reg_base); +#else +static inline void s3c2412_common_clk_init(struct device_node *np, + unsigned long xti_f, + unsigned long ext_f, + void __iomem *reg_base) { } +#endif /* CONFIG_S3C2412_COMMON_CLK */ + +#ifdef CONFIG_S3C2443_COMMON_CLK +void s3c2443_common_clk_init(struct device_node *np, unsigned long xti_f, + int current_soc, + void __iomem *reg_base); +#else +static inline void s3c2443_common_clk_init(struct device_node *np, + unsigned long xti_f, + int current_soc, + void __iomem *reg_base) { } +#endif /* CONFIG_S3C2443_COMMON_CLK */ + #endif /* __LINUX_CLK_SAMSUNG_H_ */ -- cgit v1.2.3 From b84e23f5135103c45022b0e4a4ed2459d5398a7e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 6 Aug 2020 20:20:23 +0200 Subject: ARM: s3c24xx: pass pointer to clk driver via platform data Passing pointers directly as platform data is fragile and undocumented. Better to create a platform data structure which explicitly documents what is passed to the driver. 
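For illustration only (not part of this patch): with such a structure, the machine code can hand its existing MISCCR helper to the clock driver through platform data rather than a raw pointer. A minimal sketch, in which the helper name s3c2410_modify_misccr, the device name and the registration call are assumptions:

    #include <linux/init.h>
    #include <linux/platform_data/clk-s3c2410.h>
    #include <linux/platform_device.h>

    /* assumed existing machine helper that read-modify-writes MISCCR */
    static struct s3c2410_clk_platform_data s3c2410_clk_pdata = {
            .modify_misccr = s3c2410_modify_misccr,
    };

    static void __init example_register_clk_pdata(void)
    {
            /* device name "s3c2410-dclk" is assumed for illustration */
            platform_device_register_data(NULL, "s3c2410-dclk", 0,
                                          &s3c2410_clk_pdata,
                                          sizeof(s3c2410_clk_pdata));
    }
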
Suggested-by: Tomasz Figa Signed-off-by: Krzysztof Kozlowski Acked-by: Arnd Bergmann Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20200806182059.2431-6-krzk@kernel.org --- include/linux/platform_data/clk-s3c2410.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/platform_data/clk-s3c2410.h (limited to 'include/linux') diff --git a/include/linux/platform_data/clk-s3c2410.h b/include/linux/platform_data/clk-s3c2410.h new file mode 100644 index 000000000000..7eb1cfa5409b --- /dev/null +++ b/include/linux/platform_data/clk-s3c2410.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020 Krzysztof Kozlowski + */ + +#ifndef __LINUX_PLATFORM_DATA_CLK_S3C2410_H_ +#define __LINUX_PLATFORM_DATA_CLK_S3C2410_H_ + +/** + * struct s3c2410_clk_platform_data - platform data for S3C2410 clock driver + * + * @modify_misccr: Function to modify the MISCCR and return the new value + */ +struct s3c2410_clk_platform_data { + unsigned int (*modify_misccr)(unsigned int clr, unsigned int chg); +}; + +#endif /* __LINUX_PLATFORM_DATA_CLK_S3C2410_H_ */ + -- cgit v1.2.3 From 5f745424761a2a49762625e8616417a8e7694228 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 6 Aug 2020 20:20:26 +0200 Subject: usb: gadget: s3c-hsudc: remove platform header dependency There is no real phy driver, so s3c-hsudc just pokes the registers itself. Improve this a little by making it a platform data callback like we do for gpios. There is only one board using this driver, and it's unlikely that another would be added, so this is a minimal workaround. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20200806182059.2431-9-krzk@kernel.org [krzk: Include regs-s3c2443-clock.h in ifdef to fixup build on s3c6400] Signed-off-by: Krzysztof Kozlowski --- include/linux/platform_data/s3c-hsudc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/s3c-hsudc.h b/include/linux/platform_data/s3c-hsudc.h index 4dc9b8760166..a170939832d5 100644 --- a/include/linux/platform_data/s3c-hsudc.h +++ b/include/linux/platform_data/s3c-hsudc.h @@ -26,6 +26,8 @@ struct s3c24xx_hsudc_platdata { unsigned int epnum; void (*gpio_init)(void); void (*gpio_uninit)(void); + void (*phy_init)(void); + void (*phy_uninit)(void); }; #endif /* __LINUX_USB_S3C_HSUDC_H */ -- cgit v1.2.3 From 17132da70eb766785b9b4677bacce18cc11ea442 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 6 Aug 2020 20:20:33 +0200 Subject: ARM: samsung: move pm check code to drivers/soc This is the only part of plat-samsung that is really shared between the s3c and s5p ports. Moving it to drivers/soc/ lets us make them completely independent. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20200806182059.2431-16-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski --- include/linux/soc/samsung/s3c-pm.h | 84 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 include/linux/soc/samsung/s3c-pm.h (limited to 'include/linux') diff --git a/include/linux/soc/samsung/s3c-pm.h b/include/linux/soc/samsung/s3c-pm.h new file mode 100644 index 000000000000..730bd1d3d09a --- /dev/null +++ b/include/linux/soc/samsung/s3c-pm.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2013 Samsung Electronics Co., Ltd. 
+ * Tomasz Figa + * Copyright (c) 2004 Simtec Electronics + * http://armlinux.simtec.co.uk/ + * Written by Ben Dooks, + */ + +#ifndef __LINUX_SOC_SAMSUNG_S3C_PM_H +#define __LINUX_SOC_SAMSUNG_S3C_PM_H __FILE__ + +#include + +/* PM debug functions */ + +/** + * struct pm_uart_save - save block for core UART + * @ulcon: Save value for S3C2410_ULCON + * @ucon: Save value for S3C2410_UCON + * @ufcon: Save value for S3C2410_UFCON + * @umcon: Save value for S3C2410_UMCON + * @ubrdiv: Save value for S3C2410_UBRDIV + * + * Save block for UART registers to be held over sleep and restored if they + * are needed (say by debug). +*/ +struct pm_uart_save { + u32 ulcon; + u32 ucon; + u32 ufcon; + u32 umcon; + u32 ubrdiv; + u32 udivslot; +}; + +#ifdef CONFIG_SAMSUNG_PM_DEBUG +/** + * s3c_pm_dbg() - low level debug function for use in suspend/resume. + * @msg: The message to print. + * + * This function is used mainly to debug the resume process before the system + * can rely on printk/console output. It uses the low-level debugging output + * routine printascii() to do its work. + */ +extern void s3c_pm_dbg(const char *msg, ...); + +#define S3C_PMDBG(fmt...) s3c_pm_dbg(fmt) + +extern void s3c_pm_save_uarts(bool is_s3c24xx); +extern void s3c_pm_restore_uarts(bool is_s3c24xx); + +#ifdef CONFIG_ARCH_S3C64XX +extern void s3c_pm_arch_update_uart(void __iomem *regs, + struct pm_uart_save *save); +#else +static inline void +s3c_pm_arch_update_uart(void __iomem *regs, struct pm_uart_save *save) +{ +} +#endif + +#else +#define S3C_PMDBG(fmt...) pr_debug(fmt) + +static inline void s3c_pm_save_uarts(bool is_s3c24xx) { } +static inline void s3c_pm_restore_uarts(bool is_s3c24xx) { } +#endif + +/* suspend memory checking */ + +#ifdef CONFIG_SAMSUNG_PM_CHECK +extern void s3c_pm_check_prepare(void); +extern void s3c_pm_check_restore(void); +extern void s3c_pm_check_cleanup(void); +extern void s3c_pm_check_store(void); +#else +#define s3c_pm_check_prepare() do { } while (0) +#define s3c_pm_check_restore() do { } while (0) +#define s3c_pm_check_cleanup() do { } while (0) +#define s3c_pm_check_store() do { } while (0) +#endif + +#endif -- cgit v1.2.3 From 7dbad03ebcb924cde142f7477d65a54ffb1166a3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 6 Aug 2020 20:20:39 +0200 Subject: ARM: s3c: adc: move header to linux/soc/samsung There are multiple drivers using the private adc interface. It seems unlikely that they would ever get converted to iio, so make the current state official by making the header file global. The s3c2410_ts driver needs a couple of register definitions as well. 
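For context, a minimal sketch of a client driver consuming this interface once the header is global; the calls match the declarations added below, while the probe function, the channel number and the error handling style are placeholders:

    #include <linux/err.h>
    #include <linux/platform_device.h>
    #include <linux/soc/samsung/s3c-adc.h>

    static int example_adc_probe(struct platform_device *pdev)
    {
            struct s3c_adc_client *client;
            int val;

            /* no select/convert callbacks: use the synchronous read helper */
            client = s3c_adc_register(pdev, NULL, NULL, 0);
            if (IS_ERR(client))
                    return PTR_ERR(client);

            val = s3c_adc_read(client, 0);  /* channel 0, placeholder */
            s3c_adc_release(client);

            return val < 0 ? val : 0;
    }
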
Signed-off-by: Arnd Bergmann Acked-by: Guenter Roeck Acked-by: Dmitry Torokhov Acked-by: Sebastian Reichel Link: https://lore.kernel.org/r/20200806182059.2431-22-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski --- include/linux/soc/samsung/s3c-adc.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 include/linux/soc/samsung/s3c-adc.h (limited to 'include/linux') diff --git a/include/linux/soc/samsung/s3c-adc.h b/include/linux/soc/samsung/s3c-adc.h new file mode 100644 index 000000000000..591c94ef957d --- /dev/null +++ b/include/linux/soc/samsung/s3c-adc.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2008 Simtec Electronics + * http://armlinux.simtec.co.uk/ + * Ben Dooks + * + * S3C ADC driver information + */ + +#ifndef __LINUX_SOC_SAMSUNG_S3C_ADC_H +#define __LINUX_SOC_SAMSUNG_S3C_ADC_H __FILE__ + +struct s3c_adc_client; +struct platform_device; + +extern int s3c_adc_start(struct s3c_adc_client *client, + unsigned int channel, unsigned int nr_samples); + +extern int s3c_adc_read(struct s3c_adc_client *client, unsigned int ch); + +extern struct s3c_adc_client * + s3c_adc_register(struct platform_device *pdev, + void (*select)(struct s3c_adc_client *client, + unsigned selected), + void (*conv)(struct s3c_adc_client *client, + unsigned d0, unsigned d1, + unsigned *samples_left), + unsigned int is_ts); + +extern void s3c_adc_release(struct s3c_adc_client *client); + +#endif /* __LINUX_SOC_SAMSUNG_S3C_ADC_H */ -- cgit v1.2.3 From f131a4443ea468cd532410c271c229bb39caab08 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 3 Sep 2019 11:31:09 +0200 Subject: ARM: s3c24xx: move spi fiq handler into platform The fiq handler needs access to some register definitions that should not be used directly by device drivers. Since this is closely related to the irqchip driver anyway, move it into the same place. 
Signed-off-by: Arnd Bergmann [krzk: Add a header guard in include/linux/spi/s3c24xx-fiq.h, fix SPDX comment style, update maintainer's entry] Co-developed-by: Krzysztof Kozlowski Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200806182059.2431-23-krzk%40kernel.org Acked-by: Mark Brown --- include/linux/spi/s3c24xx-fiq.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 include/linux/spi/s3c24xx-fiq.h (limited to 'include/linux') diff --git a/include/linux/spi/s3c24xx-fiq.h b/include/linux/spi/s3c24xx-fiq.h new file mode 100644 index 000000000000..d2842ac1de27 --- /dev/null +++ b/include/linux/spi/s3c24xx-fiq.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* linux/drivers/spi/spi_s3c24xx_fiq.h + * + * Copyright 2009 Simtec Electronics + * Ben Dooks + * + * S3C24XX SPI - FIQ pseudo-DMA transfer support +*/ + +#ifndef __LINUX_SPI_S3C24XX_FIQ_H +#define __LINUX_SPI_S3C24XX_FIQ_H __FILE__ + +/* We have R8 through R13 to play with */ + +#ifdef __ASSEMBLY__ +#define __REG_NR(x) r##x +#else + +extern struct spi_fiq_code s3c24xx_spi_fiq_txrx; +extern struct spi_fiq_code s3c24xx_spi_fiq_tx; +extern struct spi_fiq_code s3c24xx_spi_fiq_rx; + +#define __REG_NR(x) (x) +#endif + +#define fiq_rspi __REG_NR(8) +#define fiq_rtmp __REG_NR(9) +#define fiq_rrx __REG_NR(10) +#define fiq_rtx __REG_NR(11) +#define fiq_rcount __REG_NR(12) +#define fiq_rirq __REG_NR(13) + +#endif /* __LINUX_SPI_S3C24XX_FIQ_H */ -- cgit v1.2.3 From b558b6c24068d87ffcd95dad3bf0d9bd2ac290e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 18 Aug 2020 18:07:09 -0700 Subject: net-tun: Add type safety to tun_xdp_to_ptr() and tun_ptr_to_xdp() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reduces likelihood of incorrect use. Test: builds Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200819010710.3959310-1-zenczykowski@gmail.com --- include/linux/if_tun.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 5bda8cf457b6..6c37e1dbc5df 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -28,8 +28,8 @@ struct tun_xdp_hdr { struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); bool tun_is_xdp_frame(void *ptr); -void *tun_xdp_to_ptr(void *ptr); -void *tun_ptr_to_xdp(void *ptr); +void *tun_xdp_to_ptr(struct xdp_frame *xdp); +struct xdp_frame *tun_ptr_to_xdp(void *ptr); void tun_ptr_free(void *ptr); #else #include @@ -48,11 +48,11 @@ static inline bool tun_is_xdp_frame(void *ptr) { return false; } -static inline void *tun_xdp_to_ptr(void *ptr) +static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return NULL; } -static inline void *tun_ptr_to_xdp(void *ptr) +static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return NULL; } -- cgit v1.2.3 From 596b5ef458f903d4362fb3c69967b6d8a23334bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 18 Aug 2020 18:07:10 -0700 Subject: net-tun: Eliminate two tun/xdp related function calls from vhost-net MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This provides a minor performance boost by virtue of inlining instead of cross module function calls. 
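These helpers implement a small pointer-tagging scheme: an xdp_frame pointer gets TUN_XDP_FLAG set in its low bit before it is queued, so a consumer pulling void * entries from the ptr_ring (obtained via tun_get_tx_ring()) can tell XDP frames from sk_buffs. A hedged sketch of the consumer side, with the drain loop and cleanup calls chosen for illustration:

    #include <linux/if_tun.h>
    #include <linux/ptr_ring.h>
    #include <linux/skbuff.h>
    #include <net/xdp.h>

    static void example_drain(struct ptr_ring *ring)
    {
            void *ptr;

            while ((ptr = ptr_ring_consume(ring)) != NULL) {
                    if (tun_is_xdp_frame(ptr)) {
                            struct xdp_frame *frame = tun_ptr_to_xdp(ptr);

                            /* hand the frame to the XDP path ... */
                            xdp_return_frame(frame);
                    } else {
                            struct sk_buff *skb = ptr;

                            /* hand the skb to the normal path ... */
                            kfree_skb(skb);
                    }
            }
    }
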
Test: builds Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200819010710.3959310-2-zenczykowski@gmail.com --- include/linux/if_tun.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 6c37e1dbc5df..2a7660843444 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -27,9 +27,18 @@ struct tun_xdp_hdr { #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); -bool tun_is_xdp_frame(void *ptr); -void *tun_xdp_to_ptr(struct xdp_frame *xdp); -struct xdp_frame *tun_ptr_to_xdp(void *ptr); +static inline bool tun_is_xdp_frame(void *ptr) +{ + return (unsigned long)ptr & TUN_XDP_FLAG; +} +static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) +{ + return (void *)((unsigned long)xdp | TUN_XDP_FLAG); +} +static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) +{ + return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); +} void tun_ptr_free(void *ptr); #else #include -- cgit v1.2.3 From bdfbb63c314a6fb7d27dddd62f5a3e4f062b92bb Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 18 Aug 2020 12:32:43 +0200 Subject: ptp: Add generic ptp v2 header parsing function Reason: A lot of the ptp drivers - which implement hardware time stamping - need specific fields such as the sequence id from the ptp v2 header. Currently all drivers implement that themselves. Introduce a generic function to retrieve a pointer to the start of the ptp v2 header. Suggested-by: Russell King Signed-off-by: Kurt Kanzenbach Reviewed-by: Richard Cochran Reviewed-by: Florian Fainelli Reviewed-by: Russell King Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index dd00fa41f7e7..0a9cc0eb0801 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -44,6 +44,30 @@ #define OFF_IHL 14 #define IPV4_HLEN(data) (((struct iphdr *)(data + OFF_IHL))->ihl << 2) +struct clock_identity { + u8 id[8]; +} __packed; + +struct port_identity { + struct clock_identity clock_identity; + __be16 port_number; +} __packed; + +struct ptp_header { + u8 tsmt; /* transportSpecific | messageType */ + u8 ver; /* reserved | versionPTP */ + __be16 message_length; + u8 domain_number; + u8 reserved1; + u8 flag_field[2]; + __be64 correction; + __be32 reserved2; + struct port_identity source_port_identity; + __be16 sequence_id; + u8 control; + u8 log_message_interval; +} __packed; + #if defined(CONFIG_NET_PTP_CLASSIFY) /** * ptp_classify_raw - classify a PTP packet @@ -57,6 +81,21 @@ */ unsigned int ptp_classify_raw(const struct sk_buff *skb); +/** + * ptp_parse_header - Get pointer to the PTP v2 header + * @skb: packet buffer + * @type: type of the packet (see ptp_classify_raw()) + * + * This function takes care of the VLAN, UDP, IPv4 and IPv6 headers. The length + * is checked. + * + * Note, internally skb_mac_header() is used. Make sure that the @skb is + * initialized accordingly. 
+ * + * Return: Pointer to the ptp v2 header or NULL if not found + */ +struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type); + void __init ptp_classifier_init(void); #else static inline void ptp_classifier_init(void) @@ -66,5 +105,10 @@ static inline unsigned int ptp_classify_raw(struct sk_buff *skb) { return PTP_CLASS_NONE; } +static inline struct ptp_header *ptp_parse_header(struct sk_buff *skb, + unsigned int type) +{ + return NULL; +} #endif #endif /* _PTP_CLASSIFY_H_ */ -- cgit v1.2.3 From 036c508ba95e573f53bd5bbb79b0c4d71003b319 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 18 Aug 2020 12:32:44 +0200 Subject: ptp: Add generic ptp message type function The message type is located at different offsets within the ptp header depending on the ptp version (v1 or v2). Therefore, drivers which also deal with ptp v1 have some code for it. Extract this into a helper function for drivers to be used. Signed-off-by: Kurt Kanzenbach Reviewed-by: Richard Cochran Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index 0a9cc0eb0801..cfc6d9152f69 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -96,6 +96,31 @@ unsigned int ptp_classify_raw(const struct sk_buff *skb); */ struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type); +/** + * ptp_get_msgtype - Extract ptp message type from given header + * @hdr: ptp header + * @type: type of the packet (see ptp_classify_raw()) + * + * This function returns the message type for a given ptp header. It takes care + * of the different ptp header versions (v1 or v2). + * + * Return: The message type + */ +static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, + unsigned int type) +{ + u8 msgtype; + + if (unlikely(type & PTP_CLASS_V1)) { + /* msg type is located at the control field for ptp v1 */ + msgtype = hdr->control; + } else { + msgtype = hdr->tsmt & 0x0f; + } + + return msgtype; +} + void __init ptp_classifier_init(void); #else static inline void ptp_classifier_init(void) -- cgit v1.2.3 From 17060fb5069f2c73a566015ce6e019d5685680b5 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 18 Aug 2020 12:32:51 +0200 Subject: ptp: Remove unused macro The offset for the control field is not needed anymore. Remove it. Signed-off-by: Kurt Kanzenbach Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index cfc6d9152f69..8437307cca8c 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -36,7 +36,6 @@ #define OFF_PTP_SOURCE_UUID 22 /* PTPv1 only */ #define OFF_PTP_SEQUENCE_ID 30 -#define OFF_PTP_CONTROL 32 /* PTPv1 only */ /* Below defines should actually be removed at some point in time. */ #define IP6_HLEN 40 -- cgit v1.2.3 From 005142b8a1f0f32d33fbe04b728464c1b7acfa0e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 18 Aug 2020 21:27:56 -0700 Subject: bpf: Factor out bpf_link_by_id() helper. Refactor the code a bit to extract bpf_link_by_id() helper. It's similar to existing bpf_prog_by_id(). 
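A hedged sketch of a caller, mirroring the bpf_prog_by_id() pattern: the helper returns a referenced link or an ERR_PTR, and the reference is dropped with bpf_link_put() when done (the wrapper function is illustrative):

    #include <linux/bpf.h>
    #include <linux/err.h>

    static int example_inspect_link(u32 id)
    {
            struct bpf_link *link;

            link = bpf_link_by_id(id);
            if (IS_ERR(link))
                    return PTR_ERR(link);

            /* ... inspect the link, e.g. its id or attached prog ... */

            bpf_link_put(link);
            return 0;
    }
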
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200819042759.51280-2-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 55f694b63164..a9b7185a6b37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1358,6 +1358,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); struct bpf_prog *bpf_prog_by_id(u32 id); +struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); #else /* !CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From f67f6c00c7f367fe90f2bc01b9a977aa13de870e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 6 Aug 2020 20:20:46 +0200 Subject: ARM: s3c24xx: move s3cmci pinctrl handling into board files Rather than call the internal s3c_gpio_cfgall_range() function through a platform header, move the code into the set_power callback that is already exported by the board, and add a default implementation. In DT mode, the code already does not set the pin config, so nothing changes there. Signed-off-by: Arnd Bergmann Acked-by: Ulf Hansson Link: https://lore.kernel.org/r/20200806182059.2431-29-krzk@kernel.org [krzk: Rebase and correct set_power in mach-h1940.c] Signed-off-by: Krzysztof Kozlowski --- include/linux/platform_data/mmc-s3cmci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/mmc-s3cmci.h b/include/linux/platform_data/mmc-s3cmci.h index 33310b11cbdd..bacb86db3112 100644 --- a/include/linux/platform_data/mmc-s3cmci.h +++ b/include/linux/platform_data/mmc-s3cmci.h @@ -35,6 +35,7 @@ struct s3c24xx_mci_pdata { unsigned long ocr_avail; void (*set_power)(unsigned char power_mode, unsigned short vdd); + struct gpio_desc *bus[6]; }; /** @@ -44,6 +45,7 @@ struct s3c24xx_mci_pdata { * Copy the platform data supplied by @pdata so that this can be marked * __initdata. */ +extern void s3c24xx_mci_def_set_power(unsigned char power_mode, unsigned short vdd); extern void s3c24xx_mci_set_platdata(struct s3c24xx_mci_pdata *pdata); #endif /* _ARCH_NCI_H */ -- cgit v1.2.3 From cd4bd8f9435ddf08a8677f56abf418423f223959 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 6 Aug 2020 20:20:48 +0200 Subject: ARM: s3c24xx: spi: avoid hardcoding fiq number in driver The IRQ_EINT0 constant is a platform detail that is defined in mach/irqs.h and not visible to drivers once that header is made private. Since the same calculation already happens in s3c24xx_set_fiq, just return the value from there. 
Signed-off-by: Arnd Bergmann Acked-by: Mark Brown Link: https://lore.kernel.org/r/20200806182059.2431-31-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski --- include/linux/spi/s3c24xx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/s3c24xx.h b/include/linux/spi/s3c24xx.h index c91d10b82f08..440a71593162 100644 --- a/include/linux/spi/s3c24xx.h +++ b/include/linux/spi/s3c24xx.h @@ -20,6 +20,6 @@ struct s3c2410_spi_info { void (*set_cs)(struct s3c2410_spi_info *spi, int cs, int pol); }; -extern int s3c24xx_set_fiq(unsigned int irq, bool on); +extern int s3c24xx_set_fiq(unsigned int irq, u32 *ack_ptr, bool on); #endif /* __LINUX_SPI_S3C24XX_H */ -- cgit v1.2.3 From 81994e0ffc373e67ace4c98797c35f8213f07753 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 2 Sep 2019 22:33:24 +0200 Subject: fbdev: s3c2410fb: remove mach header dependency The s3c2410fb driver is too deeply intertwined with the s3c24xx platform code. Change it in a way that avoids the use of platform header files but having all interface data in a platform_data header, and the private register definitions next to the driver itself. One ugly bit here is that the driver pokes directly into gpio registers, which are owned by another driver. Passing the mapped addresses in platform_data is somewhat suboptimal, but it is a small improvement over the previous version. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20200806182059.2431-33-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski --- include/linux/platform_data/fb-s3c2410.h | 99 ++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 include/linux/platform_data/fb-s3c2410.h (limited to 'include/linux') diff --git a/include/linux/platform_data/fb-s3c2410.h b/include/linux/platform_data/fb-s3c2410.h new file mode 100644 index 000000000000..10c11e6316d6 --- /dev/null +++ b/include/linux/platform_data/fb-s3c2410.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2004 Arnaud Patard + * + * Inspired by pxafb.h +*/ + +#ifndef __ASM_PLAT_FB_S3C2410_H +#define __ASM_PLAT_FB_S3C2410_H __FILE__ + +#include + +struct s3c2410fb_hw { + unsigned long lcdcon1; + unsigned long lcdcon2; + unsigned long lcdcon3; + unsigned long lcdcon4; + unsigned long lcdcon5; +}; + +/* LCD description */ +struct s3c2410fb_display { + /* LCD type */ + unsigned type; +#define S3C2410_LCDCON1_DSCAN4 (0<<5) +#define S3C2410_LCDCON1_STN4 (1<<5) +#define S3C2410_LCDCON1_STN8 (2<<5) +#define S3C2410_LCDCON1_TFT (3<<5) + +#define S3C2410_LCDCON1_TFT1BPP (8<<1) +#define S3C2410_LCDCON1_TFT2BPP (9<<1) +#define S3C2410_LCDCON1_TFT4BPP (10<<1) +#define S3C2410_LCDCON1_TFT8BPP (11<<1) +#define S3C2410_LCDCON1_TFT16BPP (12<<1) +#define S3C2410_LCDCON1_TFT24BPP (13<<1) + + /* Screen size */ + unsigned short width; + unsigned short height; + + /* Screen info */ + unsigned short xres; + unsigned short yres; + unsigned short bpp; + + unsigned pixclock; /* pixclock in picoseconds */ + unsigned short left_margin; /* value in pixels (TFT) or HCLKs (STN) */ + unsigned short right_margin; /* value in pixels (TFT) or HCLKs (STN) */ + unsigned short hsync_len; /* value in pixels (TFT) or HCLKs (STN) */ + unsigned short upper_margin; /* value in lines (TFT) or 0 (STN) */ + unsigned short lower_margin; /* value in lines (TFT) or 0 (STN) */ + unsigned short vsync_len; /* value in lines (TFT) or 0 (STN) */ + + /* lcd configuration registers */ + unsigned long lcdcon5; +#define 
S3C2410_LCDCON5_BPP24BL (1<<12) +#define S3C2410_LCDCON5_FRM565 (1<<11) +#define S3C2410_LCDCON5_INVVCLK (1<<10) +#define S3C2410_LCDCON5_INVVLINE (1<<9) +#define S3C2410_LCDCON5_INVVFRAME (1<<8) +#define S3C2410_LCDCON5_INVVD (1<<7) +#define S3C2410_LCDCON5_INVVDEN (1<<6) +#define S3C2410_LCDCON5_INVPWREN (1<<5) +#define S3C2410_LCDCON5_INVLEND (1<<4) +#define S3C2410_LCDCON5_PWREN (1<<3) +#define S3C2410_LCDCON5_ENLEND (1<<2) +#define S3C2410_LCDCON5_BSWP (1<<1) +#define S3C2410_LCDCON5_HWSWP (1<<0) +}; + +struct s3c2410fb_mach_info { + + struct s3c2410fb_display *displays; /* attached displays info */ + unsigned num_displays; /* number of defined displays */ + unsigned default_display; + + /* GPIOs */ + + unsigned long gpcup; + unsigned long gpcup_mask; + unsigned long gpccon; + unsigned long gpccon_mask; + unsigned long gpdup; + unsigned long gpdup_mask; + unsigned long gpdcon; + unsigned long gpdcon_mask; + + void __iomem * gpccon_reg; + void __iomem * gpcup_reg; + void __iomem * gpdcon_reg; + void __iomem * gpdup_reg; + + /* lpc3600 control register */ + unsigned long lpcsel; +}; + +extern void s3c24xx_fb_set_platdata(struct s3c2410fb_mach_info *); + +#endif /* __ASM_PLAT_FB_S3C2410_H */ -- cgit v1.2.3 From 81b11a6a09964cfea4c525d22548790a1d92d38f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 6 Aug 2020 20:20:52 +0200 Subject: ARM: s3c: remove cpufreq header dependencies The cpufreq drivers are split between the machine directory and the drivers/cpufreq directory. In order to share header files after we convert s3c to multiplatform, those headers have to live in a different global location. Move them to linux/soc/samsung/ in lack of a better place. Signed-off-by: Arnd Bergmann Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20200806182059.2431-35-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski --- include/linux/soc/samsung/s3c-cpu-freq.h | 145 +++++++++++++ include/linux/soc/samsung/s3c-cpufreq-core.h | 291 +++++++++++++++++++++++++++ include/linux/soc/samsung/s3c-pm.h | 10 + 3 files changed, 446 insertions(+) create mode 100644 include/linux/soc/samsung/s3c-cpu-freq.h create mode 100644 include/linux/soc/samsung/s3c-cpufreq-core.h (limited to 'include/linux') diff --git a/include/linux/soc/samsung/s3c-cpu-freq.h b/include/linux/soc/samsung/s3c-cpu-freq.h new file mode 100644 index 000000000000..63e88fd5dea2 --- /dev/null +++ b/include/linux/soc/samsung/s3c-cpu-freq.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2006-2007 Simtec Electronics + * http://armlinux.simtec.co.uk/ + * Ben Dooks + * + * S3C CPU frequency scaling support - driver and board + */ +#ifndef __LINUX_SOC_SAMSUNG_S3C_CPU_FREQ_H +#define __LINUX_SOC_SAMSUNG_S3C_CPU_FREQ_H + +#include + +struct s3c_cpufreq_info; +struct s3c_cpufreq_board; +struct s3c_iotimings; + +/** + * struct s3c_freq - frequency information (mainly for core drivers) + * @fclk: The FCLK frequency in Hz. + * @armclk: The ARMCLK frequency in Hz. + * @hclk_tns: HCLK cycle time in 10ths of nano-seconds. + * @hclk: The HCLK frequency in Hz. + * @pclk: The PCLK frequency in Hz. + * + * This contains the frequency information about the current configuration + * mainly for the core drivers to ensure we do not end up passing about + * a large number of parameters. + * + * The @hclk_tns field is a useful cache for the parts of the drivers that + * need to calculate IO timings and suchlike. 
+ */ +struct s3c_freq { + unsigned long fclk; + unsigned long armclk; + unsigned long hclk_tns; /* in 10ths of ns */ + unsigned long hclk; + unsigned long pclk; +}; + +/** + * struct s3c_cpufreq_freqs - s3c cpufreq notification information. + * @freqs: The cpufreq setting information. + * @old: The old clock settings. + * @new: The new clock settings. + * @pll_changing: Set if the PLL is changing. + * + * Wrapper 'struct cpufreq_freqs' so that any drivers receiving the + * notification can use this information that is not provided by just + * having the core frequency alone. + * + * The pll_changing flag is used to indicate if the PLL itself is + * being set during this change. This is important as the clocks + * will temporarily be set to the XTAL clock during this time, so + * drivers may want to close down their output during this time. + * + * Note, this is not being used by any current drivers and therefore + * may be removed in the future. + */ +struct s3c_cpufreq_freqs { + struct cpufreq_freqs freqs; + struct s3c_freq old; + struct s3c_freq new; + + unsigned int pll_changing:1; +}; + +#define to_s3c_cpufreq(_cf) container_of(_cf, struct s3c_cpufreq_freqs, freqs) + +/** + * struct s3c_clkdivs - clock divisor information + * @p_divisor: Divisor from FCLK to PCLK. + * @h_divisor: Divisor from FCLK to HCLK. + * @arm_divisor: Divisor from FCLK to ARMCLK (not all CPUs). + * @dvs: Non-zero if using DVS mode for ARMCLK. + * + * Divisor settings for the core clocks. + */ +struct s3c_clkdivs { + int p_divisor; + int h_divisor; + int arm_divisor; + unsigned char dvs; +}; + +#define PLLVAL(_m, _p, _s) (((_m) << 12) | ((_p) << 4) | (_s)) + +/** + * struct s3c_pllval - PLL value entry. + * @freq: The frequency for this entry in Hz. + * @pll_reg: The PLL register setting for this PLL value. + */ +struct s3c_pllval { + unsigned long freq; + unsigned long pll_reg; +}; + +/** + * struct s3c_cpufreq_board - per-board cpu frequency informatin + * @refresh: The SDRAM refresh period in nanoseconds. + * @auto_io: Set if the IO timing settings should be generated from the + * initialisation time hardware registers. + * @need_io: Set if the board has external IO on any of the chipselect + * lines that will require the hardware timing registers to be + * updated on a clock change. + * @max: The maxium frequency limits for the system. Any field that + * is left at zero will use the CPU's settings. + * + * This contains the board specific settings that affect how the CPU + * drivers chose settings. These include the memory refresh and IO + * timing information. + * + * Registration depends on the driver being used, the ARMCLK only + * implementation does not currently need this but the older style + * driver requires this to be available. + */ +struct s3c_cpufreq_board { + unsigned int refresh; + unsigned int auto_io:1; /* automatically init io timings. */ + unsigned int need_io:1; /* set if needs io timing support. */ + + /* any non-zero field in here is taken as an upper limit. */ + struct s3c_freq max; /* frequency limits */ +}; + +/* Things depending on frequency scaling. 
*/ +#ifdef CONFIG_ARM_S3C_CPUFREQ +#define __init_or_cpufreq +#else +#define __init_or_cpufreq __init +#endif + +/* Board functions */ + +#ifdef CONFIG_ARM_S3C_CPUFREQ +extern int s3c_cpufreq_setboard(struct s3c_cpufreq_board *board); +#else + +static inline int s3c_cpufreq_setboard(struct s3c_cpufreq_board *board) +{ + return 0; +} +#endif /* CONFIG_ARM_S3C_CPUFREQ */ + +#endif diff --git a/include/linux/soc/samsung/s3c-cpufreq-core.h b/include/linux/soc/samsung/s3c-cpufreq-core.h new file mode 100644 index 000000000000..c578b07ccd5d --- /dev/null +++ b/include/linux/soc/samsung/s3c-cpufreq-core.h @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2006-2009 Simtec Electronics + * http://armlinux.simtec.co.uk/ + * Ben Dooks + * + * S3C CPU frequency scaling support - core support + */ +#ifndef __LINUX_SOC_SAMSUNG_S3C_CPUFREQ_CORE_H +#define __LINUX_SOC_SAMSUNG_S3C_CPUFREQ_CORE_H + +#include + +struct seq_file; + +#define MAX_BANKS (8) +#define S3C2412_MAX_IO (8) + +/** + * struct s3c2410_iobank_timing - IO bank timings for S3C2410 style timings + * @bankcon: The cached version of settings in this structure. + * @tacp: + * @tacs: Time from address valid to nCS asserted. + * @tcos: Time from nCS asserted to nOE or nWE asserted. + * @tacc: Time that nOE or nWE is asserted. + * @tcoh: Time nCS is held after nOE or nWE are released. + * @tcah: Time address is held for after + * @nwait_en: Whether nWAIT is enabled for this bank. + * + * This structure represents the IO timings for a S3C2410 style IO bank + * used by the CPU frequency support if it needs to change the settings + * of the IO. + */ +struct s3c2410_iobank_timing { + unsigned long bankcon; + unsigned int tacp; + unsigned int tacs; + unsigned int tcos; + unsigned int tacc; + unsigned int tcoh; /* nCS hold after nOE/nWE */ + unsigned int tcah; /* Address hold after nCS */ + unsigned char nwait_en; /* nWait enabled for bank. */ +}; + +/** + * struct s3c2412_iobank_timing - io timings for PL092 (S3C2412) style IO + * @idcy: The idle cycle time between transactions. + * @wstrd: nCS release to end of read cycle. + * @wstwr: nCS release to end of write cycle. + * @wstoen: nCS assertion to nOE assertion time. + * @wstwen: nCS assertion to nWE assertion time. + * @wstbrd: Burst ready delay. + * @smbidcyr: Register cache for smbidcyr value. + * @smbwstrd: Register cache for smbwstrd value. + * @smbwstwr: Register cache for smbwstwr value. + * @smbwstoen: Register cache for smbwstoen value. + * @smbwstwen: Register cache for smbwstwen value. + * @smbwstbrd: Register cache for smbwstbrd value. + * + * Timing information for a IO bank on an S3C2412 or similar system which + * uses a PL093 block. + */ +struct s3c2412_iobank_timing { + unsigned int idcy; + unsigned int wstrd; + unsigned int wstwr; + unsigned int wstoen; + unsigned int wstwen; + unsigned int wstbrd; + + /* register cache */ + unsigned char smbidcyr; + unsigned char smbwstrd; + unsigned char smbwstwr; + unsigned char smbwstoen; + unsigned char smbwstwen; + unsigned char smbwstbrd; +}; + +union s3c_iobank { + struct s3c2410_iobank_timing *io_2410; + struct s3c2412_iobank_timing *io_2412; +}; + +/** + * struct s3c_iotimings - Chip IO timings holder + * @bank: The timings for each IO bank. + */ +struct s3c_iotimings { + union s3c_iobank bank[MAX_BANKS]; +}; + +/** + * struct s3c_plltab - PLL table information. + * @vals: List of PLL values. + * @size: Size of the PLL table @vals. 
+ */ +struct s3c_plltab { + struct s3c_pllval *vals; + int size; +}; + +/** + * struct s3c_cpufreq_config - current cpu frequency configuration + * @freq: The current settings for the core clocks. + * @max: Maxium settings, derived from core, board and user settings. + * @pll: The PLL table entry for the current PLL settings. + * @divs: The divisor settings for the core clocks. + * @info: The current core driver information. + * @board: The information for the board we are running on. + * @lock_pll: Set if the PLL settings cannot be changed. + * + * This is for the core drivers that need to know information about + * the current settings and values. It should not be needed by any + * device drivers. +*/ +struct s3c_cpufreq_config { + struct s3c_freq freq; + struct s3c_freq max; + struct clk *mpll; + struct cpufreq_frequency_table pll; + struct s3c_clkdivs divs; + struct s3c_cpufreq_info *info; /* for core, not drivers */ + struct s3c_cpufreq_board *board; + + unsigned int lock_pll:1; +}; + +/** + * struct s3c_cpufreq_info - Information for the CPU frequency driver. + * @name: The name of this implementation. + * @max: The maximum frequencies for the system. + * @latency: Transition latency to give to cpufreq. + * @locktime_m: The lock-time in uS for the MPLL. + * @locktime_u: The lock-time in uS for the UPLL. + * @locttime_bits: The number of bits each LOCKTIME field. + * @need_pll: Set if this driver needs to change the PLL values to achieve + * any frequency changes. This is really only need by devices like the + * S3C2410 where there is no or limited divider between the PLL and the + * ARMCLK. + * @get_iotiming: Get the current IO timing data, mainly for use at start. + * @set_iotiming: Update the IO timings from the cached copies calculated + * from the @calc_iotiming entry when changing the frequency. + * @calc_iotiming: Calculate and update the cached copies of the IO timings + * from the newly calculated frequencies. + * @calc_freqtable: Calculate (fill in) the given frequency table from the + * current frequency configuration. If the table passed in is NULL, + * then the return is the number of elements to be filled for allocation + * of the table. + * @set_refresh: Set the memory refresh configuration. + * @set_fvco: Set the PLL frequencies. + * @set_divs: Update the clock divisors. + * @calc_divs: Calculate the clock divisors. 
+ */ +struct s3c_cpufreq_info { + const char *name; + struct s3c_freq max; + + unsigned int latency; + + unsigned int locktime_m; + unsigned int locktime_u; + unsigned char locktime_bits; + + unsigned int need_pll:1; + + /* driver routines */ + + int (*get_iotiming)(struct s3c_cpufreq_config *cfg, + struct s3c_iotimings *timings); + + void (*set_iotiming)(struct s3c_cpufreq_config *cfg, + struct s3c_iotimings *timings); + + int (*calc_iotiming)(struct s3c_cpufreq_config *cfg, + struct s3c_iotimings *timings); + + int (*calc_freqtable)(struct s3c_cpufreq_config *cfg, + struct cpufreq_frequency_table *t, + size_t table_size); + + void (*debug_io_show)(struct seq_file *seq, + struct s3c_cpufreq_config *cfg, + union s3c_iobank *iob); + + void (*set_refresh)(struct s3c_cpufreq_config *cfg); + void (*set_fvco)(struct s3c_cpufreq_config *cfg); + void (*set_divs)(struct s3c_cpufreq_config *cfg); + int (*calc_divs)(struct s3c_cpufreq_config *cfg); +}; + +extern int s3c_cpufreq_register(struct s3c_cpufreq_info *info); + +extern int s3c_plltab_register(struct cpufreq_frequency_table *plls, + unsigned int plls_no); + +/* exports and utilities for debugfs */ +extern struct s3c_cpufreq_config *s3c_cpufreq_getconfig(void); +extern struct s3c_iotimings *s3c_cpufreq_getiotimings(void); + +#ifdef CONFIG_ARM_S3C24XX_CPUFREQ_DEBUGFS +#define s3c_cpufreq_debugfs_call(x) x +#else +#define s3c_cpufreq_debugfs_call(x) NULL +#endif + +/* Useful utility functions. */ + +extern struct clk *s3c_cpufreq_clk_get(struct device *, const char *); + +/* S3C2410 and compatible exported functions */ + +extern void s3c2410_cpufreq_setrefresh(struct s3c_cpufreq_config *cfg); +extern void s3c2410_set_fvco(struct s3c_cpufreq_config *cfg); + +#ifdef CONFIG_S3C2410_IOTIMING +extern void s3c2410_iotiming_debugfs(struct seq_file *seq, + struct s3c_cpufreq_config *cfg, + union s3c_iobank *iob); + +extern int s3c2410_iotiming_calc(struct s3c_cpufreq_config *cfg, + struct s3c_iotimings *iot); + +extern int s3c2410_iotiming_get(struct s3c_cpufreq_config *cfg, + struct s3c_iotimings *timings); + +extern void s3c2410_iotiming_set(struct s3c_cpufreq_config *cfg, + struct s3c_iotimings *iot); +#else +#define s3c2410_iotiming_debugfs NULL +#define s3c2410_iotiming_calc NULL +#define s3c2410_iotiming_get NULL +#define s3c2410_iotiming_set NULL +#endif /* CONFIG_S3C2410_IOTIMING */ + +/* S3C2412 compatible routines */ + +#ifdef CONFIG_S3C2412_IOTIMING +extern void s3c2412_iotiming_debugfs(struct seq_file *seq, + struct s3c_cpufreq_config *cfg, + union s3c_iobank *iob); + +extern int s3c2412_iotiming_get(struct s3c_cpufreq_config *cfg, + struct s3c_iotimings *timings); + +extern int s3c2412_iotiming_calc(struct s3c_cpufreq_config *cfg, + struct s3c_iotimings *iot); + +extern void s3c2412_iotiming_set(struct s3c_cpufreq_config *cfg, + struct s3c_iotimings *iot); +#else +#define s3c2412_iotiming_debugfs NULL +#define s3c2412_iotiming_calc NULL +#define s3c2412_iotiming_get NULL +#define s3c2412_iotiming_set NULL +#endif /* CONFIG_S3C2412_IOTIMING */ + +#ifdef CONFIG_ARM_S3C24XX_CPUFREQ_DEBUG +#define s3c_freq_dbg(x...) printk(KERN_INFO x) +#else +#define s3c_freq_dbg(x...) do { if (0) printk(x); } while (0) +#endif /* CONFIG_ARM_S3C24XX_CPUFREQ_DEBUG */ + +#ifdef CONFIG_ARM_S3C24XX_CPUFREQ_IODEBUG +#define s3c_freq_iodbg(x...) printk(KERN_INFO x) +#else +#define s3c_freq_iodbg(x...) 
do { if (0) printk(x); } while (0) +#endif /* CONFIG_ARM_S3C24XX_CPUFREQ_IODEBUG */ + +static inline int s3c_cpufreq_addfreq(struct cpufreq_frequency_table *table, + int index, size_t table_size, + unsigned int freq) +{ + if (index < 0) + return index; + + if (table) { + if (index >= table_size) + return -ENOMEM; + + s3c_freq_dbg("%s: { %d = %u kHz }\n", + __func__, index, freq); + + table[index].driver_data = index; + table[index].frequency = freq; + } + + return index + 1; +} + +#endif diff --git a/include/linux/soc/samsung/s3c-pm.h b/include/linux/soc/samsung/s3c-pm.h index 730bd1d3d09a..f9164559c99f 100644 --- a/include/linux/soc/samsung/s3c-pm.h +++ b/include/linux/soc/samsung/s3c-pm.h @@ -81,4 +81,14 @@ extern void s3c_pm_check_store(void); #define s3c_pm_check_store() do { } while (0) #endif +/* system device subsystems */ + +extern struct bus_type s3c2410_subsys; +extern struct bus_type s3c2410a_subsys; +extern struct bus_type s3c2412_subsys; +extern struct bus_type s3c2416_subsys; +extern struct bus_type s3c2440_subsys; +extern struct bus_type s3c2442_subsys; +extern struct bus_type s3c2443_subsys; + #endif -- cgit v1.2.3 From 44c01f5ce1c7518886a87d5522528e30e0b4d9f8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 6 Aug 2020 20:20:53 +0200 Subject: cpufreq: s3c2412: use global s3c2412_cpufreq_setrefresh There are two identical copies of the s3c2412_cpufreq_setrefresh function: a static one in the cpufreq driver and a global version in iotiming-s3c2412.c. As the function requires the use of a hardcoded register address from a header that we want to not be visible to drivers, just move the existing global function and add a declaration in one of the cpufreq header files. Signed-off-by: Arnd Bergmann Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20200806182059.2431-36-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski --- include/linux/soc/samsung/s3c-cpufreq-core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soc/samsung/s3c-cpufreq-core.h b/include/linux/soc/samsung/s3c-cpufreq-core.h index c578b07ccd5d..e0c7217a0f53 100644 --- a/include/linux/soc/samsung/s3c-cpufreq-core.h +++ b/include/linux/soc/samsung/s3c-cpufreq-core.h @@ -248,6 +248,7 @@ extern int s3c2412_iotiming_calc(struct s3c_cpufreq_config *cfg, extern void s3c2412_iotiming_set(struct s3c_cpufreq_config *cfg, struct s3c_iotimings *iot); +extern void s3c2412_cpufreq_setrefresh(struct s3c_cpufreq_config *cfg); #else #define s3c2412_iotiming_debugfs NULL #define s3c2412_iotiming_calc NULL -- cgit v1.2.3 From c38758e3d574380ccfa583793be14c1cc8a322ff Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 6 Aug 2020 20:20:54 +0200 Subject: cpufreq: s3c24xx: move low-level clk reg access into platform code Rather than have the cpufreq drivers touch include the common headers to get the constants, add a small indirection. This is still not the proper way that would do this through the common clk API, but it lets us kill off the header file usage. 
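As a rough illustration only (register field manipulation elided), a divisor update in the cpufreq driver can then go through the accessors declared below instead of mapping S3C2410_CLKDIVN itself:

    #include <linux/soc/samsung/s3c-cpu-freq.h>
    #include <linux/soc/samsung/s3c-cpufreq-core.h>

    static void example_set_divs(struct s3c_cpufreq_config *cfg)
    {
            u32 clkdiv = s3c24xx_read_clkdivn();

            /* update the HDIVN/PDIVN fields from cfg->divs here ... */

            s3c24xx_write_clkdivn(clkdiv);
    }
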
Signed-off-by: Arnd Bergmann Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20200806182059.2431-37-krzk@kernel.org [krzk: Rebase and fix -Wold-style-definition] Signed-off-by: Krzysztof Kozlowski --- include/linux/soc/samsung/s3c-cpufreq-core.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/samsung/s3c-cpufreq-core.h b/include/linux/soc/samsung/s3c-cpufreq-core.h index e0c7217a0f53..3b278afb769b 100644 --- a/include/linux/soc/samsung/s3c-cpufreq-core.h +++ b/include/linux/soc/samsung/s3c-cpufreq-core.h @@ -289,4 +289,11 @@ static inline int s3c_cpufreq_addfreq(struct cpufreq_frequency_table *table, return index + 1; } +u32 s3c2440_read_camdivn(void); +void s3c2440_write_camdivn(u32 camdiv); +u32 s3c24xx_read_clkdivn(void); +void s3c24xx_write_clkdivn(u32 clkdiv); +u32 s3c24xx_read_mpllcon(void); +void s3c24xx_write_locktime(u32 locktime); + #endif -- cgit v1.2.3 From ba171d3f0850003216fd1a85190d17b1feddb961 Mon Sep 17 00:00:00 2001 From: Cedric Neveux Date: Mon, 4 Mar 2019 08:54:23 +0100 Subject: driver: tee: Handle NULL pointer indication from client TEE Client introduce a new capability "TEE_GEN_CAP_MEMREF_NULL" to handle the support of the shared memory buffer with a NULL pointer. This capability depends on TEE Capabilities and driver support. Driver and TEE exchange capabilities at driver initialization. Signed-off-by: Michael Whitfield Signed-off-by: Cedric Neveux Reviewed-by: Joakim Bech Tested-by: Joakim Bech (QEMU) Signed-off-by: Jens Wiklander --- include/linux/tee_drv.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index d074302989dd..cdd049a724b1 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -47,6 +47,8 @@ struct tee_shm_pool; * and just return with an error code. It is needed for requests * that arises from TEE based kernel drivers that should be * non-blocking in nature. + * @cap_memref_null: flag indicating if the TEE Client support shared + * memory buffer with a NULL pointer. */ struct tee_context { struct tee_device *teedev; @@ -54,6 +56,7 @@ struct tee_context { struct kref refcount; bool releasing; bool supp_nowait; + bool cap_memref_null; }; struct tee_param_memref { -- cgit v1.2.3 From 6b0a249a301e2af9adda84adbced3a2988248b95 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 11:44:18 -0700 Subject: bpf: Implement link_query for bpf iterators This patch implemented bpf_link callback functions show_fdinfo and fill_link_info to support link_query interface. The general interface for show_fdinfo and fill_link_info will print/fill the target_name. Each targets can register show_fdinfo and fill_link_info callbacks to print/fill more target specific information. For example, the below is a fdinfo result for a bpf task iterator. 
$ cat /proc/1749/fdinfo/7 pos: 0 flags: 02000000 mnt_id: 14 link_type: iter link_id: 11 prog_tag: 990e1f8152f7e54f prog_id: 59 target_name: task Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200821184418.574122-1-yhs@fb.com --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9b7185a6b37..529e9b183eeb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1218,12 +1218,18 @@ typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); +typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, + struct seq_file *seq); +typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info); #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; bpf_iter_attach_target_t attach_target; bpf_iter_detach_target_t detach_target; + bpf_iter_show_fdinfo_t show_fdinfo; + bpf_iter_fill_link_info_t fill_link_info; u32 ctx_arg_info_size; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; -- cgit v1.2.3 From b76f22269028fb252727a696084c70494d80a52c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 11:44:19 -0700 Subject: bpf: Implement link_query callbacks in map element iterators For bpf_map_elem and bpf_sk_local_storage bpf iterators, additional map_id should be shown for fdinfo and userspace query. For example, the following is for a bpf_map_elem iterator. $ cat /proc/1753/fdinfo/9 pos: 0 flags: 02000000 mnt_id: 14 link_type: iter link_id: 34 prog_tag: 104be6d3fe45e6aa prog_id: 173 target_name: bpf_map_elem map_id: 127 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200821184419.574240-1-yhs@fb.com --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 529e9b183eeb..30c144af894a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1256,6 +1256,10 @@ int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); +void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq); +int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); -- cgit v1.2.3 From 7b219da43f94a3b4d5a8aa4cc52b75b34f0301ec Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 21 Aug 2020 11:29:43 +0100 Subject: net: sk_msg: Simplify sk_psock initialization Initializing psock->sk_proto and other saved callbacks is only done in sk_psock_update_proto, after sk_psock_init has returned. The logic for this is difficult to follow, and needlessly complex. Instead, initialize psock->sk_proto whenever we allocate a new psock. 
Additionally, assert the following invariants: * The SK has no ULP: ULP does it's own finagling of sk->sk_prot * sk_user_data is unused: we need it to store sk_psock Protect our access to sk_user_data with sk_callback_lock, which is what other users like reuseport arrays, etc. do. The result is that an sk_psock is always fully initialized, and that psock->sk_proto is always the "original" struct proto. The latter allows us to use psock->sk_proto when initializing IPv6 TCP / UDP callbacks for sockmap. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200821102948.21918-2-lmb@cloudflare.com --- include/linux/skmsg.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 1e9ed840b9fc..3119928fc103 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -340,23 +340,6 @@ static inline void sk_psock_update_proto(struct sock *sk, struct sk_psock *psock, struct proto *ops) { - /* Initialize saved callbacks and original proto only once, since this - * function may be called multiple times for a psock, e.g. when - * psock->progs.msg_parser is updated. - * - * Since we've not installed the new proto, psock is not yet in use and - * we can initialize it without synchronization. - */ - if (!psock->sk_proto) { - struct proto *orig = READ_ONCE(sk->sk_prot); - - psock->saved_unhash = orig->unhash; - psock->saved_close = orig->close; - psock->saved_write_space = sk->sk_write_space; - - psock->sk_proto = orig; - } - /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, ops); } -- cgit v1.2.3 From 13b79d3ffbb8add9e2a6d604db2b49f241b97303 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 21 Aug 2020 11:29:45 +0100 Subject: bpf: sockmap: Call sock_map_update_elem directly Don't go via map->ops to call sock_map_update_elem, since we know what function to call in bpf_map_update_value. Since we currently don't allow calling map_update_elem from BPF context, we can remove ops->map_update_elem and rename the function to sock_map_update_elem_sys. 
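A hedged sketch of the resulting dispatch in bpf_map_update_value(), simplified and with the surrounding map-type cases omitted:

    /* roughly, inside bpf_map_update_value(): */
    if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
        map->map_type == BPF_MAP_TYPE_SOCKHASH)
            return sock_map_update_elem_sys(map, key, value, flags);
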
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200821102948.21918-4-lmb@cloudflare.com --- include/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 30c144af894a..81f38e2fda78 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1648,6 +1648,7 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else @@ -1669,6 +1670,12 @@ static inline int sock_map_prog_detach(const union bpf_attr *attr, { return -EOPNOTSUPP; } + +static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_STREAM_PARSER */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -- cgit v1.2.3 From 2152fbbd47c06c4f50ad265ec1b0c43673bee3e8 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 3 Jul 2020 09:07:29 -0700 Subject: soc: ti: pm33xx: Simplify RTC usage to prepare to drop platform data We must re-enable the RTC module clock enabled in RTC+DDR suspend, and pm33xx has been using platform data callbacks for that. Looks like for retention suspend the RTC module clock must not be re-enabled. To remove the legacy platform data callbacks, and eventually be able to drop the RTC legacy platform data, let's manage the RTC module clock and register range directly in pm33xx. Acked-by: Santosh Shilimkar Signed-off-by: Tony Lindgren --- include/linux/platform_data/pm33xx.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/pm33xx.h b/include/linux/platform_data/pm33xx.h index 644af1d89cfa..7037ba7a53ca 100644 --- a/include/linux/platform_data/pm33xx.h +++ b/include/linux/platform_data/pm33xx.h @@ -54,11 +54,8 @@ struct am33xx_pm_platform_data { void (*begin_suspend)(void); void (*finish_suspend)(void); struct am33xx_pm_sram_addr *(*get_sram_addrs)(void); - void __iomem *(*get_rtc_base_addr)(void); void (*save_context)(void); void (*restore_context)(void); - void (*prepare_rtc_suspend)(void); - void (*prepare_rtc_resume)(void); int (*check_off_mode_enable)(void); }; -- cgit v1.2.3 From 70a217f1976f75a6cfe8223e5669ad7b405daaad Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:14 -0700 Subject: tcp: Use a struct to represent a saved_syn The TCP_SAVE_SYN has both the network header and tcp header. The total length of the saved syn packet is currently stored in the first 4 bytes (u32) of an array and the actual packet data is stored after that. A later patch will add a bpf helper that allows to get the tcp header alone from the saved syn without the network header. It will be more convenient to have a direct offset to a specific header instead of re-parsing it. This requires to separately store the network hdrlen. The total header length (i.e. network + tcp) is still needed for the current usage in getsockopt. 
Although this total length can be obtained by looking into the tcphdr and then get the (th->doff << 2), this patch chooses to directly store the tcp hdrlen in the second four bytes of this newly created "struct saved_syn". By using a new struct, it can give a readable name to each individual header length. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190014.2883694-1-kafai@fb.com --- include/linux/tcp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 14b62d7df942..2088d5a079af 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -406,7 +406,7 @@ struct tcp_sock { * socket. Used to retransmit SYNACKs etc. */ struct request_sock __rcu *fastopen_rsk; - u32 *saved_syn; + struct saved_syn *saved_syn; }; enum tsq_enum { @@ -484,6 +484,11 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) tp->saved_syn = NULL; } +static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) +{ + return saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; +} + struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb); -- cgit v1.2.3 From 7656d68455891f7fc6689f95415fd59e7a1d629b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:33 -0700 Subject: tcp: Add saw_unknown to struct tcp_options_received In a later patch, the bpf prog only wants to be called to handle a header option if that particular header option cannot be handled by the kernel. This unknown option could be written by the peer's bpf-prog. It could also be a new standard option that the running kernel does not support it while a bpf-prog can handle it. This patch adds a "saw_unknown" bit to "struct tcp_options_received" and it uses an existing one byte hole to do that. "saw_unknown" will be set in tcp_parse_options() if it sees an option that the kernel cannot handle. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190033.2884430-1-kafai@fb.com --- include/linux/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2088d5a079af..29d166263ae7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -92,6 +92,8 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ + u8 saw_unknown:1, /* Received unknown option */ + unused:7; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ -- cgit v1.2.3 From c9985d09e18965131958102f4b67fa1e742df335 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:58 -0700 Subject: bpf: sock_ops: Change some members of sock_ops_kern from u32 to u8 A later patch needs to add a few pointers and a few u8 to sock_ops_kern. Hence, this patch saves some spaces by moving some of the existing members from u32 to u8 so that the later patch can still fit everything in a cacheline. 
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200820190058.2885640-1-kafai@fb.com --- include/linux/filter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 0a355b005bf4..c427dfa5f908 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1236,13 +1236,13 @@ struct bpf_sock_addr_kern { struct bpf_sock_ops_kern { struct sock *sk; - u32 op; union { u32 args[4]; u32 reply; u32 replylong[4]; }; - u32 is_fullsock; + u8 op; + u8 is_fullsock; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that -- cgit v1.2.3 From 0813a841566f0962a5551be7749b43c45f0022a0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:01:04 -0700 Subject: bpf: tcp: Allow bpf prog to write and parse TCP header option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Note: The TCP changes here is mainly to implement the bpf pieces into the bpf_skops_*() functions introduced in the earlier patches. ] The earlier effort in BPF-TCP-CC allows the TCP Congestion Control algorithm to be written in BPF. It opens up opportunities to allow a faster turnaround time in testing/releasing new congestion control ideas to production environment. The same flexibility can be extended to writing TCP header option. It is not uncommon that people want to test new TCP header option to improve the TCP performance. Another use case is for data-center that has a more controlled environment and has more flexibility in putting header options for internal only use. For example, we want to test the idea in putting maximum delay ACK in TCP header option which is similar to a draft RFC proposal [1]. This patch introduces the necessary BPF API and use them in the TCP stack to allow BPF_PROG_TYPE_SOCK_OPS program to parse and write TCP header options. It currently supports most of the TCP packet except RST. Supported TCP header option: ─────────────────────────── This patch allows the bpf-prog to write any option kind. Different bpf-progs can write its own option by calling the new helper bpf_store_hdr_opt(). The helper will ensure there is no duplicated option in the header. By allowing bpf-prog to write any option kind, this gives a lot of flexibility to the bpf-prog. Different bpf-prog can write its own option kind. It could also allow the bpf-prog to support a recently standardized option on an older kernel. Sockops Callback Flags: ────────────────────── The bpf program will only be called to parse/write tcp header option if the following newly added callback flags are enabled in tp->bpf_sock_ops_cb_flags: BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG A few words on the PARSE CB flags. When the above PARSE CB flags are turned on, the bpf-prog will be called on packets received at a sk that has at least reached the ESTABLISHED state. The parsing of the SYN-SYNACK-ACK will be discussed in the "3 Way HandShake" section. The default is off for all of the above new CB flags, i.e. the bpf prog will not be called to parse or write bpf hdr option. There are details comment on these new cb flags in the UAPI bpf.h. sock_ops->skb_data and bpf_load_hdr_opt() ───────────────────────────────────────── sock_ops->skb_data and sock_ops->skb_data_end covers the whole TCP header and its options. 
They are read only. The new bpf_load_hdr_opt() helps to read a particular option "kind" from the skb_data. Please refer to the comment in UAPI bpf.h. It has details on what skb_data contains under different sock_ops->op. 3 Way HandShake ─────────────── The bpf-prog can learn if it is sending SYN or SYNACK by reading the sock_ops->skb_tcp_flags. * Passive side When writing SYNACK (i.e. sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB), the received SYN skb will be available to the bpf prog. The bpf prog can use the SYN skb (which may carry the header option sent from the remote bpf prog) to decide what bpf header option should be written to the outgoing SYNACK skb. The SYN packet can be obtained by getsockopt(TCP_BPF_SYN*). More on this later. Also, the bpf prog can learn if it is in syncookie mode (by checking sock_ops->args[0] == BPF_WRITE_HDR_TCP_SYNACK_COOKIE). The bpf prog can store the received SYN pkt by using the existing bpf_setsockopt(TCP_SAVE_SYN). The example in a later patch does it. [ Note that the fullsock here is a listen sk, bpf_sk_storage is not very useful here since the listen sk will be shared by many concurrent connection requests. Extending bpf_sk_storage support to request_sock will add weight to the minisock and it is not necessary better than storing the whole ~100 bytes SYN pkt. ] When the connection is established, the bpf prog will be called in the existing PASSIVE_ESTABLISHED_CB callback. At that time, the bpf prog can get the header option from the saved syn and then apply the needed operation to the newly established socket. The later patch will use the max delay ack specified in the SYN header and set the RTO of this newly established connection as an example. The received ACK (that concludes the 3WHS) will also be available to the bpf prog during PASSIVE_ESTABLISHED_CB through the sock_ops->skb_data. It could be useful in syncookie scenario. More on this later. There is an existing getsockopt "TCP_SAVED_SYN" to return the whole saved syn pkt which includes the IP[46] header and the TCP header. A few "TCP_BPF_SYN*" getsockopt has been added to allow specifying where to start getting from, e.g. starting from TCP header, or from IP[46] header. The new getsockopt(TCP_BPF_SYN*) will also know where it can get the SYN's packet from: - (a) the just received syn (available when the bpf prog is writing SYNACK) and it is the only way to get SYN during syncookie mode. or - (b) the saved syn (available in PASSIVE_ESTABLISHED_CB and also other existing CB). The bpf prog does not need to know where the SYN pkt is coming from. The getsockopt(TCP_BPF_SYN*) will hide this details. Similarly, a flags "BPF_LOAD_HDR_OPT_TCP_SYN" is also added to bpf_load_hdr_opt() to read a particular header option from the SYN packet. * Fastopen Fastopen should work the same as the regular non fastopen case. This is a test in a later patch. * Syncookie For syncookie, the later example patch asks the active side's bpf prog to resend the header options in ACK. The server can use bpf_load_hdr_opt() to look at the options in this received ACK during PASSIVE_ESTABLISHED_CB. * Active side The bpf prog will get a chance to write the bpf header option in the SYN packet during WRITE_HDR_OPT_CB. The received SYNACK pkt will also be available to the bpf prog during the existing ACTIVE_ESTABLISHED_CB callback through the sock_ops->skb_data and bpf_load_hdr_opt(). 
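Putting the pieces above together, a minimal sockops sketch might look as follows. This is an illustration only: the option kind 0xfd is made up, and the BPF_SOCK_OPS_TCP_*_CB and BPF_SOCK_OPS_HDR_OPT_LEN_CB / PARSE_UNKNOWN_HDR_OPT_CB op values plus the bpf_reserve_hdr_opt() helper are assumed from the rest of this series rather than quoted above:

	/* write one experimental option (kind 0xfd) and parse it back */
	struct tcp_exp_opt {
		__u8 kind;
		__u8 len;
		__u8 data;
	} __attribute__((packed));

	SEC("sockops")
	int exp_hdr_opt(struct bpf_sock_ops *skops)
	{
		struct tcp_exp_opt opt = { .kind = 0xfd, .len = 3, .data = 42 };

		switch (skops->op) {
		case BPF_SOCK_OPS_TCP_CONNECT_CB:
		case BPF_SOCK_OPS_TCP_LISTEN_CB:
			/* opt in, otherwise the kernel never calls us for header options */
			bpf_sock_ops_cb_flags_set(skops, skops->bpf_sock_ops_cb_flags |
						  BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG |
						  BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
			break;
		case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
			/* reserve room before the write callback runs */
			bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
			break;
		case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
			bpf_store_hdr_opt(skops, &opt, sizeof(opt), 0);
			break;
		case BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB:
			/* looks up kind 0xfd in the skb's option space */
			bpf_load_hdr_opt(skops, &opt, sizeof(opt), 0);
			break;
		}
		return 1;
	}
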
* Turn off header CB flags after 3WHS If the bpf prog does not need to write/parse header options beyond the 3WHS, the bpf prog can clear the bpf_sock_ops_cb_flags to avoid being called for header options. Or the bpf-prog can select to leave the UNKNOWN_HDR_OPT_CB_FLAG on so that the kernel will only call it when there is option that the kernel cannot handle. [1]: draft-wang-tcpm-low-latency-opt-00 https://tools.ietf.org/html/draft-wang-tcpm-low-latency-opt-00 Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200820190104.2885895-1-kafai@fb.com --- include/linux/bpf-cgroup.h | 25 +++++++++++++++++++++++++ include/linux/filter.h | 4 ++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 64f367044e25..2f98d2fce62e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -279,6 +279,31 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) +/* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a + * fullsock and its parent fullsock cannot be traced by + * sk_to_full_sk(). + * + * e.g. sock_ops->sk is a request_sock and it is under syncookie mode. + * Its listener-sk is not attached to the rsk_listener. + * In this case, the caller holds the listener-sk (unlocked), + * set its sock_ops->sk to req_sk, and call this SOCK_OPS"_SK" with + * the listener-sk such that the cgroup-bpf-progs of the + * listener-sk will be run. + * + * Regardless of syncookie mode or not, + * calling bpf_setsockopt on listener-sk will not make sense anyway, + * so passing 'sock_ops->sk == req_sk' to the bpf prog is appropriate here. + */ +#define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ + sock_ops, \ + BPF_CGROUP_SOCK_OPS); \ + __ret; \ +}) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ diff --git a/include/linux/filter.h b/include/linux/filter.h index c427dfa5f908..995625950cc1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1241,8 +1241,12 @@ struct bpf_sock_ops_kern { u32 reply; u32 replylong[4]; }; + struct sk_buff *syn_skb; + struct sk_buff *skb; + void *skb_data_end; u8 op; u8 is_fullsock; + u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that -- cgit v1.2.3 From 267cf9fa43d1c9d525d5d818a8651f2900e3aa9e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:01:23 -0700 Subject: tcp: bpf: Optionally store mac header in TCP_SAVE_SYN This patch is adapted from Eric's patch in an earlier discussion [1]. The TCP_SAVE_SYN currently only stores the network header and tcp header. This patch allows it to optionally store the mac header also if the setsockopt's optval is 2. It requires one more bit for the "save_syn" bit field in tcp_sock. This patch achieves this by moving the syn_smc bit next to the is_mptcp. The syn_smc is currently used with the TCP experimental option. Since syn_smc is only used when CONFIG_SMC is enabled, this patch also puts the "IS_ENABLED(CONFIG_SMC)" around it like the is_mptcp did with "IS_ENABLED(CONFIG_MPTCP)". 
The mac_hdrlen is also stored in the "struct saved_syn" to allow a quick offset from the bpf prog if it chooses to start getting from the network header or the tcp header. [1]: https://lore.kernel.org/netdev/CANn89iLJNWh6bkH7DNhy_kmcAexuUCccqERqe7z2QsvPhGrYPQ@mail.gmail.com/ Suggested-by: Eric Dumazet Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190123.2886935-1-kafai@fb.com --- include/linux/tcp.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 29d166263ae7..56ff2952edaf 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -239,14 +239,13 @@ struct tcp_sock { repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; - u8 syn_data:1, /* SYN includes data */ + u8 save_syn:2, /* Save headers of SYN packet */ + syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ - syn_smc:1; /* SYN includes SMC */ + is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ @@ -393,6 +392,9 @@ struct tcp_sock { #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif +#if IS_ENABLED(CONFIG_SMC) + bool syn_smc; /* SYN includes SMC */ +#endif #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ @@ -488,7 +490,8 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) { - return saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; + return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, -- cgit v1.2.3 From 583bbf0624dfd8fc45f1049be1d4980be59451ff Mon Sep 17 00:00:00 2001 From: Luke Hsiao Date: Fri, 21 Aug 2020 21:41:04 -0700 Subject: io_uring: allow tcp ancillary data for __sys_recvmsg_sock() For TCP tx zero-copy, the kernel notifies the process of completions by queuing completion notifications on the socket error queue. This patch allows reading these notifications via recvmsg to support TCP tx zero-copy. Ancillary data was originally disallowed due to privilege escalation via io_uring's offloading of sendmsg() onto a kernel thread with kernel credentials (https://crbug.com/project-zero/1975). So, we must ensure that the socket type is one where the ancillary data types that are delivered on recvmsg are plain data (no file descriptors or values that are translated based on the identity of the calling process). This was tested by using io_uring to call recvmsg on the MSG_ERRQUEUE with tx zero-copy enabled. Before this patch, we received -EINVALID from this specific code path. After this patch, we could read tcp tx zero-copy completion notifications from the MSG_ERRQUEUE. Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Arjun Roy Acked-by: Eric Dumazet Reviewed-by: Jann Horn Reviewed-by: Jens Axboe Signed-off-by: Luke Hsiao Signed-off-by: David S. 
Miller --- include/linux/net.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index d48ff1180879..7657c6432a69 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,8 @@ struct net; #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define PROTO_CMSG_DATA_ONLY 0x0001 + #ifndef ARCH_HAS_SOCKET_TYPES /** * enum sock_type - Socket types @@ -135,6 +137,7 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, struct proto_ops { int family; + unsigned int flags; struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, -- cgit v1.2.3 From 755f982bb1ff469a181df3eaf8dd5d769267ab8e Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Sun, 23 Aug 2020 14:19:26 +0300 Subject: qed/qede: make devlink survive recovery The devlink instance lifecycle was tied to the qed_dev object, which caused devlink to be recreated on each recovery. Change this by making the higher-level driver (qede) responsible for the devlink lifetime, so that devlink now survives recoveries. qede now stores the devlink structure pointer as part of its device object; the devlink private data contains a linkage structure, qed_devlink. Signed-off-by: Igor Russkikh Signed-off-by: Alexander Lobakin Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index cd6a5c7e56eb..d8368e1770df 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -21,6 +21,7 @@ #include #include #include +#include enum dcbx_protocol_type { DCBX_PROTOCOL_ISCSI, @@ -779,6 +780,10 @@ enum qed_nvm_flash_cmd { QED_NVM_FLASH_CMD_NVM_MAX, }; +struct qed_devlink { + struct qed_dev *cdev; +}; + struct qed_common_cb_ops { void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc); void (*link_update)(void *dev, struct qed_link_output *link); @@ -1137,6 +1142,10 @@ struct qed_common_ops { * */ int (*set_grc_config)(struct qed_dev *cdev, u32 cfg_id, u32 val); + + struct devlink* (*devlink_register)(struct qed_dev *cdev); + + void (*devlink_unregister)(struct devlink *devlink); }; #define MASK_FIELD(_name, _value) \ -- cgit v1.2.3 From 9524067b9a91dce2a096a0de7727c217495e3d2e Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Sun, 23 Aug 2020 14:19:29 +0300 Subject: qed: health reporter init deinit seq Here we declare health reporter ops (empty for now) and register these in the qed probe and remove callbacks. This way devlink gets attached to all kinds of qed* PCI device entities, whether networking or storage offload. Signed-off-by: Igor Russkikh Signed-off-by: Alexander Lobakin Signed-off-by: Michal Kalderon Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index d8368e1770df..30fe06fe06a0 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -782,6 +782,7 @@ enum qed_nvm_flash_cmd { struct qed_devlink { struct qed_dev *cdev; + struct devlink_health_reporter *fw_reporter; }; struct qed_common_cb_ops { -- cgit v1.2.3 From 4f5a8db27eb926616500f827d9b81dc40595b0ef Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Sun, 23 Aug 2020 14:19:30 +0300 Subject: qed: use devlink logic to report errors Use devlink_health_report to push error indications.
We implement this in qede via callback function to make it possible to reuse the same for other drivers sitting on top of qed in future. Signed-off-by: Igor Russkikh Signed-off-by: Alexander Lobakin Signed-off-by: Michal Kalderon Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 30fe06fe06a0..a75533de9186 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -906,6 +906,9 @@ struct qed_common_ops { int (*dbg_all_data_size) (struct qed_dev *cdev); + int (*report_fatal_error)(struct devlink *devlink, + enum qed_hw_err_type err_type); + /** * @brief can_link_change - can the instance change the link or not * -- cgit v1.2.3 From c5c642c55e2fd43fcf42262fd1e87271d413fb42 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Sun, 23 Aug 2020 14:19:33 +0300 Subject: qed: align adjacent indent Remove extra indent on some of adjacent declarations. Signed-off-by: Igor Russkikh Signed-off-by: Alexander Lobakin Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 69 ++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index a75533de9186..56fa55841d39 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -850,10 +850,9 @@ struct qed_common_ops { struct qed_dev* (*probe)(struct pci_dev *dev, struct qed_probe_params *params); - void (*remove)(struct qed_dev *cdev); + void (*remove)(struct qed_dev *cdev); - int (*set_power_state)(struct qed_dev *cdev, - pci_power_t state); + int (*set_power_state)(struct qed_dev *cdev, pci_power_t state); void (*set_name) (struct qed_dev *cdev, char name[]); @@ -861,50 +860,48 @@ struct qed_common_ops { * PF params required for the call before slowpath_start is * documented within the qed_pf_params structure definition. */ - void (*update_pf_params)(struct qed_dev *cdev, - struct qed_pf_params *params); - int (*slowpath_start)(struct qed_dev *cdev, - struct qed_slowpath_params *params); + void (*update_pf_params)(struct qed_dev *cdev, + struct qed_pf_params *params); - int (*slowpath_stop)(struct qed_dev *cdev); + int (*slowpath_start)(struct qed_dev *cdev, + struct qed_slowpath_params *params); + + int (*slowpath_stop)(struct qed_dev *cdev); /* Requests to use `cnt' interrupts for fastpath. * upon success, returns number of interrupts allocated for fastpath. 
*/ - int (*set_fp_int)(struct qed_dev *cdev, - u16 cnt); + int (*set_fp_int)(struct qed_dev *cdev, u16 cnt); /* Fills `info' with pointers required for utilizing interrupts */ - int (*get_fp_int)(struct qed_dev *cdev, - struct qed_int_info *info); - - u32 (*sb_init)(struct qed_dev *cdev, - struct qed_sb_info *sb_info, - void *sb_virt_addr, - dma_addr_t sb_phy_addr, - u16 sb_id, - enum qed_sb_type type); - - u32 (*sb_release)(struct qed_dev *cdev, - struct qed_sb_info *sb_info, - u16 sb_id, - enum qed_sb_type type); - - void (*simd_handler_config)(struct qed_dev *cdev, - void *token, - int index, - void (*handler)(void *)); - - void (*simd_handler_clean)(struct qed_dev *cdev, - int index); - int (*dbg_grc)(struct qed_dev *cdev, - void *buffer, u32 *num_dumped_bytes); + int (*get_fp_int)(struct qed_dev *cdev, struct qed_int_info *info); + + u32 (*sb_init)(struct qed_dev *cdev, + struct qed_sb_info *sb_info, + void *sb_virt_addr, + dma_addr_t sb_phy_addr, + u16 sb_id, + enum qed_sb_type type); + + u32 (*sb_release)(struct qed_dev *cdev, + struct qed_sb_info *sb_info, + u16 sb_id, + enum qed_sb_type type); + + void (*simd_handler_config)(struct qed_dev *cdev, + void *token, + int index, + void (*handler)(void *)); + + void (*simd_handler_clean)(struct qed_dev *cdev, int index); + + int (*dbg_grc)(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes); int (*dbg_grc_size)(struct qed_dev *cdev); - int (*dbg_all_data) (struct qed_dev *cdev, void *buffer); + int (*dbg_all_data)(struct qed_dev *cdev, void *buffer); - int (*dbg_all_data_size) (struct qed_dev *cdev); + int (*dbg_all_data_size)(struct qed_dev *cdev); int (*report_fatal_error)(struct devlink *devlink, enum qed_hw_err_type err_type); -- cgit v1.2.3 From 000601bb62330f18dc8f5d2d0b82e9aec3e207c4 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 9 Jul 2020 15:05:59 +0200 Subject: rcu: Fix kerneldoc comments in rcupdate.h This commit fixes the kerneldoc comments for rcu_read_unlock_bh(), rcu_read_unlock_sched() and rcu_head_after_call_rcu() so they e.g. get properly linked in the API documentation. Also add parenthesis after function names to match the notation used in other kerneldoc comments in the same file. Signed-off-by: Tobias Klauser Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d15d46db61f7..b47d6b66665e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -709,8 +709,8 @@ static inline void rcu_read_lock_bh(void) "rcu_read_lock_bh() used illegally while idle"); } -/* - * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section +/** + * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ @@ -751,10 +751,10 @@ static inline notrace void rcu_read_lock_sched_notrace(void) __acquire(RCU_SCHED); } -/* - * rcu_read_unlock_sched - marks the end of a RCU-classic critical section +/** + * rcu_read_unlock_sched() - marks the end of a RCU-classic critical section * - * See rcu_read_lock_sched for more information. + * See rcu_read_lock_sched() for more information. */ static inline void rcu_read_unlock_sched(void) { @@ -945,7 +945,7 @@ static inline void rcu_head_init(struct rcu_head *rhp) } /** - * rcu_head_after_call_rcu - Has this rcu_head been passed to call_rcu()? + * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()? 
* @rhp: The rcu_head structure to test. * @f: The function passed to call_rcu() along with @rhp. * -- cgit v1.2.3 From ae2212a7216b674633bdc3bd2e24947a0665efb8 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sun, 12 Jul 2020 18:40:02 +0530 Subject: rculist: Introduce list/hlist_for_each_entry_srcu() macros list/hlist_for_each_entry_rcu() provides an optional cond argument to specify the lock held in the updater side. However for SRCU read side, not providing the cond argument results into false positive as whether srcu_read_lock is held or not is not checked implicitly. Therefore, on read side the lockdep expression srcu_read_lock_held(srcu struct) can solve this issue. However, the function still fails to check the cases where srcu protected list is traversed with rcu_read_lock() instead of srcu_read_lock(). Therefore, to remove the false negative, this patch introduces two new list traversal primitives : list_for_each_entry_srcu() and hlist_for_each_entry_srcu(). Both of the functions have non-optional cond argument as it is required for both read and update side, and simply checks if the cond is true. For regular read side the lockdep expression srcu_read_lock_head() can be passed as the cond argument to list/hlist_for_each_entry_srcu(). Suggested-by: Paolo Bonzini Tested-by: Suraj Upadhyay Tested-by: Naresh Kamboju [ paulmck: Add "true" per kbuild test robot feedback. ] Signed-off-by: Madhuparna Bhowmik Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 7a6fc9956510..f8633d37e358 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -63,9 +63,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(), \ "RCU-list traversed in non-reader section!"); \ }) + +#define __list_check_srcu(cond) \ + ({ \ + RCU_LOCKDEP_WARN(!(cond), \ + "RCU-list traversed without holding the required lock!");\ + }) #else #define __list_check_rcu(dummy, cond, extra...) \ ({ check_arg_count_one(extra); }) + +#define __list_check_srcu(cond) ({ }) #endif /* @@ -385,6 +393,25 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) +/** + * list_for_each_entry_srcu - iterate over rcu list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * @cond: lockdep expression for the lock required to traverse the list. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by srcu_read_lock(). + * The lockdep expression srcu_read_lock_held() can be passed as the + * cond argument from read side. + */ +#define list_for_each_entry_srcu(pos, head, member, cond) \ + for (__list_check_srcu(cond), \ + pos = list_entry_rcu((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) + /** * list_entry_lockless - get the struct for this entry * @ptr: the &struct list_head pointer. 
@@ -683,6 +710,27 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) +/** + * hlist_for_each_entry_srcu - iterate over rcu list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * @cond: lockdep expression for the lock required to traverse the list. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by srcu_read_lock(). + * The lockdep expression srcu_read_lock_held() can be passed as the + * cond argument from read side. + */ +#define hlist_for_each_entry_srcu(pos, head, member, cond) \ + for (__list_check_srcu(cond), \ + pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ + typeof(*(pos)), member); \ + pos; \ + pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ + &(pos)->member)), typeof(*(pos)), member)) + /** * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing) * @pos: the type * to use as a loop cursor. -- cgit v1.2.3 From 7f2a53c231fe5d9522c3b695ab454203904031ac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Aug 2020 10:37:22 -0700 Subject: rcu: Remove unused __rcu_is_watching() function The x86/entry work removed all uses of __rcu_is_watching(), therefore this commit removes it entirely. Cc: Andy Lutomirski Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 1 - include/linux/rcutree.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5cc9637cac16..7c1ecdb356d8 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -103,7 +103,6 @@ static inline void rcu_scheduler_starting(void) { } static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } -static inline bool __rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } static inline bool rcu_gp_might_be_stalled(void) { return false; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d2f4064ebd1d..59eb5cd567d7 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -64,7 +64,6 @@ extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); -bool __rcu_is_watching(void); #ifndef CONFIG_PREEMPTION void rcu_all_qs(void); #endif -- cgit v1.2.3 From aa40c138cc8f36e2f5c721fd1bdb823a1ef1a237 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Aug 2020 09:58:03 -0700 Subject: rcu: Report QS for outermost PREEMPT=n rcu_read_unlock() for strict GPs The CONFIG_PREEMPT=n instance of rcu_read_unlock() is even more aggressive than the CONFIG_PREEMPT=y instance in deferring the reporting of quiescent states to the RCU core. This is just what is wanted in normal use because it reduces overhead, but the resulting delay is not what is wanted for kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y.
This commit therefore adds an rcu_read_unlock_strict() function that checks for exceptional conditions, and reports the newly started quiescent state if it is safe to do so, also doing a spin-delay if requested via rcutree.rcu_unlock_delay. This commit also adds a call to rcu_read_unlock_strict() from the CONFIG_PREEMPT=n instance of __rcu_read_unlock(). [ paulmck: Fixed bug located by kernel test robot ] Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d15d46db61f7..522529a13786 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -55,6 +55,12 @@ void __rcu_read_unlock(void); #else /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TINY_RCU +#define rcu_read_unlock_strict() do { } while (0) +#else +void rcu_read_unlock_strict(void); +#endif + static inline void __rcu_read_lock(void) { preempt_disable(); @@ -63,6 +69,7 @@ static inline void __rcu_read_lock(void) static inline void __rcu_read_unlock(void) { preempt_enable(); + rcu_read_unlock_strict(); } static inline int rcu_preempt_depth(void) -- cgit v1.2.3 From 00cda13e339c9f4956a6f036675df2ca5b5a552e Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 14 Aug 2020 00:34:02 +0300 Subject: power: supply: Support battery temperature device-tree properties The generic battery temperature properties are already supported by the power-supply core. Let's support parsing of the common battery temperature properties from a device-tree. Signed-off-by: Dmitry Osipenko Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 97cc4b85bf61..d0684362a392 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -365,6 +365,12 @@ struct power_supply_battery_info { int constant_charge_voltage_max_uv; /* microVolts */ int factory_internal_resistance_uohm; /* microOhms */ int ocv_temp[POWER_SUPPLY_OCV_TEMP_MAX];/* celsius */ + int temp_ambient_alert_min; /* celsius */ + int temp_ambient_alert_max; /* celsius */ + int temp_alert_min; /* celsius */ + int temp_alert_max; /* celsius */ + int temp_min; /* celsius */ + int temp_max; /* celsius */ struct power_supply_battery_ocv_table *ocv_table[POWER_SUPPLY_OCV_TEMP_MAX]; int ocv_table_size[POWER_SUPPLY_OCV_TEMP_MAX]; struct power_supply_resistance_temp_table *resist_table; -- cgit v1.2.3 From f836a56e84ffc9f1a1cd73f77e10404ca46a4616 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:15 +0200 Subject: bpf: Generalize bpf_sk_storage Refactor the functionality in bpf_sk_storage.c so that concept of storage linked to kernel objects can be extended to other objects like inode, task_struct etc. Each new local storage will still be a separate map and provide its own set of helpers. This allows for future object specific extensions and still share a lot of the underlying implementation. 
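As a rough sketch of what the generalization looks like for the existing socket flavour (the function names and the use of omem_charge() here are illustrative assumptions; only the sk->sk_bpf_storage pointer and the callback roles described below are taken from this series):

	/* each flavour only supplies how to reach the owner's storage pointer ... */
	static struct bpf_local_storage __rcu **sk_storage_ptr(void *owner)
	{
		struct sock *sk = owner;

		return &sk->sk_bpf_storage;
	}

	/* ... and how to charge memory against that owner */
	static int sk_storage_charge(struct bpf_local_storage_map *smap,
				     void *owner, u32 size)
	{
		/* sockets keep their existing optmem-based accounting */
		return omem_charge(owner, size);
	}
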
This includes the changes suggested by Martin in: https://lore.kernel.org/bpf/20200725013047.4006241-1-kafai@fb.com/ adding new map operations to support bpf_local_storage maps: * storages for different kernel objects to optionally have different memory charging strategy (map_local_storage_charge, map_local_storage_uncharge) * Functionality to extract the storage pointer from a pointer to the owning object (map_owner_storage_ptr) Co-developed-by: Martin KaFai Lau Signed-off-by: Martin KaFai Lau Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825182919.1118197-4-kpsingh@chromium.org --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 81f38e2fda78..8c443b93ac11 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -34,6 +34,8 @@ struct btf_type; struct exception_table_entry; struct seq_operations; struct bpf_iter_aux_info; +struct bpf_local_storage; +struct bpf_local_storage_map; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -104,6 +106,12 @@ struct bpf_map_ops { __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); + /* Functions called by bpf_local_storage maps */ + int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; -- cgit v1.2.3 From 450af8d0f6be2e7dd2a528a3fb054bb726bf1747 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:16 +0200 Subject: bpf: Split bpf_local_storage to bpf_sk_storage A purely mechanical change: bpf_sk_storage.c = bpf_sk_storage.c + bpf_local_storage.c bpf_sk_storage.h = bpf_sk_storage.h + bpf_local_storage.h Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200825182919.1118197-5-kpsingh@chromium.org --- include/linux/bpf_local_storage.h | 163 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 include/linux/bpf_local_storage.h (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h new file mode 100644 index 000000000000..b2c9463f36a1 --- /dev/null +++ b/include/linux/bpf_local_storage.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2019 Facebook + * Copyright 2020 Google LLC. + */ + +#ifndef _BPF_LOCAL_STORAGE_H +#define _BPF_LOCAL_STORAGE_H + +#include +#include +#include +#include +#include +#include + +#define BPF_LOCAL_STORAGE_CACHE_SIZE 16 + +struct bpf_local_storage_map_bucket { + struct hlist_head list; + raw_spinlock_t lock; +}; + +/* Thp map is not the primary owner of a bpf_local_storage_elem. + * Instead, the container object (eg. sk->sk_bpf_storage) is. + * + * The map (bpf_local_storage_map) is for two purposes + * 1. Define the size of the "local storage". It is + * the map's value_size. + * + * 2. Maintain a list to keep track of all elems such + * that they can be cleaned up during the map destruction. 
+ * + * When a bpf local storage is being looked up for a + * particular object, the "bpf_map" pointer is actually used + * as the "key" to search in the list of elem in + * the respective bpf_local_storage owned by the object. + * + * e.g. sk->sk_bpf_storage is the mini-map with the "bpf_map" pointer + * as the searching key. + */ +struct bpf_local_storage_map { + struct bpf_map map; + /* Lookup elem does not require accessing the map. + * + * Updating/Deleting requires a bucket lock to + * link/unlink the elem from the map. Having + * multiple buckets to improve contention. + */ + struct bpf_local_storage_map_bucket *buckets; + u32 bucket_log; + u16 elem_size; + u16 cache_idx; +}; + +struct bpf_local_storage_data { + /* smap is used as the searching key when looking up + * from the object's bpf_local_storage. + * + * Put it in the same cacheline as the data to minimize + * the number of cachelines access during the cache hit case. + */ + struct bpf_local_storage_map __rcu *smap; + u8 data[] __aligned(8); +}; + +/* Linked to bpf_local_storage and bpf_local_storage_map */ +struct bpf_local_storage_elem { + struct hlist_node map_node; /* Linked to bpf_local_storage_map */ + struct hlist_node snode; /* Linked to bpf_local_storage */ + struct bpf_local_storage __rcu *local_storage; + struct rcu_head rcu; + /* 8 bytes hole */ + /* The data is stored in aother cacheline to minimize + * the number of cachelines access during a cache hit. + */ + struct bpf_local_storage_data sdata ____cacheline_aligned; +}; + +struct bpf_local_storage { + struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; + struct hlist_head list; /* List of bpf_local_storage_elem */ + void *owner; /* The object that owns the above "list" of + * bpf_local_storage_elem. + */ + struct rcu_head rcu; + raw_spinlock_t lock; /* Protect adding/removing from the "list" */ +}; + +/* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. 
+ */ +#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE \ + min_t(u32, \ + (KMALLOC_MAX_SIZE - MAX_BPF_STACK - \ + sizeof(struct bpf_local_storage_elem)), \ + (U16_MAX - sizeof(struct bpf_local_storage_elem))) + +#define SELEM(_SDATA) \ + container_of((_SDATA), struct bpf_local_storage_elem, sdata) +#define SDATA(_SELEM) (&(_SELEM)->sdata) + +#define BPF_LOCAL_STORAGE_CACHE_SIZE 16 + +struct bpf_local_storage_cache { + spinlock_t idx_lock; + u64 idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE]; +}; + +#define DEFINE_BPF_STORAGE_CACHE(name) \ +static struct bpf_local_storage_cache name = { \ + .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ +} + +u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); +void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx); + +/* Helper functions for bpf_local_storage */ +int bpf_local_storage_map_alloc_check(union bpf_attr *attr); + +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); + +struct bpf_local_storage_data * +bpf_local_storage_lookup(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool cacheit_lockit); + +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); + +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type); + +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); + +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_omem); + +void bpf_selem_unlink(struct bpf_local_storage_elem *selem); + +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem); + +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); + +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, + bool charge_mem); + +int +bpf_local_storage_alloc(void *owner, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *first_selem); + +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags); + +#endif /* _BPF_LOCAL_STORAGE_H */ -- cgit v1.2.3 From 8ea636848aca35b9f97c5b5dee30225cf2dd0fe6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:17 +0200 Subject: bpf: Implement bpf_local_storage for inodes Similar to bpf_local_storage for sockets, add local storage for inodes. The life-cycle of storage is managed with the life-cycle of the inode. i.e. the storage is destroyed along with the owning inode. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob which are now stackable and can co-exist with other LSMs. 
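For illustration, a minimal sketch of how a BPF LSM program might use the new map type; the inode_unlink hook signature and the BPF_LOCAL_STORAGE_GET_F_CREATE flag name are assumptions here, not taken from this excerpt:

	struct {
		__uint(type, BPF_MAP_TYPE_INODE_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, __u64);
	} unlink_count SEC(".maps");

	SEC("lsm/inode_unlink")
	int BPF_PROG(count_unlinks, struct inode *dir, struct dentry *victim)
	{
		__u64 *cnt;

		/* the storage lives and dies with victim->d_inode */
		cnt = bpf_inode_storage_get(&unlink_count, victim->d_inode, 0,
					    BPF_LOCAL_STORAGE_GET_F_CREATE);
		if (cnt)
			__sync_fetch_and_add(cnt, 1);
		return 0;
	}
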
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825182919.1118197-6-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 29 +++++++++++++++++++++++++++++ include/linux/bpf_types.h | 3 +++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index af74712af585..aaacb6aafc87 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -17,9 +17,28 @@ #include #undef LSM_HOOK +struct bpf_storage_blob { + struct bpf_local_storage __rcu *storage; +}; + +extern struct lsm_blob_sizes bpf_lsm_blob_sizes; + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + if (unlikely(!inode->i_security)) + return NULL; + + return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; +} + +extern const struct bpf_func_proto bpf_inode_storage_get_proto; +extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +void bpf_inode_storage_free(struct inode *inode); + #else /* !CONFIG_BPF_LSM */ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, @@ -28,6 +47,16 @@ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EOPNOTSUPP; } +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + return NULL; +} + +static inline void bpf_inode_storage_free(struct inode *inode) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a52a5688418e..2e6f568377f1 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -107,6 +107,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif +#ifdef CONFIG_BPF_LSM +BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) -- cgit v1.2.3 From 6298399bfc101f8e8cf35a916f26aa32bdf04278 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:13 +0200 Subject: bpf: Move btf_resolve_size into __btf_resolve_size Moving btf_resolve_size into __btf_resolve_size and keeping btf_resolve_size public with just first 3 arguments, because the rest of the arguments are not used by outside callers. Following changes are adding more arguments, which are not useful to outside callers. They will be added to the __btf_resolve_size function. 
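Sketched, the split amounts to keeping a thin public wrapper around the internal function; the NULLs stand in for the internal-only arguments, and later patches in the series are expected to add more of them:

	const struct btf_type *
	btf_resolve_size(const struct btf *btf, const struct btf_type *type,
			 u32 *type_size)
	{
		return __btf_resolve_size(btf, type, type_size, NULL, NULL);
	}
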
Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-4-jolsa@kernel.org --- include/linux/btf.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 8b81fbb4497c..a9af5e7a7ece 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -64,8 +64,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, u32 id, u32 *res_id); const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, - u32 *type_size, const struct btf_type **elem_type, - u32 *total_nelems); + u32 *type_size); #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ -- cgit v1.2.3 From faaf4a790d93794b46d67e2fd69b8e5c8cae2d41 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:18 +0200 Subject: bpf: Add btf_struct_ids_match function Adding btf_struct_ids_match function to check if given address provided by BTF object + offset is also address of another nested BTF object. This allows to pass an argument to helper, which is defined via parent BTF object + offset, like for bpf_d_path (added in following changes): SEC("fentry/filp_close") int BPF_PROG(prog_close, struct file *file, void *id) { ... ret = bpf_d_path(&file->f_path, ... The first bpf_d_path argument is hold by verifier as BTF file object plus offset of f_path member. The btf_struct_ids_match function will walk the struct file object and check if there's nested struct path object on the given offset. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-9-jolsa@kernel.org --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8c443b93ac11..540f5e6c3788 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1358,6 +1358,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); +bool btf_struct_ids_match(struct bpf_verifier_log *log, + int off, u32 id, u32 need_type_id); int btf_resolve_helper_id(struct bpf_verifier_log *log, const struct bpf_func_proto *fn, int); -- cgit v1.2.3 From eae2e83e62633a2659e3bc690facba1c2fc9c45b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:19 +0200 Subject: bpf: Add BTF_SET_START/END macros Adding support to define sorted set of BTF ID values. Following defines sorted set of BTF ID values: BTF_SET_START(btf_allowlist_d_path) BTF_ID(func, vfs_truncate) BTF_ID(func, vfs_fallocate) BTF_ID(func, dentry_open) BTF_ID(func, vfs_getattr) BTF_ID(func, filp_close) BTF_SET_END(btf_allowlist_d_path) It defines following 'struct btf_id_set' variable to access values and count: struct btf_id_set btf_allowlist_d_path; Adding 'allowed' callback to struct bpf_func_proto, to allow verifier the check on allowed callers. Adding btf_id_set_contains function, which will be used by allowed callbacks to verify the caller's BTF ID value is within allowed set. Also removing extra '\' in __BTF_ID_LIST macro. Added BTF_SET_START_GLOBAL macro for global sets. 
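A hedged sketch of how a helper would consume such a set through the new 'allowed' callback (the prog->aux->attach_btf_id field used for the lookup is an assumption here, not quoted above):

	static bool bpf_d_path_allowed(const struct bpf_prog *prog)
	{
		return btf_id_set_contains(&btf_allowlist_d_path,
					   prog->aux->attach_btf_id);
	}

	/* hooked up through the new member in the helper's proto */
	const struct bpf_func_proto bpf_d_path_proto = {
		/* ...func, ret_type, arg types... */
		.allowed	= bpf_d_path_allowed,
	};
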
Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-10-jolsa@kernel.org --- include/linux/bpf.h | 4 ++++ include/linux/btf_ids.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 540f5e6c3788..a6131d95e31e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -317,6 +317,7 @@ struct bpf_func_proto { * for this argument. */ int *ret_btf_id; /* return value btf_id */ + bool (*allowed)(const struct bpf_prog *prog); }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -1878,4 +1879,7 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +struct btf_id_set; +bool btf_id_set_contains(struct btf_id_set *set, u32 id); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 4867d549e3c1..210b086188a3 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -3,6 +3,11 @@ #ifndef _LINUX_BTF_IDS_H #define _LINUX_BTF_IDS_H +struct btf_id_set { + u32 cnt; + u32 ids[]; +}; + #ifdef CONFIG_DEBUG_INFO_BTF #include /* for __PASTE */ @@ -62,7 +67,7 @@ asm( \ ".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ "." #scope " " #name "; \n" \ #name ":; \n" \ -".popsection; \n"); \ +".popsection; \n"); #define BTF_ID_LIST(name) \ __BTF_ID_LIST(name, local) \ @@ -88,12 +93,56 @@ asm( \ ".zero 4 \n" \ ".popsection; \n"); +/* + * The BTF_SET_START/END macros pair defines sorted list of + * BTF IDs plus its members count, with following layout: + * + * BTF_SET_START(list) + * BTF_ID(type1, name1) + * BTF_ID(type2, name2) + * BTF_SET_END(list) + * + * __BTF_ID__set__list: + * .zero 4 + * list: + * __BTF_ID__type1__name1__3: + * .zero 4 + * __BTF_ID__type2__name2__4: + * .zero 4 + * + */ +#define __BTF_SET_START(name, scope) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +"." #scope " __BTF_ID__set__" #name "; \n" \ +"__BTF_ID__set__" #name ":; \n" \ +".zero 4 \n" \ +".popsection; \n"); + +#define BTF_SET_START(name) \ +__BTF_ID_LIST(name, local) \ +__BTF_SET_START(name, local) + +#define BTF_SET_START_GLOBAL(name) \ +__BTF_ID_LIST(name, globl) \ +__BTF_SET_START(name, globl) + +#define BTF_SET_END(name) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +".size __BTF_ID__set__" #name ", .-" #name " \n" \ +".popsection; \n"); \ +extern struct btf_id_set name; + #else #define BTF_ID_LIST(name) static u32 name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_SET_START(name) static struct btf_id_set name = { 0 }; +#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; +#define BTF_SET_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ -- cgit v1.2.3 From 24da79902efc4a8443fae09e6c8e25b515bd8db2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Aug 2020 06:50:16 -0700 Subject: inet: remove inet_sk_copy_descendant() This is no longer used, SCTP now uses a private helper. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/ipv6.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index a44789d027cc..bac8f4fffbd6 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -345,17 +345,6 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk) return (struct raw6_sock *)sk; } -static inline void inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from) -{ - int ancestor_size = sizeof(struct inet_sock); - - if (sk_from->sk_family == PF_INET6) - ancestor_size += sizeof(struct ipv6_pinfo); - - __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); -} - #define __ipv6_only_sock(sk) (sk->sk_ipv6only) #define ipv6_only_sock(sk) (__ipv6_only_sock(sk)) #define ipv6_sk_rxinfo(sk) ((sk)->sk_family == PF_INET6 && \ -- cgit v1.2.3 From 5ca937fb5d6870735341d8fdacdd2b49618c35dc Mon Sep 17 00:00:00 2001 From: Subbaraman Narayanamurthy Date: Thu, 13 Aug 2020 11:34:08 -0700 Subject: power: supply: add wireless type Currently, power_supply framework supports only Battery, UPS, Mains and USB power_supply_type. Add wireless power_supply_type so that the drivers which supports wireless can register a power supply class device with POWER_SUPPLY_TYPE_WIRELESS. Signed-off-by: Subbaraman Narayanamurthy Signed-off-by: Guru Das Srinagesh Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index d0684362a392..81a55e974feb 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -186,6 +186,7 @@ enum power_supply_type { POWER_SUPPLY_TYPE_USB_PD, /* Power Delivery Port */ POWER_SUPPLY_TYPE_USB_PD_DRP, /* PD Dual Role Port */ POWER_SUPPLY_TYPE_APPLE_BRICK_ID, /* Apple Charging Method */ + POWER_SUPPLY_TYPE_WIRELESS, /* Wireless */ }; enum power_supply_usb_type { -- cgit v1.2.3 From f468f21b7af0fa472ff8ff70f10b9b4995ef7eb3 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 26 Aug 2020 15:54:16 +0300 Subject: net: Take common prefetch code structure into a function Many device drivers use the same prefetch code structure to deal with small L1 cacheline size. Take this code into a function and call it from the drivers. Suggested-by: Jakub Kicinski Signed-off-by: Tariq Toukan Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/netdevice.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0e303f6603f..b8abe1d7aa0b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2193,6 +2193,22 @@ int netdev_get_num_tc(struct net_device *dev) return dev->num_tc; } +static inline void net_prefetch(void *p) +{ + prefetch(p); +#if L1_CACHE_BYTES < 128 + prefetch((u8 *)p + L1_CACHE_BYTES); +#endif +} + +static inline void net_prefetchw(void *p) +{ + prefetchw(p); +#if L1_CACHE_BYTES < 128 + prefetchw((u8 *)p + L1_CACHE_BYTES); +#endif +} + void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev); int netdev_bind_sb_channel_queue(struct net_device *dev, -- cgit v1.2.3 From 2da45b8f069644604e8e05ccb03b2b66ada611d5 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:51:49 +0200 Subject: mtd: rawnand: Add a kernel doc to the ECC algorithm enumeration Before moving it to the generic raw NAND core, ensure the enumeration is properly described. 
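Stepping back to the net_prefetch()/net_prefetchw() helpers introduced a couple of entries above, the intended caller is a driver receive routine; roughly (hypothetical driver names, shown for illustration only):

/* Fetch the packet headers before touching them; on platforms with
 * L1_CACHE_BYTES < 128 a second cacheline is prefetched as well.
 */
static void example_build_skb(struct example_rx_ring *ring, void *frame)
{
	net_prefetch(frame);
	/* ... allocate the skb and attach/copy the buffer ... */
}
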
Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-2-miquel.raynal@bootlin.com --- include/linux/mtd/rawnand.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index a725b620aca2..1495f22b60cb 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -92,6 +92,13 @@ enum nand_ecc_mode { NAND_ECC_ON_DIE, }; +/** + * enum nand_ecc_algo - NAND ECC algorithm + * @NAND_ECC_UNKNOWN: Unknown algorithm + * @NAND_ECC_HAMMING: Hamming algorithm + * @NAND_ECC_BCH: Bose-Chaudhuri-Hocquenghem algorithm + * @NAND_ECC_RS: Reed-Solomon algorithm + */ enum nand_ecc_algo { NAND_ECC_UNKNOWN, NAND_ECC_HAMMING, -- cgit v1.2.3 From e0a564ae0a4bc1bcf156d468955b27d3606e8253 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:51:50 +0200 Subject: mtd: rawnand: Rename the ECC algorithm enumeration items NAND_ECC_ is not a meaningful prefix, use NAND_ECC_ALGO_ instead. Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-3-miquel.raynal@bootlin.com --- include/linux/mtd/rawnand.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 1495f22b60cb..8174c0c331a1 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -94,16 +94,16 @@ enum nand_ecc_mode { /** * enum nand_ecc_algo - NAND ECC algorithm - * @NAND_ECC_UNKNOWN: Unknown algorithm - * @NAND_ECC_HAMMING: Hamming algorithm - * @NAND_ECC_BCH: Bose-Chaudhuri-Hocquenghem algorithm - * @NAND_ECC_RS: Reed-Solomon algorithm + * @NAND_ECC_ALGO_UNKNOWN: Unknown algorithm + * @NAND_ECC_ALGO_HAMMING: Hamming algorithm + * @NAND_ECC_ALGO_BCH: Bose-Chaudhuri-Hocquenghem algorithm + * @NAND_ECC_ALGO_RS: Reed-Solomon algorithm */ enum nand_ecc_algo { - NAND_ECC_UNKNOWN, - NAND_ECC_HAMMING, - NAND_ECC_BCH, - NAND_ECC_RS, + NAND_ECC_ALGO_UNKNOWN, + NAND_ECC_ALGO_HAMMING, + NAND_ECC_ALGO_BCH, + NAND_ECC_ALGO_RS, }; /* -- cgit v1.2.3 From f2f64c1e924131878179da64794d9cb18ee5c827 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:51:51 +0200 Subject: mtd: rawnand: Move the nand_ecc_algo enum to the generic NAND layer This enumeration is generic and will be reused NAND-wide. 
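After the rename a little earlier, raw NAND controller drivers simply switch to the prefixed constants; for instance (illustrative fragment, assuming the usual struct nand_chip / nand_ecc_ctrl fields):

	chip->ecc.algo = NAND_ECC_ALGO_BCH;	/* was NAND_ECC_BCH */
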
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-4-miquel.raynal@bootlin.com --- include/linux/mtd/nand.h | 14 ++++++++++++++ include/linux/mtd/rawnand.h | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index af99041ceaa9..986c7de83326 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -115,6 +115,20 @@ struct nand_page_io_req { int mode; }; +/** + * enum nand_ecc_algo - NAND ECC algorithm + * @NAND_ECC_ALGO_UNKNOWN: Unknown algorithm + * @NAND_ECC_ALGO_HAMMING: Hamming algorithm + * @NAND_ECC_ALGO_BCH: Bose-Chaudhuri-Hocquenghem algorithm + * @NAND_ECC_ALGO_RS: Reed-Solomon algorithm + */ +enum nand_ecc_algo { + NAND_ECC_ALGO_UNKNOWN, + NAND_ECC_ALGO_HAMMING, + NAND_ECC_ALGO_BCH, + NAND_ECC_ALGO_RS, +}; + /** * struct nand_ecc_props - NAND ECC properties * @strength: ECC strength diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 8174c0c331a1..10bbfbf4ad7f 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -92,20 +92,6 @@ enum nand_ecc_mode { NAND_ECC_ON_DIE, }; -/** - * enum nand_ecc_algo - NAND ECC algorithm - * @NAND_ECC_ALGO_UNKNOWN: Unknown algorithm - * @NAND_ECC_ALGO_HAMMING: Hamming algorithm - * @NAND_ECC_ALGO_BCH: Bose-Chaudhuri-Hocquenghem algorithm - * @NAND_ECC_ALGO_RS: Reed-Solomon algorithm - */ -enum nand_ecc_algo { - NAND_ECC_ALGO_UNKNOWN, - NAND_ECC_ALGO_HAMMING, - NAND_ECC_ALGO_BCH, - NAND_ECC_ALGO_RS, -}; - /* * Constants for Hardware ECC */ -- cgit v1.2.3 From 701981cab01696584a12e5f0e7c2ad931a326059 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:51:52 +0200 Subject: mtd: nand: Add a NAND page I/O request type Use an enum to differentiate the type of I/O (reading or writing a page). Also update the request iterator. Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-5-miquel.raynal@bootlin.com --- include/linux/mtd/nand.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 986c7de83326..e754a6fc8a4b 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -82,8 +82,19 @@ struct nand_pos { unsigned int page; }; +/** + * enum nand_page_io_req_type - Direction of an I/O request + * @NAND_PAGE_READ: from the chip, to the controller + * @NAND_PAGE_WRITE: from the controller, to the chip + */ +enum nand_page_io_req_type { + NAND_PAGE_READ = 0, + NAND_PAGE_WRITE, +}; + /** * struct nand_page_io_req - NAND I/O request object + * @type: the type of page I/O: read or write * @pos: the position this I/O request is targeting * @dataoffs: the offset within the page * @datalen: number of data bytes to read from/write to this page @@ -99,6 +110,7 @@ struct nand_pos { * specific commands/operations. */ struct nand_page_io_req { + enum nand_page_io_req_type type; struct nand_pos pos; unsigned int dataoffs; unsigned int datalen; @@ -638,11 +650,13 @@ static inline void nanddev_pos_next_page(struct nand_device *nand, * layer. 
*/ static inline void nanddev_io_iter_init(struct nand_device *nand, + enum nand_page_io_req_type reqtype, loff_t offs, struct mtd_oob_ops *req, struct nand_io_iter *iter) { struct mtd_info *mtd = nanddev_to_mtd(nand); + iter->req.type = reqtype; iter->req.mode = req->mode; iter->req.dataoffs = nanddev_offs_to_pos(nand, offs, &iter->req.pos); iter->req.ooboffs = req->ooboffs; @@ -712,8 +726,8 @@ static inline bool nanddev_io_iter_end(struct nand_device *nand, * * Should be used for iterate over pages that are contained in an MTD request. */ -#define nanddev_io_for_each_page(nand, start, req, iter) \ - for (nanddev_io_iter_init(nand, start, req, iter); \ +#define nanddev_io_for_each_page(nand, type, start, req, iter) \ + for (nanddev_io_iter_init(nand, type, start, req, iter); \ !nanddev_io_iter_end(nand, iter); \ nanddev_io_iter_next_page(nand, iter)) -- cgit v1.2.3 From 7c4b1ab9f16732fb921b3f11cd127fa65f26ad5c Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 18 Aug 2020 14:52:45 +0300 Subject: IB/mlx5: Add DCT RoCE LAG support When DCT QPs work in RoCE LAG mode: 1. DCT creation is allowed only when it is supported 2. The "port" of a DCT QP is assigned in a round-robin way Link: https://lore.kernel.org/r/20200818115245.700581-3-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index de1ffb4804d6..aee25e4fb2cc 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1430,7 +1430,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_bf_reg_size[0x5]; - u8 reserved_at_270[0x8]; + u8 reserved_at_270[0x6]; + u8 lag_dct[0x2]; u8 lag_tx_port_affinity[0x1]; u8 reserved_at_279[0x2]; u8 lag_master[0x1]; -- cgit v1.2.3 From 1c9c02bb22684f6949d2e7ddc0a3ff364fd5a6fc Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 27 Apr 2020 14:50:37 -0500 Subject: mtd: lpddr: Fix bad logic in print_drs_error Update logic for broken test. Use a more common logging style. It appears the logic in this function is broken for the consecutive tests of if (prog_status & 0x3) ... else if (prog_status & 0x2) ... else (prog_status & 0x1) ... Likely the first test should be if ((prog_status & 0x3) == 0x3) Found by inspection of include files using printk. Fixes: eb3db27507f7 ("[MTD] LPDDR PFOW definition") Cc: stable@vger.kernel.org Reported-by: Joe Perches Signed-off-by: Gustavo A. R. Silva Acked-by: Miquel Raynal Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/3fb0e29f5b601db8be2938a01d974b00c8788501.1588016644.git.gustavo@embeddedor.com --- include/linux/mtd/pfow.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/pfow.h b/include/linux/mtd/pfow.h index 6166e7c60869..b8da6f8e854b 100644 --- a/include/linux/mtd/pfow.h +++ b/include/linux/mtd/pfow.h @@ -128,7 +128,7 @@ static inline void print_drs_error(unsigned dsr) if (!(dsr & DSR_AVAILABLE)) printk(KERN_NOTICE"DSR.15: (0) Device not Available\n"); - if (prog_status & 0x03) + if ((prog_status & 0x03) == 0x03) printk(KERN_NOTICE"DSR.9,8: (11) Attempt to program invalid " "half with 41h command\n"); else if (prog_status & 0x02) -- cgit v1.2.3 From 518693abe6e3f57606ec18892e9135abbc04b361 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Mon, 27 Apr 2020 14:54:13 -0500 Subject: mtd: lpddr: Replace printk with pr_notice pr_notice is preferred over printk. Also, coalesce formats as coalescing is part of coding-style: "never break user-visible strings such as printk messages" Suggested-by: Joe Perches Signed-off-by: Gustavo A. R. Silva Reviewed-by: Miquel Raynal Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/ff48ce07ef208ba65b858f09279a3b36031d64d2.1588016644.git.gustavo@embeddedor.com --- include/linux/mtd/pfow.h | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/pfow.h b/include/linux/mtd/pfow.h index b8da6f8e854b..1c0f6113f230 100644 --- a/include/linux/mtd/pfow.h +++ b/include/linux/mtd/pfow.h @@ -127,31 +127,26 @@ static inline void print_drs_error(unsigned dsr) int prog_status = (dsr & DSR_RPS) >> 8; if (!(dsr & DSR_AVAILABLE)) - printk(KERN_NOTICE"DSR.15: (0) Device not Available\n"); + pr_notice("DSR.15: (0) Device not Available\n"); if ((prog_status & 0x03) == 0x03) - printk(KERN_NOTICE"DSR.9,8: (11) Attempt to program invalid " - "half with 41h command\n"); + pr_notice("DSR.9,8: (11) Attempt to program invalid half with 41h command\n"); else if (prog_status & 0x02) - printk(KERN_NOTICE"DSR.9,8: (10) Object Mode Program attempt " - "in region with Control Mode data\n"); + pr_notice("DSR.9,8: (10) Object Mode Program attempt in region with Control Mode data\n"); else if (prog_status & 0x01) - printk(KERN_NOTICE"DSR.9,8: (01) Program attempt in region " - "with Object Mode data\n"); + pr_notice("DSR.9,8: (01) Program attempt in region with Object Mode data\n"); if (!(dsr & DSR_READY_STATUS)) - printk(KERN_NOTICE"DSR.7: (0) Device is Busy\n"); + pr_notice("DSR.7: (0) Device is Busy\n"); if (dsr & DSR_ESS) - printk(KERN_NOTICE"DSR.6: (1) Erase Suspended\n"); + pr_notice("DSR.6: (1) Erase Suspended\n"); if (dsr & DSR_ERASE_STATUS) - printk(KERN_NOTICE"DSR.5: (1) Erase/Blank check error\n"); + pr_notice("DSR.5: (1) Erase/Blank check error\n"); if (dsr & DSR_PROGRAM_STATUS) - printk(KERN_NOTICE"DSR.4: (1) Program Error\n"); + pr_notice("DSR.4: (1) Program Error\n"); if (dsr & DSR_VPPS) - printk(KERN_NOTICE"DSR.3: (1) Vpp low detect, operation " - "aborted\n"); + pr_notice("DSR.3: (1) Vpp low detect, operation aborted\n"); if (dsr & DSR_PSS) - printk(KERN_NOTICE"DSR.2: (1) Program suspended\n"); + pr_notice("DSR.2: (1) Program suspended\n"); if (dsr & DSR_DPS) - printk(KERN_NOTICE"DSR.1: (1) Aborted Erase/Program attempt " - "on locked block\n"); + pr_notice("DSR.1: (1) Aborted Erase/Program attempt on locked block\n"); } #endif /* __LINUX_MTD_PFOW_H */ -- cgit v1.2.3 From 1a64026eda1642c81425e48550fd4bd3f73d0ab5 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 27 Apr 2020 14:56:08 -0500 Subject: mtd: lpddr: Move function print_drs_error to lpddr_cmds.c Function print_drs_error is only used in drivers/mtd/lpddr/lpddr_cmds.c so, better to move it there. Also, notice that there's no need for inline as the function is used once. Lastly, fix the following checkpatch warning: WARNING: Prefer 'unsigned int' to bare use of 'unsigned' +static void print_drs_error(unsigned dsr) Suggested-by: Joe Perches Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Miquel Raynal Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/e0063cbd65f3b47be1db34efc494ea3047634d88.1588016644.git.gustavo@embeddedor.com --- include/linux/mtd/pfow.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/pfow.h b/include/linux/mtd/pfow.h index 1c0f6113f230..146413d4bdb7 100644 --- a/include/linux/mtd/pfow.h +++ b/include/linux/mtd/pfow.h @@ -121,32 +121,4 @@ static inline void send_pfow_command(struct map_info *map, map_write(map, CMD(LPDDR_START_EXECUTION), map->pfow_base + PFOW_COMMAND_EXECUTE); } - -static inline void print_drs_error(unsigned dsr) -{ - int prog_status = (dsr & DSR_RPS) >> 8; - - if (!(dsr & DSR_AVAILABLE)) - pr_notice("DSR.15: (0) Device not Available\n"); - if ((prog_status & 0x03) == 0x03) - pr_notice("DSR.9,8: (11) Attempt to program invalid half with 41h command\n"); - else if (prog_status & 0x02) - pr_notice("DSR.9,8: (10) Object Mode Program attempt in region with Control Mode data\n"); - else if (prog_status & 0x01) - pr_notice("DSR.9,8: (01) Program attempt in region with Object Mode data\n"); - if (!(dsr & DSR_READY_STATUS)) - pr_notice("DSR.7: (0) Device is Busy\n"); - if (dsr & DSR_ESS) - pr_notice("DSR.6: (1) Erase Suspended\n"); - if (dsr & DSR_ERASE_STATUS) - pr_notice("DSR.5: (1) Erase/Blank check error\n"); - if (dsr & DSR_PROGRAM_STATUS) - pr_notice("DSR.4: (1) Program Error\n"); - if (dsr & DSR_VPPS) - pr_notice("DSR.3: (1) Vpp low detect, operation aborted\n"); - if (dsr & DSR_PSS) - pr_notice("DSR.2: (1) Program suspended\n"); - if (dsr & DSR_DPS) - pr_notice("DSR.1: (1) Aborted Erase/Program attempt on locked block\n"); -} #endif /* __LINUX_MTD_PFOW_H */ -- cgit v1.2.3 From 2fa4e4b799e191530edbae4b96b85d4486e55053 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 27 Aug 2020 04:00:28 +0200 Subject: net: pcs: Move XPCS into new PCS subdirectory Create drivers/net/pcs and move the Synopsys DesignWare XPCS into the new directory. Move the header file into a subdirectory include/linux/pcs Start a naming convention of all PCS files use the prefix pcs-, and rename the XPCS files to fit. v2: Add include/linux/pcs v4: Fix include path in stmmac. Remove PCS_DEVICES to avoid new prompts Cc: Jose Abreu Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio-xpcs.h | 41 ----------------------------------------- include/linux/pcs/pcs-xpcs.h | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 41 deletions(-) delete mode 100644 include/linux/mdio-xpcs.h create mode 100644 include/linux/pcs/pcs-xpcs.h (limited to 'include/linux') diff --git a/include/linux/mdio-xpcs.h b/include/linux/mdio-xpcs.h deleted file mode 100644 index 9a841aa5982d..000000000000 --- a/include/linux/mdio-xpcs.h +++ /dev/null @@ -1,41 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2020 Synopsys, Inc. and/or its affiliates. 
- * Synopsys DesignWare XPCS helpers - */ - -#ifndef __LINUX_MDIO_XPCS_H -#define __LINUX_MDIO_XPCS_H - -#include -#include - -struct mdio_xpcs_args { - __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); - struct mii_bus *bus; - int addr; -}; - -struct mdio_xpcs_ops { - int (*validate)(struct mdio_xpcs_args *xpcs, - unsigned long *supported, - struct phylink_link_state *state); - int (*config)(struct mdio_xpcs_args *xpcs, - const struct phylink_link_state *state); - int (*get_state)(struct mdio_xpcs_args *xpcs, - struct phylink_link_state *state); - int (*link_up)(struct mdio_xpcs_args *xpcs, int speed, - phy_interface_t interface); - int (*probe)(struct mdio_xpcs_args *xpcs, phy_interface_t interface); -}; - -#if IS_ENABLED(CONFIG_MDIO_XPCS) -struct mdio_xpcs_ops *mdio_xpcs_get_ops(void); -#else -static inline struct mdio_xpcs_ops *mdio_xpcs_get_ops(void) -{ - return NULL; -} -#endif - -#endif /* __LINUX_MDIO_XPCS_H */ diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h new file mode 100644 index 000000000000..351c1c9aedc5 --- /dev/null +++ b/include/linux/pcs/pcs-xpcs.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020 Synopsys, Inc. and/or its affiliates. + * Synopsys DesignWare XPCS helpers + */ + +#ifndef __LINUX_PCS_XPCS_H +#define __LINUX_PCS_XPCS_H + +#include +#include + +struct mdio_xpcs_args { + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + struct mii_bus *bus; + int addr; +}; + +struct mdio_xpcs_ops { + int (*validate)(struct mdio_xpcs_args *xpcs, + unsigned long *supported, + struct phylink_link_state *state); + int (*config)(struct mdio_xpcs_args *xpcs, + const struct phylink_link_state *state); + int (*get_state)(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state); + int (*link_up)(struct mdio_xpcs_args *xpcs, int speed, + phy_interface_t interface); + int (*probe)(struct mdio_xpcs_args *xpcs, phy_interface_t interface); +}; + +#if IS_ENABLED(CONFIG_PCS_XPCS) +struct mdio_xpcs_ops *mdio_xpcs_get_ops(void); +#else +static inline struct mdio_xpcs_ops *mdio_xpcs_get_ops(void) +{ + return NULL; +} +#endif + +#endif /* __LINUX_PCS_XPCS_H */ -- cgit v1.2.3 From fcba68bd75bb1d42b3aec7f471d382a9e639a672 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 27 Aug 2020 04:00:29 +0200 Subject: net/phy/mdio-i2c: Move header file to include/linux/mdio In preparation for moving all MDIO drivers into drivers/net/mdio, move the mdio-i2c header file into include/linux/mdio so it can be used by both the MDIO driver and the SFP code which instantiates I2C MDIO busses. v2: Add include/linux/mdio Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/mdio/mdio-i2c.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/linux/mdio/mdio-i2c.h (limited to 'include/linux') diff --git a/include/linux/mdio/mdio-i2c.h b/include/linux/mdio/mdio-i2c.h new file mode 100644 index 000000000000..b1d27f7cd23f --- /dev/null +++ b/include/linux/mdio/mdio-i2c.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * MDIO I2C bridge + * + * Copyright (C) 2015 Russell King + */ +#ifndef MDIO_I2C_H +#define MDIO_I2C_H + +struct device; +struct i2c_adapter; +struct mii_bus; + +struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c); + +#endif -- cgit v1.2.3 From 232e15e1d7ddb191c28248cb681f4544c0ff1c54 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 27 Aug 2020 04:00:30 +0200 Subject: net: xgene: Move shared header file into include/linux This header file is currently included into the ethernet driver via a relative path into the PHY subsystem. This is bad practice, and causes issues for the upcoming move of the MDIO driver. Move the header file into include/linux to clean this up. v2: Move header to include/linux/mdio Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio/mdio-xgene.h | 130 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 include/linux/mdio/mdio-xgene.h (limited to 'include/linux') diff --git a/include/linux/mdio/mdio-xgene.h b/include/linux/mdio/mdio-xgene.h new file mode 100644 index 000000000000..8af93ada8b64 --- /dev/null +++ b/include/linux/mdio/mdio-xgene.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Applied Micro X-Gene SoC MDIO Driver + * + * Copyright (c) 2016, Applied Micro Circuits Corporation + * Author: Iyappan Subramanian + */ + +#ifndef __MDIO_XGENE_H__ +#define __MDIO_XGENE_H__ + +#define BLOCK_XG_MDIO_CSR_OFFSET 0x5000 +#define BLOCK_DIAG_CSR_OFFSET 0xd000 +#define XGENET_CONFIG_REG_ADDR 0x20 + +#define MAC_ADDR_REG_OFFSET 0x00 +#define MAC_COMMAND_REG_OFFSET 0x04 +#define MAC_WRITE_REG_OFFSET 0x08 +#define MAC_READ_REG_OFFSET 0x0c +#define MAC_COMMAND_DONE_REG_OFFSET 0x10 + +#define CLKEN_OFFSET 0x08 +#define SRST_OFFSET 0x00 + +#define MENET_CFG_MEM_RAM_SHUTDOWN_ADDR 0x70 +#define MENET_BLOCK_MEM_RDY_ADDR 0x74 + +#define MAC_CONFIG_1_ADDR 0x00 +#define MII_MGMT_COMMAND_ADDR 0x24 +#define MII_MGMT_ADDRESS_ADDR 0x28 +#define MII_MGMT_CONTROL_ADDR 0x2c +#define MII_MGMT_STATUS_ADDR 0x30 +#define MII_MGMT_INDICATORS_ADDR 0x34 +#define SOFT_RESET BIT(31) + +#define MII_MGMT_CONFIG_ADDR 0x20 +#define MII_MGMT_COMMAND_ADDR 0x24 +#define MII_MGMT_ADDRESS_ADDR 0x28 +#define MII_MGMT_CONTROL_ADDR 0x2c +#define MII_MGMT_STATUS_ADDR 0x30 +#define MII_MGMT_INDICATORS_ADDR 0x34 + +#define MIIM_COMMAND_ADDR 0x20 +#define MIIM_FIELD_ADDR 0x24 +#define MIIM_CONFIGURATION_ADDR 0x28 +#define MIIM_LINKFAILVECTOR_ADDR 0x2c +#define MIIM_INDICATOR_ADDR 0x30 +#define MIIMRD_FIELD_ADDR 0x34 + +#define MDIO_CSR_OFFSET 0x5000 + +#define REG_ADDR_POS 0 +#define REG_ADDR_LEN 5 +#define PHY_ADDR_POS 8 +#define PHY_ADDR_LEN 5 + +#define HSTMIIMWRDAT_POS 0 +#define HSTMIIMWRDAT_LEN 16 +#define HSTPHYADX_POS 23 +#define HSTPHYADX_LEN 5 +#define HSTREGADX_POS 18 +#define HSTREGADX_LEN 5 +#define HSTLDCMD BIT(3) +#define HSTMIIMCMD_POS 0 +#define HSTMIIMCMD_LEN 3 + +#define BUSY_MASK BIT(0) +#define READ_CYCLE_MASK BIT(0) + +enum xgene_enet_cmd { + XGENE_ENET_WR_CMD = BIT(31), + XGENE_ENET_RD_CMD = BIT(30) +}; + +enum { + MIIM_CMD_IDLE, + 
MIIM_CMD_LEGACY_WRITE, + MIIM_CMD_LEGACY_READ, +}; + +enum xgene_mdio_id { + XGENE_MDIO_RGMII = 1, + XGENE_MDIO_XFI +}; + +struct xgene_mdio_pdata { + struct clk *clk; + struct device *dev; + void __iomem *mac_csr_addr; + void __iomem *diag_csr_addr; + void __iomem *mdio_csr_addr; + struct mii_bus *mdio_bus; + int mdio_id; + spinlock_t mac_lock; /* mac lock */ +}; + +/* Set the specified value into a bit-field defined by its starting position + * and length within a single u64. + */ +static inline u64 xgene_enet_set_field_value(int pos, int len, u64 val) +{ + return (val & ((1ULL << len) - 1)) << pos; +} + +#define SET_VAL(field, val) \ + xgene_enet_set_field_value(field ## _POS, field ## _LEN, val) + +#define SET_BIT(field) \ + xgene_enet_set_field_value(field ## _POS, 1, 1) + +/* Get the value from a bit-field defined by its starting position + * and length within the specified u64. + */ +static inline u64 xgene_enet_get_field_value(int pos, int len, u64 src) +{ + return (src >> pos) & ((1ULL << len) - 1); +} + +#define GET_VAL(field, src) \ + xgene_enet_get_field_value(field ## _POS, field ## _LEN, src) + +#define GET_BIT(field, src) \ + xgene_enet_get_field_value(field ## _POS, 1, src) + +u32 xgene_mdio_rd_mac(struct xgene_mdio_pdata *pdata, u32 rd_addr); +void xgene_mdio_wr_mac(struct xgene_mdio_pdata *pdata, u32 wr_addr, u32 data); +int xgene_mdio_rgmii_read(struct mii_bus *bus, int phy_id, int reg); +int xgene_mdio_rgmii_write(struct mii_bus *bus, int phy_id, int reg, u16 data); +struct phy_device *xgene_enet_phy_register(struct mii_bus *bus, int phy_addr); + +#endif /* __MDIO_XGENE_H__ */ -- cgit v1.2.3 From 17529bcf0ae20f1ac6d7846762bf0c6ad91dbb7f Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 27 Aug 2020 10:48:28 +0200 Subject: power: supply: gpio-charger: Convert to GPIO descriptors This converts the GPIO charger to use exclusively GPIO descriptors, moving the two remaining platforms passing global GPIO numbers over to using a GPIO descriptor table. Signed-off-by: Linus Walleij Cc: Robert Jarzmik Cc: Dmitry Eremin-Solenikov Signed-off-by: Sebastian Reichel --- include/linux/power/gpio-charger.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power/gpio-charger.h b/include/linux/power/gpio-charger.h index 5a5a8de98181..c0b7657ac1df 100644 --- a/include/linux/power/gpio-charger.h +++ b/include/linux/power/gpio-charger.h @@ -13,18 +13,12 @@ * struct gpio_charger_platform_data - platform_data for gpio_charger devices * @name: Name for the chargers power_supply device * @type: Type of the charger - * @gpio: GPIO which is used to indicate the chargers status - * @gpio_active_low: Should be set to 1 if the GPIO is active low otherwise 0 * @supplied_to: Array of battery names to which this chargers supplies power * @num_supplicants: Number of entries in the supplied_to array */ struct gpio_charger_platform_data { const char *name; enum power_supply_type type; - - int gpio; - int gpio_active_low; - char **supplied_to; size_t num_supplicants; }; -- cgit v1.2.3 From dab741e0e02bd3c4f5e2e97be74b39df2523fc6e Mon Sep 17 00:00:00 2001 From: Mattias Nissler Date: Thu, 27 Aug 2020 11:09:46 -0600 Subject: Add a "nosymfollow" mount option. For mounts that have the new "nosymfollow" option, don't follow symlinks when resolving paths. The new option is similar in spirit to the existing "nodev", "noexec", and "nosuid" options, as well as to the LOOKUP_NO_SYMLINKS resolve flag in the openat2(2) syscall. 
Various BSD variants have been supporting the "nosymfollow" mount option for a long time with equivalent implementations. Note that symlinks may still be created on file systems mounted with the "nosymfollow" option present. readlink() remains functional, so user space code that is aware of symlinks can still choose to follow them explicitly. Setting the "nosymfollow" mount option helps prevent privileged writers from modifying files unintentionally in case there is an unexpected link along the accessed path. The "nosymfollow" option is thus useful as a defensive measure for systems that need to deal with untrusted file systems in privileged contexts. More information on the history and motivation for this patch can be found here: https://sites.google.com/a/chromium.org/dev/chromium-os/chromiumos-design-docs/hardening-against-malicious-stateful-data#TOC-Restricting-symlink-traversal Signed-off-by: Mattias Nissler Signed-off-by: Ross Zwisler Reviewed-by: Aleksa Sarai Signed-off-by: Al Viro --- include/linux/mount.h | 3 ++- include/linux/statfs.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index de657bd211fa..aaf343b38671 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -30,6 +30,7 @@ struct fs_context; #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 #define MNT_READONLY 0x40 /* does the user want this to be r/o? */ +#define MNT_NOSYMFOLLOW 0x80 #define MNT_SHRINKABLE 0x100 #define MNT_WRITE_HOLD 0x200 @@ -46,7 +47,7 @@ struct fs_context; #define MNT_SHARED_MASK (MNT_UNBINDABLE) #define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ - | MNT_READONLY) + | MNT_READONLY | MNT_NOSYMFOLLOW) #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ diff --git a/include/linux/statfs.h b/include/linux/statfs.h index 9bc69edb8f18..fac4356ea1bf 100644 --- a/include/linux/statfs.h +++ b/include/linux/statfs.h @@ -40,6 +40,7 @@ struct kstatfs { #define ST_NOATIME 0x0400 /* do not update access times */ #define ST_NODIRATIME 0x0800 /* do not update directory access times */ #define ST_RELATIME 0x1000 /* update atime relative to mtime/ctime */ +#define ST_NOSYMFOLLOW 0x2000 /* do not follow symlinks */ struct dentry; extern int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid); -- cgit v1.2.3 From b6f3e21b928a8ae7959a0d79203b80bd70120768 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Wed, 26 Aug 2020 16:41:58 +0200 Subject: power: supply: smb347-charger: Drop pdata support There are no platforms using the pdata support, so let's drop it to simplify the driver. Reviewed-by: Dmitry Osipenko Tested-by: Dmitry Osipenko Signed-off-by: Sebastian Reichel --- include/linux/power/smb347-charger.h | 114 ----------------------------------- 1 file changed, 114 deletions(-) delete mode 100644 include/linux/power/smb347-charger.h (limited to 'include/linux') diff --git a/include/linux/power/smb347-charger.h b/include/linux/power/smb347-charger.h deleted file mode 100644 index e0b687a4d20c..000000000000 --- a/include/linux/power/smb347-charger.h +++ /dev/null @@ -1,114 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Summit Microelectronics SMB347 Battery Charger Driver - * - * Copyright (C) 2011, Intel Corporation - * - * Authors: Bruce E. 
Robertson - * Mika Westerberg - */ - -#ifndef SMB347_CHARGER_H -#define SMB347_CHARGER_H - -#include -#include - -enum { - /* use the default compensation method */ - SMB347_SOFT_TEMP_COMPENSATE_DEFAULT = -1, - - SMB347_SOFT_TEMP_COMPENSATE_NONE, - SMB347_SOFT_TEMP_COMPENSATE_CURRENT, - SMB347_SOFT_TEMP_COMPENSATE_VOLTAGE, -}; - -/* Use default factory programmed value for hard/soft temperature limit */ -#define SMB347_TEMP_USE_DEFAULT -273 - -/* - * Charging enable can be controlled by software (via i2c) by - * smb347-charger driver or by EN pin (active low/high). - */ -enum smb347_chg_enable { - SMB347_CHG_ENABLE_SW, - SMB347_CHG_ENABLE_PIN_ACTIVE_LOW, - SMB347_CHG_ENABLE_PIN_ACTIVE_HIGH, -}; - -/** - * struct smb347_charger_platform_data - platform data for SMB347 charger - * @battery_info: Information about the battery - * @max_charge_current: maximum current (in uA) the battery can be charged - * @max_charge_voltage: maximum voltage (in uV) the battery can be charged - * @pre_charge_current: current (in uA) to use in pre-charging phase - * @termination_current: current (in uA) used to determine when the - * charging cycle terminates - * @pre_to_fast_voltage: voltage (in uV) treshold used for transitioning to - * pre-charge to fast charge mode - * @mains_current_limit: maximum input current drawn from AC/DC input (in uA) - * @usb_hc_current_limit: maximum input high current (in uA) drawn from USB - * input - * @chip_temp_threshold: die temperature where device starts limiting charge - * current [%100 - %130] (in degree C) - * @soft_cold_temp_limit: soft cold temperature limit [%0 - %15] (in degree C), - * granularity is 5 deg C. - * @soft_hot_temp_limit: soft hot temperature limit [%40 - %55] (in degree C), - * granularity is 5 deg C. - * @hard_cold_temp_limit: hard cold temperature limit [%-5 - %10] (in degree C), - * granularity is 5 deg C. - * @hard_hot_temp_limit: hard hot temperature limit [%50 - %65] (in degree C), - * granularity is 5 deg C. - * @suspend_on_hard_temp_limit: suspend charging when hard limit is hit - * @soft_temp_limit_compensation: compensation method when soft temperature - * limit is hit - * @charge_current_compensation: current (in uA) for charging compensation - * current when temperature hits soft limits - * @use_mains: AC/DC input can be used - * @use_usb: USB input can be used - * @use_usb_otg: USB OTG output can be used (not implemented yet) - * @irq_gpio: GPIO number used for interrupts (%-1 if not used) - * @enable_control: how charging enable/disable is controlled - * (driver/pin controls) - * - * @use_main, @use_usb, and @use_usb_otg are means to enable/disable - * hardware support for these. This is useful when we want to have for - * example OTG charging controlled via OTG transceiver driver and not by - * the SMB347 hardware. - * - * Hard and soft temperature limit values are given as described in the - * device data sheet and assuming NTC beta value is %3750. Even if this is - * not the case, these values should be used. They can be mapped to the - * corresponding NTC beta values with the help of table %2 in the data - * sheet. So for example if NTC beta is %3375 and we want to program hard - * hot limit to be %53 deg C, @hard_hot_temp_limit should be set to %50. - * - * If zero value is given in any of the current and voltage values, the - * factory programmed default will be used. For soft/hard temperature - * values, pass in %SMB347_TEMP_USE_DEFAULT instead. 
- */ -struct smb347_charger_platform_data { - struct power_supply_info battery_info; - unsigned int max_charge_current; - unsigned int max_charge_voltage; - unsigned int pre_charge_current; - unsigned int termination_current; - unsigned int pre_to_fast_voltage; - unsigned int mains_current_limit; - unsigned int usb_hc_current_limit; - unsigned int chip_temp_threshold; - int soft_cold_temp_limit; - int soft_hot_temp_limit; - int hard_cold_temp_limit; - int hard_hot_temp_limit; - bool suspend_on_hard_temp_limit; - unsigned int soft_temp_limit_compensation; - unsigned int charge_current_compensation; - bool use_mains; - bool use_usb; - bool use_usb_otg; - int irq_gpio; - enum smb347_chg_enable enable_control; -}; - -#endif /* SMB347_CHARGER_H */ -- cgit v1.2.3 From f4d05259213ff1e91f767c91dcab455f68308fac Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 27 Aug 2020 18:18:06 -0700 Subject: bpf: Add map_meta_equal map ops Some properties of the inner map is used in the verification time. When an inner map is inserted to an outer map at runtime, bpf_map_meta_equal() is currently used to ensure those properties of the inserting inner map stays the same as the verification time. In particular, the current bpf_map_meta_equal() checks max_entries which turns out to be too restrictive for most of the maps which do not use max_entries during the verification time. It limits the use case that wants to replace a smaller inner map with a larger inner map. There are some maps do use max_entries during verification though. For example, the map_gen_lookup in array_map_ops uses the max_entries to generate the inline lookup code. To accommodate differences between maps, the map_meta_equal is added to bpf_map_ops. Each map-type can decide what to check when its map is used as an inner map during runtime. Also, some map types cannot be used as an inner map and they are currently black listed in bpf_map_meta_alloc() in map_in_map.c. It is not unusual that the new map types may not aware that such blacklist exists. This patch enforces an explicit opt-in and only allows a map to be used as an inner map if it has implemented the map_meta_equal ops. It is based on the discussion in [1]. All maps that support inner map has its map_meta_equal points to bpf_map_meta_equal in this patch. A later patch will relax the max_entries check for most maps. bpf_types.h counts 28 map types. This patch adds 23 ".map_meta_equal" by using coccinelle. -5 for BPF_MAP_TYPE_PROG_ARRAY BPF_MAP_TYPE_(PERCPU)_CGROUP_STORAGE BPF_MAP_TYPE_STRUCT_OPS BPF_MAP_TYPE_ARRAY_OF_MAPS BPF_MAP_TYPE_HASH_OF_MAPS The "if (inner_map->inner_map_meta)" check in bpf_map_meta_alloc() is moved such that the same error is returned. [1]: https://lore.kernel.org/bpf/20200522022342.899756-1-kafai@fb.com/ Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200828011806.1970400-1-kafai@fb.com --- include/linux/bpf.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a6131d95e31e..dbba82a80087 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -112,6 +112,19 @@ struct bpf_map_ops { void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, void *owner, u32 size); struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); + + /* map_meta_equal must be implemented for maps that can be + * used as an inner map. 
It is a runtime check to ensure + * an inner map can be inserted to an outer map. + * + * Some properties of the inner map has been used during the + * verification time. When inserting an inner map at the runtime, + * map_meta_equal has to ensure the inserting map has the same + * properties that the verifier has used earlier. + */ + bool (*map_meta_equal)(const struct bpf_map *meta0, + const struct bpf_map *meta1); + /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; @@ -235,6 +248,9 @@ int map_check_no_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ -- cgit v1.2.3 From 316cdaa1158af17250397054f92bb339fbd8e282 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Wed, 26 Aug 2020 09:05:35 -0700 Subject: net: add option to not create fall-back tunnels in root-ns as well The sysctl that was added earlier by commit 79134e6ce2c ("net: do not create fallback tunnels for non-default namespaces") to create fall-back only in root-ns. This patch enhances that behavior to provide option not to create fallback tunnels in root-ns as well. Since modules that create fallback tunnels could be built-in and setting the sysctl value after booting is pointless, so added a kernel cmdline options to change this default. The default setting is preseved for backward compatibility. The kernel command line option of fb_tunnels=initns will set the sysctl value to 1 and will create fallback tunnels only in initns while kernel cmdline fb_tunnels=none will set the sysctl value to 2 and fallback tunnels are skipped in every netns. Signed-off-by: Mahesh Bandewar Cc: Eric Dumazet Cc: Maciej Zenczykowski Cc: Jian Yang Cc: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b8abe1d7aa0b..c0b512e6a02b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -640,10 +640,14 @@ struct netdev_queue { extern int sysctl_fb_tunnels_only_for_init_net; extern int sysctl_devconf_inherit_init_net; +/* + * sysctl_fb_tunnels_only_for_init_net == 0 : For all netns + * == 1 : For initns only + * == 2 : For none. + */ static inline bool net_has_fallback_tunnels(const struct net *net) { - return net == &init_net || - !IS_ENABLED(CONFIG_SYSCTL) || + return (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1) || !sysctl_fb_tunnels_only_for_init_net; } -- cgit v1.2.3 From 9584051f3cf3af181e577960956bb7c085879b67 Mon Sep 17 00:00:00 2001 From: Jonghwa Lee Date: Thu, 14 May 2020 16:04:27 -0700 Subject: power: supply: charger-manager: Remove cm_notify_event function cm_notify_event() was introduced to get an event associated with the battery status externally (ie in board files), but no one ever used it. Moreover it makes charger manager driver more complicated. Drop the function and all data related to it to simplify the driver. 
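To illustrate the netdevice.h helper changed in the previous entry, a tunnel driver's per-netns init would typically gate its fallback device on it; something like (hypothetical function names, for illustration only):

static int __net_init example_tunnel_init_net(struct net *net)
{
	/* Honour fb_tunnels=initns / fb_tunnels=none */
	if (!net_has_fallback_tunnels(net))
		return 0;

	/* ... register the fallback tunnel netdevice ... */
	return 0;
}
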
Signed-off-by: Jonghwa Lee Signed-off-by: Krzysztof Kozlowski Signed-off-by: Jonathan Bakker Signed-off-by: Sebastian Reichel --- include/linux/power/charger-manager.h | 33 ++++++--------------------------- 1 file changed, 6 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power/charger-manager.h b/include/linux/power/charger-manager.h index ae94dcebd936..3a98837684e3 100644 --- a/include/linux/power/charger-manager.h +++ b/include/linux/power/charger-manager.h @@ -31,16 +31,10 @@ enum polling_modes { CM_POLL_CHARGING_ONLY, }; -enum cm_event_types { - CM_EVENT_UNKNOWN = 0, - CM_EVENT_BATT_FULL, - CM_EVENT_BATT_IN, - CM_EVENT_BATT_OUT, - CM_EVENT_BATT_OVERHEAT, - CM_EVENT_BATT_COLD, - CM_EVENT_EXT_PWR_IN_OUT, - CM_EVENT_CHG_START_STOP, - CM_EVENT_OTHERS, +enum cm_batt_temp { + CM_BATT_OK = 0, + CM_BATT_OVERHEAT, + CM_BATT_COLD, }; /** @@ -131,11 +125,10 @@ struct charger_regulator { * @psy_name: the name of power-supply-class for charger manager * @polling_mode: * Determine which polling mode will be used - * @fullbatt_vchkdrop_ms: * @fullbatt_vchkdrop_uV: * Check voltage drop after the battery is fully charged. - * If it has dropped more than fullbatt_vchkdrop_uV after - * fullbatt_vchkdrop_ms, CM will restart charging. + * If it has dropped more than fullbatt_vchkdrop_uV + * CM will restart charging. * @fullbatt_uV: voltage in microvolt * If VBATT >= fullbatt_uV, it is assumed to be full. * @fullbatt_soc: state of Charge in % @@ -172,7 +165,6 @@ struct charger_desc { enum polling_modes polling_mode; unsigned int polling_interval_ms; - unsigned int fullbatt_vchkdrop_ms; unsigned int fullbatt_vchkdrop_uV; unsigned int fullbatt_uV; unsigned int fullbatt_soc; @@ -211,9 +203,6 @@ struct charger_desc { * @charger_stat: array of power_supply for chargers * @tzd_batt : thermal zone device for battery * @charger_enabled: the state of charger - * @fullbatt_vchk_jiffies_at: - * jiffies at the time full battery check will occur. - * @fullbatt_vchk_work: work queue for full battery check * @emergency_stop: * When setting true, stop charging * @psy_name_buf: the name of power-supply-class for charger manager @@ -235,9 +224,6 @@ struct charger_manager { #endif bool charger_enabled; - unsigned long fullbatt_vchk_jiffies_at; - struct delayed_work fullbatt_vchk_work; - int emergency_stop; char psy_name_buf[PSY_NAME_MAX + 1]; @@ -248,11 +234,4 @@ struct charger_manager { u64 charging_end_time; }; -#if IS_ENABLED(CONFIG_CHARGER_MANAGER) -extern void cm_notify_event(struct power_supply *psy, - enum cm_event_types type, char *msg); -#else -static inline void cm_notify_event(struct power_supply *psy, - enum cm_event_types type, char *msg) { } -#endif #endif /* _CHARGER_MANAGER_H */ -- cgit v1.2.3 From e132fc6bb89bd307cfcdb8ba24afcd1985261485 Mon Sep 17 00:00:00 2001 From: Jonghwa Lee Date: Thu, 14 May 2020 16:04:31 -0700 Subject: power: supply: charger-manager: Make decisions focussed on battery status cm_monitor(), where charging management starts, checks various charging condition sequentially to decide next charging operation. However, as it follows sequential process, cascaded if statements, it does some jobs which have already done in the previous stage. This results in a delay in decision making. Moreover, starting point of charging is spread all around which makes maintain code and debugging difficult. Both of the problems mentioned above become clean if it manages battery charging focusing on battery status not following sequential condition checking. 
Now, cm_monitor() moves battery state diagram and does the optimal operation for current state. As a result, it reduces whole monitoring time almost in half. Signed-off-by: Jonghwa Lee Signed-off-by: Krzysztof Kozlowski Signed-off-by: Jonathan Bakker Signed-off-by: Sebastian Reichel --- include/linux/power/charger-manager.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power/charger-manager.h b/include/linux/power/charger-manager.h index 3a98837684e3..c127dbe31e49 100644 --- a/include/linux/power/charger-manager.h +++ b/include/linux/power/charger-manager.h @@ -213,6 +213,7 @@ struct charger_desc { * saved status of battery before entering suspend-to-RAM * @charging_start_time: saved start time of enabling charging * @charging_end_time: saved end time of disabling charging + * @battery_status: Current battery status */ struct charger_manager { struct list_head entry; @@ -232,6 +233,8 @@ struct charger_manager { u64 charging_start_time; u64 charging_end_time; + + int battery_status; }; #endif /* _CHARGER_MANAGER_H */ -- cgit v1.2.3 From c1f73028f75df43689feda4bc70573b7d18a618e Mon Sep 17 00:00:00 2001 From: Jonathan Bakker Date: Thu, 14 May 2020 16:04:33 -0700 Subject: power: supply: charger-manager: Update extcon functions In commit 830ae442202e ("extcon: Remove the deprecated extcon functions") the function extcon_register_interest became a no-op returning an error, leading to non-functional behaviour in charger-manager. Additionally, a translation table is needed between the text representation of the extcon cable names and their IDs is needed. In order to retain DT compatibility, TA and CHARGE-DOWNSTREAM are added as they were present up until commit 11eecf910bd8 ("extcon: Modify the id and name of external connector") Signed-off-by: Jonathan Bakker Signed-off-by: Sebastian Reichel --- include/linux/power/charger-manager.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power/charger-manager.h b/include/linux/power/charger-manager.h index c127dbe31e49..45e228b353ea 100644 --- a/include/linux/power/charger-manager.h +++ b/include/linux/power/charger-manager.h @@ -40,7 +40,7 @@ enum cm_batt_temp { /** * struct charger_cable * @extcon_name: the name of extcon device. - * @name: the name of charger cable(external connector). + * @name: the name of the cable connector * @extcon_dev: the extcon device. * @wq: the workqueue to control charger according to the state of * charger cable. If charger cable is attached, enable charger. @@ -56,9 +56,10 @@ enum cm_batt_temp { struct charger_cable { const char *extcon_name; const char *name; + struct extcon_dev *extcon_dev; + u64 extcon_type; /* The charger-manager use Extcon framework */ - struct extcon_specific_cable_nb extcon_dev; struct work_struct wq; struct notifier_block nb; -- cgit v1.2.3 From 4afc41dfa5a716e9e7a90c22972583f337c0bcbf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2020 00:52:43 +0200 Subject: netfilter: conntrack: remove ignore stats This counter increments when nf_conntrack_in sees a packet that already has a conntrack attached or when the packet is marked as UNTRACKED. Neither is an error. The former is normal for loopback traffic. The second happens for certain ICMPv6 packets or when nftables/ip(6)tables rules are in place. In case someone needs to count UNTRACKED packets, or packets that are marked as untracked before conntrack_in this can be done with both nftables and ip(6)tables rules. 
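For completeness, the rule-based accounting mentioned in the last sentence could look roughly like the following (illustrative only; the nft table/chain names assume a conventional inet filter setup, and a rule without a jump target merely updates its packet/byte counters):

	nft add rule inet filter input ct state untracked counter
	ip6tables -A INPUT -m conntrack --ctstate UNTRACKED
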
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 1db83c931d9c..96b90d7e361f 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -8,7 +8,6 @@ struct ip_conntrack_stat { unsigned int found; unsigned int invalid; - unsigned int ignore; unsigned int insert; unsigned int insert_failed; unsigned int drop; -- cgit v1.2.3 From bc92470413f3af152db0d8f90ef3eb13f8cc417a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2020 00:52:44 +0200 Subject: netfilter: conntrack: add clash resolution stat counter There is a misconception about what "insert_failed" means. We increment this even when a clash got resolved, so it might not indicate a problem. Add a dedicated counter for clash resolution and only increment insert_failed if a clash cannot be resolved. For the old /proc interface, export this in place of an older stat that got removed a while back. For ctnetlink, export this with a new attribute. Also correct an outdated comment that implies we add a duplicate tuple -- we only add the (unique) reply direction. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 96b90d7e361f..0c7d8d1e945d 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -10,6 +10,7 @@ struct ip_conntrack_stat { unsigned int invalid; unsigned int insert; unsigned int insert_failed; + unsigned int clash_resolve; unsigned int drop; unsigned int early_drop; unsigned int error; -- cgit v1.2.3 From 1e6c62a8821557720a9b2ea9617359b264f2f67c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:11 -0700 Subject: bpf: Introduce sleepable BPF programs Introduce sleepable BPF programs that can request such property for themselves via BPF_F_SLEEPABLE flag at program load time. In such case they will be able to use helpers like bpf_copy_from_user() that might sleep. At present only fentry/fexit/fmod_ret and lsm programs can request to be sleepable and only when they are attached to kernel functions that are known to allow sleeping. The non-sleepable programs are relying on implicit rcu_read_lock() and migrate_disable() to protect life time of programs, maps that they use and per-cpu kernel structures used to pass info between bpf programs and the kernel. The sleepable programs cannot be enclosed into rcu_read_lock(). migrate_disable() maps to preempt_disable() in non-RT kernels, so the progs should not be enclosed in migrate_disable() as well. Therefore rcu_read_lock_trace is used to protect the life time of sleepable progs. There are many networking and tracing program types. In many cases the 'struct bpf_prog *' pointer itself is rcu protected within some other kernel data structure and the kernel code is using rcu_dereference() to load that program pointer and call BPF_PROG_RUN() on it. All these cases are not touched. Instead sleepable bpf programs are allowed with bpf trampoline only. 
The program pointers are hard-coded into generated assembly of bpf trampoline and synchronize_rcu_tasks_trace() is used to protect the life time of the program. The same trampoline can hold both sleepable and non-sleepable progs. When rcu_read_lock_trace is held it means that some sleepable bpf program is running from bpf trampoline. Those programs can use bpf arrays and preallocated hash/lru maps. These map types are waiting on programs to complete via synchronize_rcu_tasks_trace(); Updates to trampoline now has to do synchronize_rcu_tasks_trace() and synchronize_rcu_tasks() to wait for sleepable progs to finish and for trampoline assembly to finish. This is the first step of introducing sleepable progs. Eventually dynamically allocated hash maps can be allowed and networking program types can become sleepable too. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Josef Bacik Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-3-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dbba82a80087..4dd7e927621d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -539,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(void); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); +void notrace __bpf_prog_enter_sleepable(void); +void notrace __bpf_prog_exit_sleepable(void); struct bpf_ksym { unsigned long start; @@ -734,6 +736,7 @@ struct bpf_prog_aux { bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; + bool sleepable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; -- cgit v1.2.3 From 07be4c4a3e7a0db148e44b16c5190e753d1c8569 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:12 -0700 Subject: bpf: Add bpf_copy_from_user() helper. Sleepable BPF programs can now use copy_from_user() to access user memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4dd7e927621d..c6d9f2c444f4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1784,6 +1784,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; +extern const struct bpf_func_proto bpf_copy_from_user_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From 9667305c6374df8672be46bc496f52f040999531 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 31 Aug 2020 08:51:55 -0700 Subject: bpf: Fix build without BPF_SYSCALL, but with BPF_JIT. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_BPF_SYSCALL is not set, but CONFIG_BPF_JIT=y the kernel build fails: In file included from ../kernel/bpf/trampoline.c:11: ../kernel/bpf/trampoline.c: In function ‘bpf_trampoline_update’: ../kernel/bpf/trampoline.c:220:39: error: ‘call_rcu_tasks_trace’ undeclared ../kernel/bpf/trampoline.c: In function ‘__bpf_prog_enter_sleepable’: ../kernel/bpf/trampoline.c:411:2: error: implicit declaration of function ‘rcu_read_lock_trace’ ../kernel/bpf/trampoline.c: In function ‘__bpf_prog_exit_sleepable’: ../kernel/bpf/trampoline.c:416:2: error: implicit declaration of function ‘rcu_read_unlock_trace’ This is due to: obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_JIT) += dispatcher.o There is a number of functions that arch/x86/net/bpf_jit_comp.c is using from these two files, but none of them will be used when only cBPF is on (which is the case for BPF_SYSCALL=n BPF_JIT=y). Add rcu_trace functions to rcupdate_trace.h. The JITed code won't execute them and BPF trampoline logic won't be used without BPF_SYSCALL. Fixes: 1e6c62a88215 ("bpf: Introduce sleepable BPF programs") Reported-by: kernel test robot Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Paul E. McKenney Link: https://lore.kernel.org/bpf/20200831155155.62754-1-alexei.starovoitov@gmail.com --- include/linux/rcupdate_trace.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index d9015aac78c6..aaaac8ac927c 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -82,7 +82,14 @@ static inline void rcu_read_unlock_trace(void) void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); void synchronize_rcu_tasks_trace(void); void rcu_barrier_tasks_trace(void); - +#else +/* + * The BPF JIT forms these addresses even when it doesn't call these + * functions, so provide definitions that result in runtime errors. + */ +static inline void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) { BUG(); } +static inline void rcu_read_lock_trace(void) { BUG(); } +static inline void rcu_read_unlock_trace(void) { BUG(); } #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ #endif /* __LINUX_RCUPDATE_TRACE_H */ -- cgit v1.2.3 From 1742b3d528690ae7773cf7bf2f01a90ee1de2fe0 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:15 +0200 Subject: xsk: i40e: ice: ixgbe: mlx5: Pass buffer pool to driver instead of umem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the explicit umem reference passed to the driver in AF_XDP zero-copy mode with the buffer pool instead. This in preparation for extending the functionality of the zero-copy mode so that umems can be shared between queues on the same netdev and also between netdevs. In this commit, only an umem reference has been added to the buffer pool struct. But later commits will add other entities to it. These are going to be entities that are different between different queue ids and netdevs even though the umem is shared between them. 
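Tying the two sleepable-BPF entries above together, a sleepable tracing program is the intended consumer of the new helper; a minimal sketch (the attach target is made up, and libbpf's "fentry.s" section prefix is assumed to request BPF_F_SLEEPABLE at load time):

SEC("fentry.s/example_kernel_func")
int BPF_PROG(copy_user_arg, const char *user_buf)
{
	char tmp[64] = {};

	/* May fault and sleep, hence only allowed in sleepable programs. */
	bpf_copy_from_user(tmp, sizeof(tmp), user_buf);
	return 0;
}
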
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-2-git-send-email-magnus.karlsson@intel.com --- include/linux/netdevice.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0e303f6603f..d088dd866825 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -618,7 +618,7 @@ struct netdev_queue { /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS - struct xdp_umem *umem; + struct xsk_buff_pool *pool; #endif /* * write-mostly part @@ -751,7 +751,7 @@ struct netdev_rx_queue { struct net_device *dev; struct xdp_rxq_info xdp_rxq; #ifdef CONFIG_XDP_SOCKETS - struct xdp_umem *umem; + struct xsk_buff_pool *pool; #endif } ____cacheline_aligned_in_smp; @@ -879,7 +879,7 @@ enum bpf_netdev_command { /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, - XDP_SETUP_XSK_UMEM, + XDP_SETUP_XSK_POOL, }; struct bpf_prog_offload_ops; @@ -913,9 +913,9 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; - /* XDP_SETUP_XSK_UMEM */ + /* XDP_SETUP_XSK_POOL */ struct { - struct xdp_umem *umem; + struct xsk_buff_pool *pool; u16 queue_id; } xsk; }; -- cgit v1.2.3 From afd6220999d4932616322f8a893371f8d0567a2a Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Sun, 30 Aug 2020 11:33:58 +0300 Subject: net: phylink: add helper function to decode USXGMII word With the new addition of the USXGMII link partner ability constants we can now introduce a phylink helper that decodes the USXGMII word and populates the appropriate fields in the phylink_link_state structure based on them. Signed-off-by: Ioana Ciornei Reviewed-by: Russell King Signed-off-by: David S. Miller --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index c36fb41a7d90..d81a714cfbbd 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -490,4 +490,7 @@ void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs); void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs, struct phylink_link_state *state); + +void phylink_decode_usxgmii_word(struct phylink_link_state *state, + uint16_t lpa); #endif -- cgit v1.2.3 From 2dab432c5ae4f9c8eab3132ac0e9facf498ead92 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Sun, 30 Aug 2020 11:34:00 +0300 Subject: net: mdiobus: add clause 45 mdiobus write accessor Add the locked variant of the clause 45 mdiobus write accessor - mdiobus_c45_write(). Signed-off-by: Ioana Ciornei Reviewed-by: Russell King Signed-off-by: David S. 
Miller --- include/linux/mdio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 898cbf00332a..3a88b699b758 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -358,6 +358,12 @@ static inline int mdiobus_c45_read(struct mii_bus *bus, int prtad, int devad, return mdiobus_read(bus, prtad, mdiobus_c45_addr(devad, regnum)); } +static inline int mdiobus_c45_write(struct mii_bus *bus, int prtad, int devad, + u16 regnum, u16 val) +{ + return mdiobus_write(bus, prtad, mdiobus_c45_addr(devad, regnum), val); +} + int mdiobus_register_device(struct mdio_device *mdiodev); int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); -- cgit v1.2.3 From 0da4c3d393e40e41e3c6b9f1cebaa498512c2abb Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Sun, 30 Aug 2020 11:34:01 +0300 Subject: net: phy: add Lynx PCS module Add a Lynx PCS module which exposes the necessary operations to drive the PCS using phylink. The majority of the code is extracted from the Felix DSA driver, which will be also modified in a later patch, and exposed as a separate module for code reusability purposes. As such, this aims at feature and bug parity with the existing Felix DSA driver, and thus USXGMII, SGMII, QSGMII and 2500Base-X (only w/o in-band AN) are supported by the Lynx PCS module since these were also supported by Felix. The module can only be enabled by the drivers in need and not user selectable. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- include/linux/pcs-lynx.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 include/linux/pcs-lynx.h (limited to 'include/linux') diff --git a/include/linux/pcs-lynx.h b/include/linux/pcs-lynx.h new file mode 100644 index 000000000000..a6440d6ebe95 --- /dev/null +++ b/include/linux/pcs-lynx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* Copyright 2020 NXP + * Lynx PCS helpers + */ + +#ifndef __LINUX_PCS_LYNX_H +#define __LINUX_PCS_LYNX_H + +#include +#include + +struct lynx_pcs { + struct phylink_pcs pcs; + struct mdio_device *mdio; +}; + +struct lynx_pcs *lynx_pcs_create(struct mdio_device *mdio); + +void lynx_pcs_destroy(struct lynx_pcs *pcs); + +#endif /* __LINUX_PCS_LYNX_H */ -- cgit v1.2.3 From 3f7d820bad6cace3ecb859bf3fa5bc56fe8bb1c6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 31 Aug 2020 02:26:10 -0400 Subject: net: ipv6: remove unused arg exact_dif in compute_score The arg exact_dif is not used anymore, remove it. inet6_exact_dif_match() is no longer needed after the above is removed, remove it too. Signed-off-by: Miaohe Lin Signed-off-by: David S. 
Miller --- include/linux/ipv6.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index bac8f4fffbd6..dda61d150a13 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -177,17 +177,6 @@ static inline int inet6_sdif(const struct sk_buff *skb) return 0; } -/* can not be used in TCP layer after tcp_v6_fill_cb */ -static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if defined(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_tcp_l3mdev_accept && - skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) - return true; -#endif - return false; -} - struct tcp6_request_sock { struct tcp_request_sock tcp6rsk_tcp; }; -- cgit v1.2.3 From f53fa968a7344970b8f8a5707c39cdcf17a6f367 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 5 May 2020 15:42:35 +0200 Subject: scif: Fix spelling of EACCES As per POSIX, the correct spelling is EACCES: include/uapi/asm-generic/errno-base.h:#define EACCES 13 /* Permission denied */ Signed-off-by: Geert Uytterhoeven Signed-off-by: Jiri Kosina --- include/linux/scif.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scif.h b/include/linux/scif.h index eeb250b73c4b..329e695b8fe5 100644 --- a/include/linux/scif.h +++ b/include/linux/scif.h @@ -657,7 +657,7 @@ int scif_unregister(scif_epd_t epd, off_t offset, size_t len); * the negative of one of the following errors is returned. * * Errors: - * EACCESS - Attempt to write to a read-only range + * EACCES - Attempt to write to a read-only range * EBADF, ENOTTY - epd is not a valid endpoint descriptor * ECONNRESET - Connection reset by peer * EINVAL - rma_flags is invalid @@ -733,7 +733,7 @@ int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t * the negative of one of the following errors is returned. * * Errors: - * EACCESS - Attempt to write to a read-only range + * EACCES - Attempt to write to a read-only range * EBADF, ENOTTY - epd is not a valid endpoint descriptor * ECONNRESET - Connection reset by peer * EINVAL - rma_flags is invalid @@ -815,7 +815,7 @@ int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t * the negative of one of the following errors is returned. * * Errors: - * EACCESS - Attempt to write to a read-only range + * EACCES - Attempt to write to a read-only range * EBADF, ENOTTY - epd is not a valid endpoint descriptor * ECONNRESET - Connection reset by peer * EINVAL - rma_flags is invalid @@ -895,7 +895,7 @@ int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset, * the negative of one of the following errors is returned. * * Errors: - * EACCESS - Attempt to write to a read-only range + * EACCES - Attempt to write to a read-only range * EBADF, ENOTTY - epd is not a valid endpoint descriptor * ECONNRESET - Connection reset by peer * EINVAL - rma_flags is invalid -- cgit v1.2.3 From a8c7ffdb5fdde3a57c0b654f66f4d81325abe69f Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:51:56 +0200 Subject: mtd: nand: Introduce the ECC engine framework Create a generic ECC engine framework. This is a base to instantiate ECC engine objects. If we really want to be generic, bindings must evolve, so here is the new logic. The following three properties are mutually exclusive: - The nand-no-ecc-engine boolean property is set and there is no ECC engine to retrieve. 
- The nand-use-soft-ecc-engine boolean property is set and the core will force using the use of software correction. - There is a nand-ecc-engine property pointing at a node which will act as ECC engine. It the later case, the property may reference: - The NAND chip node itself (for the on-die ECC case). - The parent node if the NAND controller embeds an ECC engine. - Any other node being an external ECC controller as well. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-9-miquel.raynal@bootlin.com --- include/linux/mtd/nand.h | 114 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index e754a6fc8a4b..8cf5bdbea782 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -127,6 +127,40 @@ struct nand_page_io_req { int mode; }; +const struct mtd_ooblayout_ops *nand_get_small_page_ooblayout(void); +const struct mtd_ooblayout_ops *nand_get_large_page_ooblayout(void); +const struct mtd_ooblayout_ops *nand_get_large_page_hamming_ooblayout(void); + +/** + * enum nand_ecc_engine_type - NAND ECC engine type + * @NAND_ECC_ENGINE_TYPE_INVALID: Invalid value + * @NAND_ECC_ENGINE_TYPE_NONE: No ECC correction + * @NAND_ECC_ENGINE_TYPE_SOFT: Software ECC correction + * @NAND_ECC_ENGINE_TYPE_ON_HOST: On host hardware ECC correction + * @NAND_ECC_ENGINE_TYPE_ON_DIE: On chip hardware ECC correction + */ +enum nand_ecc_engine_type { + NAND_ECC_ENGINE_TYPE_INVALID, + NAND_ECC_ENGINE_TYPE_NONE, + NAND_ECC_ENGINE_TYPE_SOFT, + NAND_ECC_ENGINE_TYPE_ON_HOST, + NAND_ECC_ENGINE_TYPE_ON_DIE, +}; + +/** + * enum nand_ecc_placement - NAND ECC bytes placement + * @NAND_ECC_PLACEMENT_UNKNOWN: The actual position of the ECC bytes is unknown + * @NAND_ECC_PLACEMENT_OOB: The ECC bytes are located in the OOB area + * @NAND_ECC_PLACEMENT_INTERLEAVED: Syndrome layout, there are ECC bytes + * interleaved with regular data in the main + * area + */ +enum nand_ecc_placement { + NAND_ECC_PLACEMENT_UNKNOWN, + NAND_ECC_PLACEMENT_OOB, + NAND_ECC_PLACEMENT_INTERLEAVED, +}; + /** * enum nand_ecc_algo - NAND ECC algorithm * @NAND_ECC_ALGO_UNKNOWN: Unknown algorithm @@ -143,16 +177,27 @@ enum nand_ecc_algo { /** * struct nand_ecc_props - NAND ECC properties + * @engine_type: ECC engine type + * @placement: OOB placement (if relevant) + * @algo: ECC algorithm (if relevant) * @strength: ECC strength * @step_size: Number of bytes per step + * @flags: Misc properties */ struct nand_ecc_props { + enum nand_ecc_engine_type engine_type; + enum nand_ecc_placement placement; + enum nand_ecc_algo algo; unsigned int strength; unsigned int step_size; + unsigned int flags; }; #define NAND_ECCREQ(str, stp) { .strength = (str), .step_size = (stp) } +/* NAND ECC misc flags */ +#define NAND_ECC_MAXIMIZE_STRENGTH BIT(0) + /** * struct nand_bbt - bad block table object * @cache: in memory BBT cache @@ -183,6 +228,75 @@ struct nand_ops { bool (*isbad)(struct nand_device *nand, const struct nand_pos *pos); }; +/** + * struct nand_ecc_context - Context for the ECC engine + * @conf: basic ECC engine parameters + * @total: total number of bytes used for storing ECC codes, this is used by + * generic OOB layouts + * @priv: ECC engine driver private data + */ +struct nand_ecc_context { + struct nand_ecc_props conf; + unsigned int total; + void *priv; +}; + +/** + * struct nand_ecc_engine_ops - ECC engine operations + * @init_ctx: given a desired user configuration for the pointed 
NAND device, + * requests the ECC engine driver to setup a configuration with + * values it supports. + * @cleanup_ctx: clean the context initialized by @init_ctx. + * @prepare_io_req: is called before reading/writing a page to prepare the I/O + * request to be performed with ECC correction. + * @finish_io_req: is called after reading/writing a page to terminate the I/O + * request and ensure proper ECC correction. + */ +struct nand_ecc_engine_ops { + int (*init_ctx)(struct nand_device *nand); + void (*cleanup_ctx)(struct nand_device *nand); + int (*prepare_io_req)(struct nand_device *nand, + struct nand_page_io_req *req); + int (*finish_io_req)(struct nand_device *nand, + struct nand_page_io_req *req); +}; + +/** + * struct nand_ecc_engine - ECC engine abstraction for NAND devices + * @ops: ECC engine operations + */ +struct nand_ecc_engine { + struct nand_ecc_engine_ops *ops; +}; + +void of_get_nand_ecc_user_config(struct nand_device *nand); +int nand_ecc_init_ctx(struct nand_device *nand); +void nand_ecc_cleanup_ctx(struct nand_device *nand); +int nand_ecc_prepare_io_req(struct nand_device *nand, + struct nand_page_io_req *req); +int nand_ecc_finish_io_req(struct nand_device *nand, + struct nand_page_io_req *req); +bool nand_ecc_is_strong_enough(struct nand_device *nand); + +/** + * struct nand_ecc - Information relative to the ECC + * @defaults: Default values, depend on the underlying subsystem + * @requirements: ECC requirements from the NAND chip perspective + * @user_conf: User desires in terms of ECC parameters + * @ctx: ECC context for the ECC engine, derived from the device @requirements + * the @user_conf and the @defaults + * @ondie_engine: On-die ECC engine reference, if any + * @engine: ECC engine actually bound + */ +struct nand_ecc { + struct nand_ecc_props defaults; + struct nand_ecc_props requirements; + struct nand_ecc_props user_conf; + struct nand_ecc_context ctx; + struct nand_ecc_engine *ondie_engine; + struct nand_ecc_engine *engine; +}; + /** * struct nand_device - NAND device * @mtd: MTD instance attached to the NAND device -- cgit v1.2.3 From c1077616142907bb6ee987ecd136d6857ffd8787 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 1 Sep 2020 15:10:08 -0700 Subject: ip: expose inet sockopts through inet_diag Expose all exisiting inet sockopt bits through inet_diag for debug purpose. Corresponding changes in iproute2 ss will be submitted to output all these values. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 0ef2d800fda7..84abb30a3fbb 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -75,6 +75,8 @@ static inline size_t inet_diag_msg_attrs_size(void) #ifdef CONFIG_SOCK_CGROUP_DATA + nla_total_size_64bit(sizeof(u64)) /* INET_DIAG_CGROUP_ID */ #endif + + nla_total_size(sizeof(struct inet_diag_sockopt)) + /* INET_DIAG_SOCKOPT */ ; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, -- cgit v1.2.3 From fd5a13f4893c8df2a5a3af8599adecb52d05fe89 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Sep 2020 16:22:31 +0200 Subject: proc: add a read_iter method to proc proc_ops This will allow proc files to implement iter read semantics. 
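As a hedged sketch (not from this patch), a proc entry could now provide .proc_read_iter instead of .proc_read; the myproc_* names and message text are purely illustrative.

static ssize_t myproc_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	static const char msg[] = "hello from procfs\n";
	size_t len = sizeof(msg) - 1;
	size_t copied;

	if (iocb->ki_pos >= len)
		return 0;

	copied = copy_to_iter(msg + iocb->ki_pos, len - iocb->ki_pos, to);
	iocb->ki_pos += copied;
	return copied;
}

static const struct proc_ops myproc_ops = {
	.proc_read_iter	= myproc_read_iter,
	.proc_lseek	= default_llseek,
};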
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/proc_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 2df965cd0974..270cab43ca3d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -30,6 +30,7 @@ struct proc_ops { unsigned int proc_flags; int (*proc_open)(struct inode *, struct file *); ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); + ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *); ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); loff_t (*proc_lseek)(struct file *, loff_t, int); int (*proc_release)(struct inode *, struct file *); -- cgit v1.2.3 From e48c15b796d412ede883bb2ef7779b2a142f7962 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Jun 2020 17:21:32 -0700 Subject: smp: Add source and destination CPUs to __call_single_data This commit adds a destination CPU to __call_single_data, and is inspired by an earlier commit by Peter Zijlstra. This version adds #ifdef to permit use by 32-bit systems and supplying the destination CPU for all smp_call_function*() requests, not just smp_call_function_single(). If need be, 32-bit systems could be accommodated by shrinking the flags field to 16 bits (the atomic_t variant is currently unused) and by providing only eight bits for CPU on such systems. It is not clear that the addition of the fields to __call_single_node are really needed. [ paulmck: Apply Boqun Feng feedback on 32-bit builds. ] Link: https://lore.kernel.org/lkml/20200615164048.GC2531@hirez.programming.kicks-ass.net/ Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/smp.h | 3 +++ include/linux/smp_types.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 80d557ef8a11..9f13966d3d92 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -26,6 +26,9 @@ struct __call_single_data { struct { struct llist_node llist; unsigned int flags; +#ifdef CONFIG_64BIT + u16 src, dst; +#endif }; }; smp_call_func_t func; diff --git a/include/linux/smp_types.h b/include/linux/smp_types.h index 364b3ae3e41d..2e8461af8df6 100644 --- a/include/linux/smp_types.h +++ b/include/linux/smp_types.h @@ -61,6 +61,9 @@ struct __call_single_node { unsigned int u_flags; atomic_t a_flags; }; +#ifdef CONFIG_64BIT + u16 src, dst; +#endif }; #endif /* __LINUX_SMP_TYPES_H */ -- cgit v1.2.3 From 0f7c5317b89058e432d3e97efa19467ff4c3b86b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 4 Sep 2020 14:37:29 -0700 Subject: of: Export of_remove_property() to modules We will need to remove some OF properties in drivers/net/dsa/bcm_sf2.c with a subsequent commit. Export of_remove_property() to modules so we can keep bcm_sf2 modular and provide an empty stub for when CONFIG_OF is disabled to maintain the ability to compile test. 
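A minimal, hedged sketch of how a module could use the now-exported helper; the property name below is purely illustrative.

static void mydrv_strip_property(struct device_node *np)
{
	struct property *prop;

	prop = of_find_property(np, "example-property", NULL);
	if (prop)
		of_remove_property(np, prop);
}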
Signed-off-by: Florian Fainelli Acked-by: Rob Herring Signed-off-by: Jakub Kicinski --- include/linux/of.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 5cf7ae0465d1..481ec0467285 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -929,6 +929,11 @@ static inline int of_machine_is_compatible(const char *compat) return 0; } +static inline int of_remove_property(struct device_node *np, struct property *prop) +{ + return 0; +} + static inline bool of_console_check(const struct device_node *dn, const char *name, int index) { return false; -- cgit v1.2.3 From 07e292950b9368518c659c4d5f1dca4bf55779bd Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 20 Aug 2020 21:53:41 -0600 Subject: PCI: Allow root and child buses to have different pci_ops PCI host bridges often have different ways to access the root and child bus config spaces. The host bridge drivers have invented their own abstractions to handle this. Let's support having different root and child bus pci_ops so these per driver abstractions can be removed. Link: https://lore.kernel.org/r/20200821035420.380495-2-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Lorenzo Pieralisi Cc: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 835530605c0d..1fbe95a7d386 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -523,6 +523,7 @@ struct pci_host_bridge { struct device dev; struct pci_bus *bus; /* Root bus */ struct pci_ops *ops; + struct pci_ops *child_ops; void *sysdata; int busnr; struct list_head windows; /* resource_entry */ -- cgit v1.2.3 From 82894c1d397f16c2208a35dbb1310559f31980bb Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Mon, 7 Sep 2020 01:04:51 +0200 Subject: firmware: arm_scmi: Constify ops pointers in scmi_handle These are never modified, so make them const to allow drivers to make them const. Link: https://lore.kernel.org/r/20200906230452.33410-3-rikard.falkeborn@gmail.com Signed-off-by: Rikard Falkeborn Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 7e5dd7d1e221..05570afc7f74 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -279,12 +279,12 @@ struct scmi_notify_ops { struct scmi_handle { struct device *dev; struct scmi_revision_info *version; - struct scmi_perf_ops *perf_ops; - struct scmi_clk_ops *clk_ops; - struct scmi_power_ops *power_ops; - struct scmi_sensor_ops *sensor_ops; - struct scmi_reset_ops *reset_ops; - struct scmi_notify_ops *notify_ops; + const struct scmi_perf_ops *perf_ops; + const struct scmi_clk_ops *clk_ops; + const struct scmi_power_ops *power_ops; + const struct scmi_sensor_ops *sensor_ops; + const struct scmi_reset_ops *reset_ops; + const struct scmi_notify_ops *notify_ops; /* for protocol internal use */ void *perf_priv; void *clk_priv; -- cgit v1.2.3 From a8803055127afb87640974adedac60435592b86d Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Mon, 7 Sep 2020 18:46:55 +0100 Subject: firmware: arm_scmi: Add system power protocol support Add bare protocol support for SCMI system power protocol as needed by an OSPM agent: basic initialization and SYSTEM_POWER_STATE_NOTIFIER core notification support. 
No event-handling logic is attached to such notification.. Link: https://lore.kernel.org/r/20200907174657.32466-2-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 05570afc7f74..4b10093ad671 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -292,6 +292,7 @@ struct scmi_handle { void *sensor_priv; void *reset_priv; void *notify_priv; + void *system_priv; }; enum scmi_std_protocol { @@ -304,6 +305,15 @@ enum scmi_std_protocol { SCMI_PROTOCOL_RESET = 0x16, }; +enum scmi_system_events { + SCMI_SYSTEM_SHUTDOWN, + SCMI_SYSTEM_COLDRESET, + SCMI_SYSTEM_WARMRESET, + SCMI_SYSTEM_POWERUP, + SCMI_SYSTEM_SUSPEND, + SCMI_SYSTEM_MAX +}; + struct scmi_device { u32 id; u8 protocol_id; @@ -378,6 +388,7 @@ enum scmi_notification_events { SCMI_EVENT_SENSOR_TRIP_POINT_EVENT = 0x0, SCMI_EVENT_RESET_ISSUED = 0x0, SCMI_EVENT_BASE_ERROR_EVENT = 0x0, + SCMI_EVENT_SYSTEM_POWER_STATE_NOTIFIER = 0x0, }; struct scmi_power_state_changed_report { @@ -387,6 +398,13 @@ struct scmi_power_state_changed_report { unsigned int power_state; }; +struct scmi_system_power_state_notifier_report { + ktime_t timestamp; + unsigned int agent_id; + unsigned int flags; + unsigned int system_state; +}; + struct scmi_perf_limits_report { ktime_t timestamp; unsigned int agent_id; -- cgit v1.2.3 From 4bd6a7353ee14697fea645e941354976d2c4a452 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 3 Sep 2020 16:22:32 +0200 Subject: sysctl: Convert to iter interfaces Using the read_iter/write_iter interfaces allows for in-kernel users to set sysctls without using set_fs(). Also, the buffer is a string, so give it the real type of 'char *', not void *. [AV: Christoph's fixup folded in] Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/bpf-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 64f367044e25..82b26a1386d8 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -136,7 +136,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void **buf, size_t *pcount, loff_t *ppos, + char **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, -- cgit v1.2.3 From 36e2c7421f02a22f71c9283e55fdb672a9eb58e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Sep 2020 16:22:34 +0200 Subject: fs: don't allow splice read/write without explicit ops default_file_splice_write is the last piece of generic code that uses set_fs to make the uaccess routines operate on kernel pointers. It implements a "fallback loop" for splicing from files that do not actually provide a proper splice_read method. The usual file systems and other high bandwidth instances all provide a ->splice_read, so this just removes support for various device drivers and procfs/debugfs files. If splice support for any of those turns out to be important it can be added back by switching them to the iter ops and using generic_file_splice_read. 
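A file that still needs splice support after this change has to supply the ops explicitly; a hedged sketch for a hypothetical "myfs" built on the generic iter helpers:

const struct file_operations myfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
};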
Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Signed-off-by: Al Viro --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e019ea2f1347..d33cc3e8ed41 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1894,8 +1894,6 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); -extern ssize_t vfs_readv(struct file *, const struct iovec __user *, - unsigned long, loff_t *, rwf_t); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, -- cgit v1.2.3 From 5e6e9852d6f76e01b2e6803c74258afa5b432bc5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Sep 2020 16:22:35 +0200 Subject: uaccess: add infrastructure for kernel builds with set_fs() Add a CONFIG_SET_FS option that is selected by architecturess that implement set_fs, which is all of them initially. If the option is not set stubs for routines related to overriding the address space are provided so that architectures can start to opt out of providing set_fs. Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Signed-off-by: Al Viro --- include/linux/uaccess.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 94b285411659..70073c802b48 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -8,6 +8,7 @@ #include +#ifdef CONFIG_SET_FS /* * Force the uaccess routines to be wired up for actual userspace access, * overriding any possible set_fs(KERNEL_DS) still lingering around. Undone @@ -25,6 +26,23 @@ static inline void force_uaccess_end(mm_segment_t oldfs) { set_fs(oldfs); } +#else /* CONFIG_SET_FS */ +typedef struct { + /* empty dummy */ +} mm_segment_t; + +#define uaccess_kernel() (false) +#define user_addr_max() (TASK_SIZE_MAX) + +static inline mm_segment_t force_uaccess_begin(void) +{ + return (mm_segment_t) { }; +} + +static inline void force_uaccess_end(mm_segment_t oldfs) +{ +} +#endif /* CONFIG_SET_FS */ /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) -- cgit v1.2.3 From 2a71593da34d473461f2f5c3dbb53b883596188a Mon Sep 17 00:00:00 2001 From: Alain Volmat Date: Mon, 3 Aug 2020 07:17:55 +0200 Subject: i2c: smbus: add core function handling SMBus host-notify SMBus Host-Notify protocol, from the adapter point of view consist of receiving a message from a client, including the client address and some other data. It can be simply handled by creating a new slave device and registering a callback performing the parsing of the message received from the client. This commit introduces two new core functions * i2c_new_slave_host_notify_device * i2c_free_slave_host_notify_device that take care of registration of the new slave device and callback and will call i2c_handle_smbus_host_notify once a Host-Notify event is received. 
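A hedged sketch of how an adapter driver might consume the new helpers; the mydrv_priv structure and its fields are illustrative only.

static int mydrv_enable_host_notify(struct mydrv_priv *priv)
{
	priv->host_notify_client =
		i2c_new_slave_host_notify_device(&priv->adap);
	return PTR_ERR_OR_ZERO(priv->host_notify_client);
}

static void mydrv_disable_host_notify(struct mydrv_priv *priv)
{
	if (!IS_ERR_OR_NULL(priv->host_notify_client))
		i2c_free_slave_host_notify_device(priv->host_notify_client);
}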
Signed-off-by: Alain Volmat Reviewed-by: Pierre-Yves MORDRET Signed-off-by: Wolfram Sang --- include/linux/i2c-smbus.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c-smbus.h b/include/linux/i2c-smbus.h index 1e4e0de4ef8b..1ef421818d3a 100644 --- a/include/linux/i2c-smbus.h +++ b/include/linux/i2c-smbus.h @@ -38,6 +38,18 @@ static inline int of_i2c_setup_smbus_alert(struct i2c_adapter *adap) return 0; } #endif +#if IS_ENABLED(CONFIG_I2C_SMBUS) && IS_ENABLED(CONFIG_I2C_SLAVE) +struct i2c_client *i2c_new_slave_host_notify_device(struct i2c_adapter *adapter); +void i2c_free_slave_host_notify_device(struct i2c_client *client); +#else +static inline struct i2c_client *i2c_new_slave_host_notify_device(struct i2c_adapter *adapter) +{ + return ERR_PTR(-ENOSYS); +} +static inline void i2c_free_slave_host_notify_device(struct i2c_client *client) +{ +} +#endif #if IS_ENABLED(CONFIG_I2C_SMBUS) && IS_ENABLED(CONFIG_DMI) void i2c_register_spd(struct i2c_adapter *adap); -- cgit v1.2.3 From 6bbdd563ee9a6078725727571586c66c8613db64 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 3 Mar 2020 14:58:21 -0500 Subject: dax: Create a range version of dax_layout_busy_page() virtiofs device has a range of memory which is mapped into file inodes using dax. This memory is mapped in qemu on host and maps different sections of real file on host. Size of this memory is limited (determined by administrator) and depending on filesystem size, we will soon reach a situation where all the memory is in use and we need to reclaim some. As part of reclaim process, we will need to make sure that there are no active references to pages (taken by get_user_pages()) on the memory range we are trying to reclaim. I am planning to use dax_layout_busy_page() for this. But in current form this is per inode and scans through all the pages of the inode. We want to reclaim only a portion of memory (say 2MB page). So we want to make sure that only that 2MB range of pages do not have any references (and don't want to unmap all the pages of inode). Hence, create a range version of this function named dax_layout_busy_page_range() which can be used to pass a range which needs to be unmapped. 
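A hedged sketch of the intended call pattern, modelled on how existing dax_layout_busy_page() callers wait for page references to drop; the mydax_* name is illustrative.

static int mydax_break_layouts(struct address_space *mapping,
			       loff_t start, loff_t end)
{
	struct page *page;

	page = dax_layout_busy_page_range(mapping, start, end);
	if (!page)
		return 0;	/* no pinned pages in the range */

	return ___wait_var_event(&page->_refcount,
				 atomic_read(&page->_refcount) == 1,
				 TASK_INTERRUPTIBLE, 0, 0, schedule());
}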
Cc: Dan Williams Cc: linux-nvdimm@lists.01.org Cc: Jan Kara Cc: Vishal L Verma Cc: "Weiny, Ira" Signed-off-by: Vivek Goyal Reviewed-by: Jan Kara Signed-off-by: Miklos Szeredi --- include/linux/dax.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 6904d4e0b2e0..9016929db4c6 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -141,6 +141,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc); struct page *dax_layout_busy_page(struct address_space *mapping); +struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); dax_entry_t dax_lock_page(struct page *page); void dax_unlock_page(struct page *page, dax_entry_t cookie); #else @@ -171,6 +172,11 @@ static inline struct page *dax_layout_busy_page(struct address_space *mapping) return NULL; } +static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t start, pgoff_t nr_pages) +{ + return NULL; +} + static inline int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc) { -- cgit v1.2.3 From 0feea33d79825d05b5ede30947db4df34722b463 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 22 Jul 2020 15:01:20 -0700 Subject: soc: qcom-geni-se: Don't use relaxed writes when writing commands Writing the command is the final step in kicking off a transfer. Let's use writel() to ensure that any other memory accesses are done before the command kicks off. It's expected that this is mostly relevant if we're in DMA mode but since it doesn't appear to regress performance in a measurable way [1] even in PIO mode and it's easier to reason about then let's just always use it. NOTE: this patch came about due to code inspection. No actual problems were observed that this patch fixes. [1] Tested by timing "flashrom -p ec" on a Chromebook which stresses GENI SPI a lot. Reviewed-by: Mukesh Kumar Savaliya Reviewed-by: Akash Asthana Reviewed-by: Stephen Boyd Suggested-by: Stephen Boyd Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200722150113.1.Ia50ab5cb8a6d3a73d302e6bdc25542d48ffd27f4@changeid Signed-off-by: Bjorn Andersson --- include/linux/qcom-geni-se.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h index 8f385fbe5a0e..ae4a8a766b69 100644 --- a/include/linux/qcom-geni-se.h +++ b/include/linux/qcom-geni-se.h @@ -296,7 +296,7 @@ static inline void geni_se_setup_m_cmd(struct geni_se *se, u32 cmd, u32 params) u32 m_cmd; m_cmd = (cmd << M_OPCODE_SHFT) | (params & M_PARAMS_MSK); - writel_relaxed(m_cmd, se->base + SE_GENI_M_CMD0); + writel(m_cmd, se->base + SE_GENI_M_CMD0); } /** @@ -316,7 +316,7 @@ static inline void geni_se_setup_s_cmd(struct geni_se *se, u32 cmd, u32 params) s_cmd &= ~(S_OPCODE_MSK | S_PARAMS_MSK); s_cmd |= (cmd << S_OPCODE_SHFT); s_cmd |= (params & S_PARAMS_MSK); - writel_relaxed(s_cmd, se->base + SE_GENI_S_CMD0); + writel(s_cmd, se->base + SE_GENI_S_CMD0); } /** -- cgit v1.2.3 From 5198d545dba8ad893f5e5a029ca8d43ee7bcf011 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Sep 2020 10:37:51 -0700 Subject: net: remove napi_hash_del() from driver-facing API We allow drivers to call napi_hash_del() before calling netif_napi_del() to batch RCU grace periods. 
This makes the API asymmetric and leaks internal implementation details. Soon we will want the grace period to protect more than just the NAPI hash table. Restructure the API and have drivers call a new function - __netif_napi_del() if they want to take care of RCU waits. Note that only core was checking the return status from napi_hash_del() so the new helper does not report if the NAPI was actually deleted. Some notes on driver oddness: - veth observed the grace period before calling netif_napi_del() but that should not matter - myri10ge observed normal RCU flavor - bnx2x and enic did not actually observe the grace period (unless they did so implicitly) - virtio_net and enic only unhashed Rx NAPIs The last two points seem to indicate that the calls to napi_hash_del() were a left over rather than an optimization. Regardless, it's easy enough to correct them. This patch may introduce extra synchronize_net() calls for interfaces which set NAPI_STATE_NO_BUSY_POLL and depend on free_netdev() to call netif_napi_del(). This seems inevitable since we want to use RCU for netpoll dev->napi_list traversal, and almost no drivers set IFF_DISABLE_NETPOLL. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7f9fcfd15942..74ed95215091 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -70,6 +70,7 @@ struct udp_tunnel_nic; struct bpf_prog; struct xdp_buff; +void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); @@ -488,20 +489,6 @@ static inline bool napi_complete(struct napi_struct *n) return napi_complete_done(n, 0); } -/** - * napi_hash_del - remove a NAPI from global table - * @napi: NAPI context - * - * Warning: caller must observe RCU grace period - * before freeing memory containing @napi, if - * this function returns true. - * Note: core networking stack automatically calls it - * from netif_napi_del(). - * Drivers might want to call this helper to combine all - * the needed RCU grace periods into a single one. - */ -bool napi_hash_del(struct napi_struct *napi); - /** * napi_disable - prevent NAPI from scheduling * @n: NAPI context @@ -2367,13 +2354,27 @@ static inline void netif_tx_napi_add(struct net_device *dev, netif_napi_add(dev, napi, poll, weight); } +/** + * __netif_napi_del - remove a NAPI context + * @napi: NAPI context + * + * Warning: caller must observe RCU grace period before freeing memory + * containing @napi. Drivers might want to call this helper to combine + * all the needed RCU grace periods into a single one. + */ +void __netif_napi_del(struct napi_struct *napi); + /** * netif_napi_del - remove a NAPI context * @napi: NAPI context * * netif_napi_del() removes a NAPI context from the network device NAPI list */ -void netif_napi_del(struct napi_struct *napi); +static inline void netif_napi_del(struct napi_struct *napi) +{ + __netif_napi_del(napi); + synchronize_net(); +} struct napi_gro_cb { /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. 
*/ @@ -2797,7 +2798,6 @@ static inline void unregister_netdevice(struct net_device *dev) int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); void netdev_freemem(struct net_device *dev); -void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, -- cgit v1.2.3 From 4d092dd2041a5f328410ad16d63b8606c24e8604 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Sep 2020 10:37:52 -0700 Subject: net: manage napi add/del idempotence explicitly To RCUify napi->dev_list we need to replace list_del_init() with list_del_rcu(). There is no _init() version for RCU for obvious reasons. Up until now netif_napi_del() was idempotent so to make sure it remains such add a bit which is set when NAPI is listed, and cleared when it removed. Since we don't expect multiple calls to netif_napi_add() to be correct, add a warning on that side. Now that napi_hash_add / napi_hash_del are only called by napi_add / del we can actually steal its bit. We just need to make sure hash node is initialized correctly. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 74ed95215091..157e0242e9ee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -355,7 +355,7 @@ enum { NAPI_STATE_MISSED, /* reschedule a napi */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ - NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ + NAPI_STATE_LISTED, /* NAPI added to system lists */ NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ }; @@ -365,7 +365,7 @@ enum { NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), - NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED), + NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), }; -- cgit v1.2.3 From e9b12edc133b54e15ecd105620d51fb8e8fa8bde Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 9 Sep 2020 17:50:46 -0700 Subject: tcp: record received TOS value in the request socket A new field is added to the request sock to record the TOS value received on the listening socket during 3WHS: When not under syn flood, it is recording the TOS value sent in SYN. When under syn flood, it is recording the TOS value sent in the ACK. This is a preparation patch in order to do TOS reflection in the later commit. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 56ff2952edaf..2f87377e9af7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -134,6 +134,7 @@ struct tcp_request_sock { * FastOpen it's the seq# * after data-in-SYN. 
*/ + u8 syn_tos; }; static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) -- cgit v1.2.3 From 3d7bfea8b8378277a25b42b28fe5a2a5ca76a7cf Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Jul 2020 02:12:34 -0700 Subject: unicode: Add utf8_casefold_hash This adds a case insensitive hash function to allow taking the hash without needing to allocate a casefolded copy of the string. The existing d_hash implementations for casefolding allocate memory within rcu-walk, by avoiding it we can be more efficient and avoid worrying about a failed allocation. Signed-off-by: Daniel Rosenberg Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- include/linux/unicode.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/unicode.h b/include/linux/unicode.h index 990aa97d8049..74484d44c755 100644 --- a/include/linux/unicode.h +++ b/include/linux/unicode.h @@ -27,6 +27,9 @@ int utf8_normalize(const struct unicode_map *um, const struct qstr *str, int utf8_casefold(const struct unicode_map *um, const struct qstr *str, unsigned char *dest, size_t dlen); +int utf8_casefold_hash(const struct unicode_map *um, const void *salt, + struct qstr *str); + struct unicode_map *utf8_load(const char *version); void utf8_unload(struct unicode_map *um); -- cgit v1.2.3 From c843843e714c8f17280d7db009412b1b1baf448b Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Jul 2020 02:12:35 -0700 Subject: fs: Add standard casefolding support This adds general supporting functions for filesystems that use utf8 casefolding. It provides standard dentry_operations and adds the necessary structures in struct super_block to allow this standardization. The new dentry operations are functionally equivalent to the existing operations in ext4 and f2fs, apart from the use of utf8_casefold_hash to avoid an allocation. By providing a common implementation, all users can benefit from any optimizations without needing to port over improvements. 
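A hedged sketch of how a casefolding filesystem could adopt the common helpers; the myfs names are illustrative and error handling is omitted.

#ifdef CONFIG_UNICODE
static const struct dentry_operations myfs_ci_dentry_ops = {
	.d_hash		= generic_ci_d_hash,
	.d_compare	= generic_ci_d_compare,
};

/* at mount time, after the encoding has been loaded */
static void myfs_setup_casefold(struct super_block *sb,
				struct unicode_map *map, bool strict)
{
	sb->s_encoding = map;
	sb->s_encoding_flags = strict ? SB_ENC_STRICT_MODE_FL : 0;
	sb->s_d_op = &myfs_ci_dentry_ops;
}
#endif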
Signed-off-by: Daniel Rosenberg Reviewed-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- include/linux/fs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7519ae003a08..bc5417c61e12 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1371,6 +1371,12 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_ACTIVE (1<<30) #define SB_NOUSER (1<<31) +/* These flags relate to encoding and casefolding */ +#define SB_ENC_STRICT_MODE_FL (1 << 0) + +#define sb_has_strict_encoding(sb) \ + (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) + /* * Umount options */ @@ -1440,6 +1446,10 @@ struct super_block { #endif #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; +#endif +#ifdef CONFIG_UNICODE + struct unicode_map *s_encoding; + __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ @@ -3262,6 +3272,12 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); +#ifdef CONFIG_UNICODE +extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); +extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name); +#endif + #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, struct page *, struct page *, -- cgit v1.2.3 From eca4873ee1b63ee051e0eed91099fa42c97b2438 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 8 Jul 2020 02:12:36 -0700 Subject: f2fs: Use generic casefolding support This switches f2fs over to the generic support provided in the previous patch. Since casefolded dentries behave the same in ext4 and f2fs, we decrease the maintenance burden by unifying them, and any optimizations will immediately apply to both. Signed-off-by: Daniel Rosenberg Reviewed-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3c383ddd92dd..a5dbb57a687f 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -38,9 +38,6 @@ #define F2FS_MAX_QUOTAS 3 #define F2FS_ENC_UTF8_12_1 1 -#define F2FS_ENC_STRICT_MODE_FL (1 << 0) -#define f2fs_has_strict_mode(sbi) \ - (sbi->s_encoding_flags & F2FS_ENC_STRICT_MODE_FL) #define F2FS_IO_SIZE(sbi) (1 << F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ #define F2FS_IO_SIZE_KB(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 2)) /* KB */ -- cgit v1.2.3 From d66423fbe11e141206f117b232916aa899d44959 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 10 Sep 2020 12:02:48 +0100 Subject: bpf: Plug hole in struct bpf_sk_lookup_kern As Alexei points out, struct bpf_sk_lookup_kern has two 4-byte holes. 
This leads to suboptimal instructions being generated (IPv4, x86): 1372 struct bpf_sk_lookup_kern ctx = { 0xffffffff81b87f30 <+624>: xor %eax,%eax 0xffffffff81b87f32 <+626>: mov $0x6,%ecx 0xffffffff81b87f37 <+631>: lea 0x90(%rsp),%rdi 0xffffffff81b87f3f <+639>: movl $0x110002,0x88(%rsp) 0xffffffff81b87f4a <+650>: rep stos %rax,%es:(%rdi) 0xffffffff81b87f4d <+653>: mov 0x8(%rsp),%eax 0xffffffff81b87f51 <+657>: mov %r13d,0x90(%rsp) 0xffffffff81b87f59 <+665>: incl %gs:0x7e4970a0(%rip) 0xffffffff81b87f60 <+672>: mov %eax,0x8c(%rsp) 0xffffffff81b87f67 <+679>: movzwl 0x10(%rsp),%eax 0xffffffff81b87f6c <+684>: mov %ax,0xa8(%rsp) 0xffffffff81b87f74 <+692>: movzwl 0x38(%rsp),%eax 0xffffffff81b87f79 <+697>: mov %ax,0xaa(%rsp) Fix this by moving around sport and dport. pahole confirms there are no more holes: struct bpf_sk_lookup_kern { u16 family; /* 0 2 */ u16 protocol; /* 2 2 */ __be16 sport; /* 4 2 */ u16 dport; /* 6 2 */ struct { __be32 saddr; /* 8 4 */ __be32 daddr; /* 12 4 */ } v4; /* 8 8 */ struct { const struct in6_addr * saddr; /* 16 8 */ const struct in6_addr * daddr; /* 24 8 */ } v6; /* 16 16 */ struct sock * selected_sk; /* 32 8 */ bool no_reuseport; /* 40 1 */ /* size: 48, cachelines: 1, members: 8 */ /* padding: 7 */ /* last cacheline: 48 bytes */ }; The assembly also doesn't contain the pesky rep stos anymore: 1372 struct bpf_sk_lookup_kern ctx = { 0xffffffff81b87f60 <+624>: movzwl 0x10(%rsp),%eax 0xffffffff81b87f65 <+629>: movq $0x0,0xa8(%rsp) 0xffffffff81b87f71 <+641>: movq $0x0,0xb0(%rsp) 0xffffffff81b87f7d <+653>: mov %ax,0x9c(%rsp) 0xffffffff81b87f85 <+661>: movzwl 0x38(%rsp),%eax 0xffffffff81b87f8a <+666>: movq $0x0,0xb8(%rsp) 0xffffffff81b87f96 <+678>: mov %ax,0x9e(%rsp) 0xffffffff81b87f9e <+686>: mov 0x8(%rsp),%eax 0xffffffff81b87fa2 <+690>: movq $0x0,0xc0(%rsp) 0xffffffff81b87fae <+702>: movl $0x110002,0x98(%rsp) 0xffffffff81b87fb9 <+713>: mov %eax,0xa0(%rsp) 0xffffffff81b87fc0 <+720>: mov %r13d,0xa4(%rsp) 1: https://lore.kernel.org/bpf/CAADnVQKE6y9h2fwX6OS837v-Uf+aBXnT_JXiN_bbo2gitZQ3tA@mail.gmail.com/ Fixes: e9ddbb7707ff ("bpf: Introduce SK_LOOKUP program type with a dedicated attach point") Suggested-by: Alexei Starovoitov Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20200910110248.198326-1-lmb@cloudflare.com --- include/linux/filter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 995625950cc1..e962dd8117d8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1287,6 +1287,8 @@ int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); struct bpf_sk_lookup_kern { u16 family; u16 protocol; + __be16 sport; + u16 dport; struct { __be32 saddr; __be32 daddr; @@ -1295,8 +1297,6 @@ struct bpf_sk_lookup_kern { const struct in6_addr *saddr; const struct in6_addr *daddr; } v6; - __be16 sport; - u16 dport; struct sock *selected_sk; bool no_reuseport; }; -- cgit v1.2.3 From b3003a74456f0c1f614a46c07e16abe33bfdd087 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 4 Sep 2020 19:41:57 -0300 Subject: RDMA/qedr: Remove fbo and zbva from the MR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit zbva is always false, so fbo is never read. A 'zero-based-virtual-address' is simply IOVA == 0, and the driver already supports this. 
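As an illustrative, hedged sketch only: with zbva and fbo gone, a caller expresses a zero-based MR simply by registering with an IOVA of 0; mr_iova and mr_len are placeholder variables.

struct qed_rdma_register_tid_in_params params = { };

params.vaddr  = mr_iova;	/* 0 here means a zero-based MR */
params.length = mr_len;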
Link: https://lore.kernel.org/r/16-v2-270386b7e60b+28f4-umem_1_jgg@nvidia.com Acked-by: Michal Kalderon  Signed-off-by: Jason Gunthorpe --- include/linux/qed/qed_rdma_if.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index f464d85e88a4..aeb242cefebf 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -242,10 +242,8 @@ struct qed_rdma_register_tid_in_params { bool pbl_two_level; u8 pbl_page_size_log; u8 page_size_log; - u32 fbo; u64 length; u64 vaddr; - bool zbva; bool phy_mr; bool dma_mr; -- cgit v1.2.3 From 97fb3e33474897c35b7cfe6ff0e16a3e172d9249 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Wed, 2 Sep 2020 19:57:38 +0300 Subject: qede: Notify qedr when mtu has changed MTU change on ethtool is currently not supported for iWARP. Notify qedr driver so that appropriate logging can take place. Link: https://lore.kernel.org/r/20200902165741.8355-6-michal.kalderon@marvell.com Signed-off-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- include/linux/qed/qede_rdma.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/qed/qede_rdma.h b/include/linux/qed/qede_rdma.h index 072da2f6da37..0d5564a59a59 100644 --- a/include/linux/qed/qede_rdma.h +++ b/include/linux/qed/qede_rdma.h @@ -20,7 +20,8 @@ enum qede_rdma_event { QEDE_UP, QEDE_DOWN, QEDE_CHANGE_ADDR, - QEDE_CLOSE + QEDE_CLOSE, + QEDE_CHANGE_MTU, }; struct qede_rdma_event_work { @@ -54,6 +55,7 @@ void qede_rdma_dev_event_open(struct qede_dev *dev); void qede_rdma_dev_event_close(struct qede_dev *dev); void qede_rdma_dev_remove(struct qede_dev *dev, bool recovery); void qede_rdma_event_changeaddr(struct qede_dev *edr); +void qede_rdma_event_change_mtu(struct qede_dev *edev); #else static inline int qede_rdma_dev_add(struct qede_dev *dev, -- cgit v1.2.3 From 568a36a69bad4f2efcfa4f94c83aa150a463735c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 10 Sep 2020 19:48:54 +0300 Subject: net: dsa: tag_8021q: include missing refcount.h The previous assumption was that the caller would already have this header file included. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 311aa04e7520..804750122c66 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -5,6 +5,7 @@ #ifndef _NET_DSA_8021Q_H #define _NET_DSA_8021Q_H +#include #include struct dsa_switch; -- cgit v1.2.3 From 7e092af2f3b33694b9117ffd978d42b04ec4f260 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 10 Sep 2020 19:48:55 +0300 Subject: net: dsa: tag_8021q: setup tagging via a single function call There is no point in calling dsa_port_setup_8021q_tagging for each individual port. Additionally, it will become more difficult to do that when we'll have a context structure to tag_8021q (next patch). So refactor this now. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/dsa/8021q.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 804750122c66..8586d8cdf956 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -25,8 +25,7 @@ struct dsa_8021q_crosschip_link { #if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) -int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, - bool enabled); +int dsa_8021q_setup(struct dsa_switch *ds, bool enabled); int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, @@ -57,8 +56,7 @@ bool vid_is_dsa_8021q(u16 vid); #else -int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, - bool enabled) +int dsa_8021q_setup(struct dsa_switch *ds, bool enabled) { return 0; } -- cgit v1.2.3 From 5899ee367ab3fec885aa04d9a2b573bf2e464e7f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 10 Sep 2020 19:48:56 +0300 Subject: net: dsa: tag_8021q: add a context structure While working on another tag_8021q driver implementation, some things became apparent: - It is not mandatory for a DSA driver to offload the tag_8021q VLANs by using the VLAN table per se. For example, it can add custom TCAM rules that simply encapsulate RX traffic, and redirect & decapsulate rules for TX traffic. For such a driver, it makes no sense to receive the tag_8021q configuration through the same callback as it receives the VLAN configuration from the bridge and the 8021q modules. - Currently, sja1105 (the only tag_8021q user) sets a priv->expect_dsa_8021q variable to distinguish between the bridge calling, and tag_8021q calling. That can be improved, to say the least. - The crosschip bridging operations are, in fact, stateful already. The list of crosschip_links must be kept by the caller and passed to the relevant tag_8021q functions. So it would be nice if the tag_8021q configuration was more self-contained. This patch attempts to do that. Create a struct dsa_8021q_context which encapsulates a struct dsa_switch, and has 2 function pointers for adding and deleting a VLAN. These will replace the previous channel to the driver, which was through the .port_vlan_add and .port_vlan_del callbacks of dsa_switch_ops. Also put the list of crosschip_links into this dsa_8021q_context. Drivers that don't support cross-chip bridging can simply omit to initialize this list, as long as they dont call any cross-chip function. The sja1105_vlan_add and sja1105_vlan_del functions are refactored into a smaller sja1105_vlan_add_one, which now has 2 entry points: - sja1105_vlan_add, from struct dsa_switch_ops - sja1105_dsa_8021q_vlan_add, from the tag_8021q ops But even this change is fairly trivial. It just reflects the fact that for sja1105, the VLANs from these 2 channels end up in the same hardware table. However that is not necessarily true in the general sense (and that's the reason for making this change). The rest of the patch is mostly plain refactoring of "ds" -> "ctx". The dsa_8021q_context structure needs to be propagated because adding a VLAN is now done through the ops function pointers inside of it. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/dsa/8021q.h | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 8586d8cdf956..2b003ae9fb38 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -12,30 +12,40 @@ struct dsa_switch; struct sk_buff; struct net_device; struct packet_type; +struct dsa_8021q_context; struct dsa_8021q_crosschip_link { struct list_head list; int port; - struct dsa_switch *other_ds; + struct dsa_8021q_context *other_ctx; int other_port; refcount_t refcount; }; +struct dsa_8021q_ops { + int (*vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); + int (*vlan_del)(struct dsa_switch *ds, int port, u16 vid); +}; + +struct dsa_8021q_context { + const struct dsa_8021q_ops *ops; + struct dsa_switch *ds; + struct list_head crosschip_links; +}; + #define DSA_8021Q_N_SUBVLAN 8 #if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) -int dsa_8021q_setup(struct dsa_switch *ds, bool enabled); +int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled); -int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, - struct list_head *crosschip_links); +int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port, + struct dsa_8021q_context *other_ctx, + int other_port); -int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, - struct list_head *crosschip_links); +int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port, + struct dsa_8021q_context *other_ctx, + int other_port); struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); @@ -56,23 +66,21 @@ bool vid_is_dsa_8021q(u16 vid); #else -int dsa_8021q_setup(struct dsa_switch *ds, bool enabled) +int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled) { return 0; } -int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, - struct list_head *crosschip_links) +int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port, + struct dsa_8021q_context *other_ctx, + int other_port) { return 0; } -int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, - struct list_head *crosschip_links) +int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port, + struct dsa_8021q_context *other_ctx, + int other_port) { return 0; } -- cgit v1.2.3 From 1623ad8ec04c771a54975fb84b22bc21c2dbcac1 Mon Sep 17 00:00:00 2001 From: Divya Koppera Date: Fri, 11 Sep 2020 18:48:44 +0530 Subject: net: phy: mchp: Add support for LAN8814 QUAD PHY LAN8814 is a low-power, quad-port triple-speed (10BASE-T/100BASETX/1000BASE-T) Ethernet physical layer transceiver (PHY). It supports transmission and reception of data on standard CAT-5, as well as CAT-5e and CAT-6, unshielded twisted pair (UTP) cables. LAN8814 supports industry-standard QSGMII (Quad Serial Gigabit Media Independent Interface) and Q-USGMII (Quad Universal Serial Gigabit Media Independent Interface) providing chip-to-chip connection to four Gigabit Ethernet MACs using a single serialized link (differential pair) in each direction. 
The LAN8814 SKU supports high-accuracy timestamping functions to support IEEE-1588 solutions using Microchip Ethernet switches, as well as customer solutions based on SoCs and FPGAs. The LAN8804 SKU has same features as that of LAN8814 SKU except that it does not support 1588, SyncE, or Q-USGMII with PCH/MCH. This adds support for 10BASE-T, 100BASE-TX, and 1000BASE-T, QSGMII link with the MAC. Signed-off-by: Divya Koppera Signed-off-by: David S. Miller --- include/linux/micrel_phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 75f880c25bb8..416ee6dd2574 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -27,6 +27,7 @@ #define PHY_ID_KSZ8061 0x00221570 #define PHY_ID_KSZ9031 0x00221620 #define PHY_ID_KSZ9131 0x00221640 +#define PHY_ID_LAN8814 0x00221660 #define PHY_ID_KSZ886X 0x00221430 #define PHY_ID_KSZ8863 0x00221435 -- cgit v1.2.3 From dc1129564a0147feb459159fd220ae22357e2eb6 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Fri, 11 Sep 2020 21:43:34 -0700 Subject: soc: ti: pruss: Add a platform driver for PRUSS in TI SoCs The Programmable Real-Time Unit - Industrial Communication Subsystem (PRU-ICSS) is present on various TI SoCs such as AM335x or AM437x or the Keystone 66AK2G. Each SoC can have one or more PRUSS instances that may or may not be identical. For example, AM335x SoCs have a single PRUSS, while AM437x has two PRUSS instances PRUSS1 and PRUSS0, with the PRUSS0 being a cut-down version of the PRUSS1. The PRUSS consists of dual 32-bit RISC cores called the Programmable Real-Time Units (PRUs), some shared, data and instruction memories, some internal peripheral modules, and an interrupt controller. The programmable nature of the PRUs provide flexibility to implement custom peripheral interfaces, fast real-time responses, or specialized data handling. The PRU-ICSS functionality is achieved through three different platform drivers addressing a specific portion of the PRUSS. Some sub-modules of the PRU-ICSS IP reuse some of the existing drivers (like davinci mdio driver or the generic syscon driver). This design provides flexibility in representing the different modules of PRUSS accordingly, and at the same time allowing the PRUSS driver to add some instance specific configuration within an SoC. The PRUSS platform driver deals with the overall PRUSS and is used for managing the subsystem level resources like various memories and the CFG module. It is responsible for the creation and deletion of the platform devices for the child PRU devices and other child devices (like Interrupt Controller, MDIO node and some syscon nodes) so that they can be managed by specific platform drivers. The PRUSS interrupt controller is managed by an irqchip driver, while the individual PRU RISC cores are managed by a PRU remoteproc driver. The driver currently supports the AM335x SoC, and support for other TI SoCs will be added in subsequent patches. Signed-off-by: Suman Anna Signed-off-by: Andrew F. 
Davis Signed-off-by: Tero Kristo Signed-off-by: Grzegorz Jaszczyk Signed-off-by: Santosh Shilimkar --- include/linux/pruss_driver.h | 48 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 include/linux/pruss_driver.h (limited to 'include/linux') diff --git a/include/linux/pruss_driver.h b/include/linux/pruss_driver.h new file mode 100644 index 000000000000..0701fe1953dd --- /dev/null +++ b/include/linux/pruss_driver.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * PRU-ICSS sub-system specific definitions + * + * Copyright (C) 2014-2020 Texas Instruments Incorporated - http://www.ti.com/ + * Suman Anna + */ + +#ifndef _PRUSS_DRIVER_H_ +#define _PRUSS_DRIVER_H_ + +#include + +/* + * enum pruss_mem - PRUSS memory range identifiers + */ +enum pruss_mem { + PRUSS_MEM_DRAM0 = 0, + PRUSS_MEM_DRAM1, + PRUSS_MEM_SHRD_RAM2, + PRUSS_MEM_MAX, +}; + +/** + * struct pruss_mem_region - PRUSS memory region structure + * @va: kernel virtual address of the PRUSS memory region + * @pa: physical (bus) address of the PRUSS memory region + * @size: size of the PRUSS memory region + */ +struct pruss_mem_region { + void __iomem *va; + phys_addr_t pa; + size_t size; +}; + +/** + * struct pruss - PRUSS parent structure + * @dev: pruss device pointer + * @cfg_regmap: regmap for config region + * @mem_regions: data for each of the PRUSS memory regions + */ +struct pruss { + struct device *dev; + struct regmap *cfg_regmap; + struct pruss_mem_region mem_regions[PRUSS_MEM_MAX]; +}; + +#endif /* _PRUSS_DRIVER_H_ */ -- cgit v1.2.3 From ba59c9b43c86b2c2396acac94e41d946cbaec9fe Mon Sep 17 00:00:00 2001 From: Grzegorz Jaszczyk Date: Fri, 11 Sep 2020 21:47:10 -0700 Subject: soc: ti: pruss: support CORECLK_MUX and IEPCLK_MUX The IEPCLK_MUX is present on all SoCs whereas the CORECLK_MUX is present only on AM65x SoCs and J721E. Add support for both these CLK muxes. This allows the clock rates and clock parents for these to be controlled through DT leveraging the clk infrastructure for configuring the default parents and rates. Signed-off-by: Roger Quadros Signed-off-by: Suman Anna Signed-off-by: Grzegorz Jaszczyk Signed-off-by: Santosh Shilimkar --- include/linux/pruss_driver.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pruss_driver.h b/include/linux/pruss_driver.h index 0701fe1953dd..ecfded30ed05 100644 --- a/include/linux/pruss_driver.h +++ b/include/linux/pruss_driver.h @@ -36,13 +36,19 @@ struct pruss_mem_region { /** * struct pruss - PRUSS parent structure * @dev: pruss device pointer + * @cfg_base: base iomap for CFG region * @cfg_regmap: regmap for config region * @mem_regions: data for each of the PRUSS memory regions + * @core_clk_mux: clk handle for PRUSS CORE_CLK_MUX + * @iep_clk_mux: clk handle for PRUSS IEP_CLK_MUX */ struct pruss { struct device *dev; + void __iomem *cfg_base; struct regmap *cfg_regmap; struct pruss_mem_region mem_regions[PRUSS_MEM_MAX]; + struct clk *core_clk_mux; + struct clk *iep_clk_mux; }; #endif /* _PRUSS_DRIVER_H_ */ -- cgit v1.2.3 From 66d90f6ecee755e9c19a119c9255e80091165498 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 7 Sep 2020 12:09:23 +0100 Subject: firmware: arm_scmi: Enable building as a single module Now, with all the plumbing in place to enable building scmi as a module instead of built-in modules, let us enable the same. 
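The key detail is the IS_REACHABLE() guard in the header (see the diff just below): with CONFIG_ARM_SCMI_PROTOCOL=m, a built-in caller cannot link against the real scmi_driver_register(), so it has to see the inline stub instead. A minimal sketch of the resulting pattern (the stub body here is illustrative, not copied from the header):

#if IS_REACHABLE(CONFIG_ARM_SCMI_PROTOCOL)
int scmi_driver_register(struct scmi_driver *driver, struct module *owner,
			 const char *mod_name);
#else
static inline int scmi_driver_register(struct scmi_driver *driver,
					struct module *owner,
					const char *mod_name)
{
	return -EINVAL;	/* illustrative error value */
}
#endif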
Link: https://lore.kernel.org/r/20200907195046.56615-5-sudeep.holla@arm.com Tested-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 4b10093ad671..9cd312a1ff92 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -345,7 +345,7 @@ struct scmi_driver { #define to_scmi_driver(d) container_of(d, struct scmi_driver, driver) -#ifdef CONFIG_ARM_SCMI_PROTOCOL +#if IS_REACHABLE(CONFIG_ARM_SCMI_PROTOCOL) int scmi_driver_register(struct scmi_driver *driver, struct module *owner, const char *mod_name); void scmi_driver_unregister(struct scmi_driver *driver); -- cgit v1.2.3 From 1fa5cef283420b3dad93cd6ab04d7125bc1562de Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 18 Aug 2020 15:07:57 +0800 Subject: i40e: optimise prefetch page refcount Originally the refcount of the rx_buffer page was incremented here, so prefetchw was needed; but since commit 1793668c3b8c ("i40e/i40evf: Update code to better handle incrementing page count") the refcount is not incremented every time, so change prefetchw to prefetch. The prefetch now mainly serves page_address(), which accesses struct page only when WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL is defined; otherwise it returns an address computed from the offset, so prefetch the page conditionally. Jakub suggested defining prefetch_page_address in a common header. Reported-by: kernel test robot Suggested-by: Jakub Kicinski Signed-off-by: Li RongQing Reviewed-by: Jesse Brandeburg Tested-by: Aaron Brown Signed-off-by: Tony Nguyen --- include/linux/prefetch.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h index 13eafebf3549..b83a3f944f28 100644 --- a/include/linux/prefetch.h +++ b/include/linux/prefetch.h @@ -15,6 +15,7 @@ #include #include +struct page; /* prefetch(x) attempts to pre-emptively get the memory pointed to by address "x" into the CPU L1 cache. @@ -62,4 +63,11 @@ static inline void prefetch_range(void *addr, size_t len) #endif } +static inline void prefetch_page_address(struct page *page) +{ +#if defined(WANT_PAGE_VIRTUAL) || defined(HASHED_PAGE_VIRTUAL) + prefetch(page); +#endif +} + #endif -- cgit v1.2.3 From cf6501689012ef346cc835a38cbebb1e398d0834 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:39 +0100 Subject: smccc: Define vendor hyp owned service call region Vendor specific hypervisor services have their own region of function identifiers reserved by SMCCC. Extend the list of owners to include this case.
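As a rough illustration of the new owner range (the function-ID macro, the function number 0x0001 and the calling function below are made up for this example), a vendor-specific hypervisor call can be built with the existing helpers:

#include <linux/arm-smccc.h>

/* Fast SMC64 call in the vendor-specific hypervisor service range. */
#define EXAMPLE_VENDOR_HYP_FUNC_ID					\
	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64,	\
			   ARM_SMCCC_OWNER_VENDOR_HYP, 0x0001)

static void example_vendor_hyp_call(void)
{
	struct arm_smccc_res res;

	arm_smccc_1_1_hvc(EXAMPLE_VENDOR_HYP_FUNC_ID, &res);
}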
Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Cc: Sudeep Holla Link: https://lore.kernel.org/r/20200915104643.2543892-16-ascull@google.com --- include/linux/arm-smccc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 15c706fb0a37..ee286f5de239 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -49,6 +49,7 @@ #define ARM_SMCCC_OWNER_OEM 3 #define ARM_SMCCC_OWNER_STANDARD 4 #define ARM_SMCCC_OWNER_STANDARD_HYP 5 +#define ARM_SMCCC_OWNER_VENDOR_HYP 6 #define ARM_SMCCC_OWNER_TRUSTED_APP 48 #define ARM_SMCCC_OWNER_TRUSTED_APP_END 49 #define ARM_SMCCC_OWNER_TRUSTED_OS 50 -- cgit v1.2.3 From 0794a974d74dc777a212a3c58dd236f507360348 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:40 +0100 Subject: smccc: Use separate variables for args and results Using the same register-bound variable for both arguments and results means these values share a type. That type must allow the arguments to be assigned to it and must also be assignable to the unsigned long fields of struct arm_smccc_res. This restriction on types causes compiler warnings when the argument cannot be implicitly assigned to an unsigned long, for example the pointers that are used in the KVM hyp interface. By separating the arguments and results into their own variables, the type constraint is lifted allowing the arguments to avoid the need for any type conversion. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Cc: Sudeep Holla Link: https://lore.kernel.org/r/20200915104643.2543892-17-ascull@google.com --- include/linux/arm-smccc.h | 73 +++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index ee286f5de239..885c9ffc835c 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -228,87 +228,67 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, #define __count_args(...) 
\ ___count_args(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1, 0) -#define __constraint_write_0 \ - "+r" (r0), "=&r" (r1), "=&r" (r2), "=&r" (r3) -#define __constraint_write_1 \ - "+r" (r0), "+r" (r1), "=&r" (r2), "=&r" (r3) -#define __constraint_write_2 \ - "+r" (r0), "+r" (r1), "+r" (r2), "=&r" (r3) -#define __constraint_write_3 \ - "+r" (r0), "+r" (r1), "+r" (r2), "+r" (r3) -#define __constraint_write_4 __constraint_write_3 -#define __constraint_write_5 __constraint_write_4 -#define __constraint_write_6 __constraint_write_5 -#define __constraint_write_7 __constraint_write_6 - -#define __constraint_read_0 -#define __constraint_read_1 -#define __constraint_read_2 -#define __constraint_read_3 -#define __constraint_read_4 "r" (r4) -#define __constraint_read_5 __constraint_read_4, "r" (r5) -#define __constraint_read_6 __constraint_read_5, "r" (r6) -#define __constraint_read_7 __constraint_read_6, "r" (r7) +#define __constraint_read_0 "r" (arg0) +#define __constraint_read_1 __constraint_read_0, "r" (arg1) +#define __constraint_read_2 __constraint_read_1, "r" (arg2) +#define __constraint_read_3 __constraint_read_2, "r" (arg3) +#define __constraint_read_4 __constraint_read_3, "r" (arg4) +#define __constraint_read_5 __constraint_read_4, "r" (arg5) +#define __constraint_read_6 __constraint_read_5, "r" (arg6) +#define __constraint_read_7 __constraint_read_6, "r" (arg7) #define __declare_arg_0(a0, res) \ struct arm_smccc_res *___res = res; \ - register unsigned long r0 asm("r0") = (u32)a0; \ - register unsigned long r1 asm("r1"); \ - register unsigned long r2 asm("r2"); \ - register unsigned long r3 asm("r3") + register unsigned long arg0 asm("r0") = (u32)a0 #define __declare_arg_1(a0, a1, res) \ typeof(a1) __a1 = a1; \ struct arm_smccc_res *___res = res; \ - register unsigned long r0 asm("r0") = (u32)a0; \ - register unsigned long r1 asm("r1") = __a1; \ - register unsigned long r2 asm("r2"); \ - register unsigned long r3 asm("r3") + register unsigned long arg0 asm("r0") = (u32)a0; \ + register typeof(a1) arg1 asm("r1") = __a1 #define __declare_arg_2(a0, a1, a2, res) \ typeof(a1) __a1 = a1; \ typeof(a2) __a2 = a2; \ struct arm_smccc_res *___res = res; \ - register unsigned long r0 asm("r0") = (u32)a0; \ - register unsigned long r1 asm("r1") = __a1; \ - register unsigned long r2 asm("r2") = __a2; \ - register unsigned long r3 asm("r3") + register unsigned long arg0 asm("r0") = (u32)a0; \ + register typeof(a1) arg1 asm("r1") = __a1; \ + register typeof(a2) arg2 asm("r2") = __a2 #define __declare_arg_3(a0, a1, a2, a3, res) \ typeof(a1) __a1 = a1; \ typeof(a2) __a2 = a2; \ typeof(a3) __a3 = a3; \ struct arm_smccc_res *___res = res; \ - register unsigned long r0 asm("r0") = (u32)a0; \ - register unsigned long r1 asm("r1") = __a1; \ - register unsigned long r2 asm("r2") = __a2; \ - register unsigned long r3 asm("r3") = __a3 + register unsigned long arg0 asm("r0") = (u32)a0; \ + register typeof(a1) arg1 asm("r1") = __a1; \ + register typeof(a2) arg2 asm("r2") = __a2; \ + register typeof(a3) arg3 asm("r3") = __a3 #define __declare_arg_4(a0, a1, a2, a3, a4, res) \ typeof(a4) __a4 = a4; \ __declare_arg_3(a0, a1, a2, a3, res); \ - register unsigned long r4 asm("r4") = __a4 + register typeof(a4) arg4 asm("r4") = __a4 #define __declare_arg_5(a0, a1, a2, a3, a4, a5, res) \ typeof(a5) __a5 = a5; \ __declare_arg_4(a0, a1, a2, a3, a4, res); \ - register unsigned long r5 asm("r5") = __a5 + register typeof(a5) arg5 asm("r5") = __a5 #define __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res) \ typeof(a6) __a6 = a6; \ 
__declare_arg_5(a0, a1, a2, a3, a4, a5, res); \ - register unsigned long r6 asm("r6") = __a6 + register typeof(a6) arg6 asm("r6") = __a6 #define __declare_arg_7(a0, a1, a2, a3, a4, a5, a6, a7, res) \ typeof(a7) __a7 = a7; \ __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res); \ - register unsigned long r7 asm("r7") = __a7 + register typeof(a7) arg7 asm("r7") = __a7 #define ___declare_args(count, ...) __declare_arg_ ## count(__VA_ARGS__) #define __declare_args(count, ...) ___declare_args(count, __VA_ARGS__) #define ___constraints(count) \ - : __constraint_write_ ## count \ : __constraint_read_ ## count \ : "memory" #define __constraints(count) ___constraints(count) @@ -320,8 +300,13 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, */ #define __arm_smccc_1_1(inst, ...) \ do { \ + register unsigned long r0 asm("r0"); \ + register unsigned long r1 asm("r1"); \ + register unsigned long r2 asm("r2"); \ + register unsigned long r3 asm("r3"); \ __declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \ - asm volatile(inst "\n" \ + asm volatile(inst "\n" : \ + "=r" (r0), "=r" (r1), "=r" (r2), "=r" (r3) \ __constraints(__count_args(__VA_ARGS__))); \ if (___res) \ *___res = (typeof(*___res)){r0, r1, r2, r3}; \ @@ -367,7 +352,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, #define __fail_smccc_1_1(...) \ do { \ __declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \ - asm ("" __constraints(__count_args(__VA_ARGS__))); \ + asm ("" : __constraints(__count_args(__VA_ARGS__))); \ if (___res) \ ___res->a0 = SMCCC_RET_NOT_SUPPORTED; \ } while (0) -- cgit v1.2.3 From fb609b5112bd74b4ba93c86d7af4089ffd9432c2 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 13 May 2020 11:06:47 +0300 Subject: net/mlx5: Always use container_of to find mdev pointer from clock struct Clock struct is part of struct mlx5_core_dev. Code was inconsistent, on some cases used container_of and on another used clock->mdev. Align code to use container_of amd remove clock->mdev pointer. While here, fix reverse xmas tree coding style. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c145de0473bc..8dc3da6e6480 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -643,7 +643,6 @@ struct mlx5_pps { }; struct mlx5_clock { - struct mlx5_core_dev *mdev; struct mlx5_nb pps_nb; seqlock_t lock; struct cyclecounter cycles; -- cgit v1.2.3 From b7cf0806e8783e38f881cae3c56f0869e70b8da2 Mon Sep 17 00:00:00 2001 From: Ofer Levi Date: Sun, 17 May 2020 10:16:49 +0300 Subject: net/mlx5e: Add CQE compression support for multi-strides packets Add CQE compression support for completions of packets that span multiple strides in a Striding RQ, per the HW capability. In our memory model, we use small strides (256B as of today) for the non-linear SKB mode. This feature allows CQE compression to work also for multiple strides packets. In this case decompressing the mini CQE array will use stride index provided by HW as part of the mini CQE. Before this feature, compression was possible only for single-strided packets, i.e. for packets of size up to 256 bytes when in non-linear mode, and the index was maintained by SW. This feature is supported for ConnectX-5 and above. 
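As an illustration only (the helper below is hypothetical and not part of this patch), a receive path could pick the stride index according to the compressed CQE format added further down:

static u16 mini_cqe_stride_index(const struct mlx5_mini_cqe8 *mini,
				 u8 cqe_format, u16 sw_stride_index)
{
	if (cqe_format == MLX5_CQE_FORMAT_CSUM_STRIDX)
		return be16_to_cpu(mini->stridx);	/* index provided by HW */

	return sw_stride_index;	/* single-stride case, index maintained by SW */
}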
Feature performance test: This was whitebox-tested, we reduced the PCI speed from 125Gb/s to 62.5Gb/s to overload pci and manipulated mlx5 driver to drop incoming packets before building the SKB to achieve low cpu utilization. Outcome is low cpu utilization and bottleneck on pci only. Test setup: Server: Intel(R) Xeon(R) Silver 4108 CPU @ 1.80GHz server, 32 cores NIC: ConnectX-6 DX. Sender side generates 300 byte packets at full pci bandwidth. Receiver side configuration: Single channel, one cpu processing with one ring allocated. Cpu utilization is ~20% while pci bandwidth is fully utilized. For the generated traffic and interface MTU of 4500B (to activate the non-linear SKB mode), packet rate improvement is about 19% from ~17.6Mpps to ~21Mpps. Without this feature, counters show no CQE compression blocks for this setup, while with the feature, counters show ~20.7Mpps compressed CQEs in ~500K compression blocks. Signed-off-by: Ofer Levi Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 4d3376e20f5e..81ca5989009b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -816,7 +816,7 @@ struct mlx5_mini_cqe8 { __be32 rx_hash_result; struct { __be16 checksum; - __be16 rsvd; + __be16 stridx; }; struct { __be16 wqe_counter; @@ -836,6 +836,7 @@ enum { enum { MLX5_CQE_FORMAT_CSUM = 0x1, + MLX5_CQE_FORMAT_CSUM_STRIDX = 0x3, }; #define MLX5_MINI_CQE_ARRAY_SIZE 8 -- cgit v1.2.3 From 9a27a33027f22a716ce362be48d70ae0eb012ab7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Sep 2020 17:11:52 -0700 Subject: ethtool: add standard pause stats Currently drivers have to report their pause frames statistics via ethtool -S, and there is a wide variety of names used for these statistics. Add the two statistics defined in IEEE 802.3x to the standard API. Create a new ethtool request header flag for including statistics in the response to GET commands. Always create the ETHTOOL_A_PAUSE_STATS nest in replies when flag is set. Testing if driver declares the op is not a reliable way of checking if any stats will actually be included and therefore we don't want to give the impression that presence of ETHTOOL_A_PAUSE_STATS indicates driver support. Note that this patch does not include PFC counters, which may fit better in dcbnl? But mostly I don't need them/have a setup to test them so I haven't looked deeply into exposing them :) v3: - add a helper for "uninitializing" stats, rather than a cryptic memset() (Andrew) Signed-off-by: Jakub Kicinski Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/ethtool.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 969a80211df6..060b20f0b20f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -241,6 +241,27 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, ETHTOOL_COALESCE_PKT_RATE_LOW | ETHTOOL_COALESCE_PKT_RATE_HIGH | \ ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL) +#define ETHTOOL_STAT_NOT_SET (~0ULL) + +/** + * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames + * @tx_pause_frames: transmitted pause frame count. Reported to user space + * as %ETHTOOL_A_PAUSE_STAT_TX_FRAMES. + * + * Equivalent to `30.3.4.2 aPAUSEMACCtrlFramesTransmitted` + * from the standard. 
+ * + * @rx_pause_frames: received pause frame count. Reported to user space + * as %ETHTOOL_A_PAUSE_STAT_RX_FRAMES. Equivalent to: + * + * Equivalent to `30.3.4.3 aPAUSEMACCtrlFramesReceived` + * from the standard. + */ +struct ethtool_pause_stats { + u64 tx_pause_frames; + u64 rx_pause_frames; +}; + /** * struct ethtool_ops - optional netdev operations * @supported_coalesce_params: supported types of interrupt coalescing. @@ -282,6 +303,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * Returns a negative error code or zero. * @get_ringparam: Report ring sizes * @set_ringparam: Set ring sizes. Returns a negative error code or zero. + * @get_pause_stats: Report pause frame statistics. Drivers must not zero + * statistics which they don't report. The stats structure is initialized + * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. * @get_pauseparam: Report pause parameters * @set_pauseparam: Set pause parameters. Returns a negative error code * or zero. @@ -418,6 +442,8 @@ struct ethtool_ops { struct ethtool_ringparam *); int (*set_ringparam)(struct net_device *, struct ethtool_ringparam *); + void (*get_pause_stats)(struct net_device *dev, + struct ethtool_pause_stats *pause_stats); void (*get_pauseparam)(struct net_device *, struct ethtool_pauseparam*); int (*set_pauseparam)(struct net_device *, -- cgit v1.2.3 From 984fe94f94756dacb3c8cc52904a23adf9e04da1 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:39 -0700 Subject: bpf: Mutex protect used_maps array and count To support modifying the used_maps array, we use a mutex to protect the use of the counter and the array. The mutex is initialized right after the prog aux is allocated, and destroyed right before prog aux is freed. This way we guarantee it's initialized for both cBPF and eBPF. Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-2-sdf@google.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c6d9f2c444f4..5dcce0364634 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -751,6 +751,7 @@ struct bpf_prog_aux { struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; + struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ -- cgit v1.2.3 From 3babbe447d76ac2919ec4d0eb3b0adfb22f5b03c Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 7 Aug 2020 13:15:16 +0530 Subject: sched/topology: Allow archs to override cpu_smt_mask cpu_smt_mask tracks topology_sibling_cpumask. This would be good for most architectures. One of the users of cpu_smt_mask(), would be to identify idle-cores. On Power9, a pair of SMT4 cores can be presented by the firmware as a SMT8 core for backward compatibility reasons. powerpc allows LPARs to be live migrated from Power8 to Power9. Do note Power8 had only SMT8 cores. Existing software which has been developed/configured for Power8 would expect to see SMT8 core. Maintaining the illusion of SMT8 core is a requirement to make that work. In order to maintain above userspace backward compatibility with previous versions of processor, Power9 onwards there is option to the firmware to advertise a pair of SMT4 cores as a fused cores aka SMT8 core. 
On Power9 this pair shares the L2 cache as well. However, from the scheduler's point of view, a core should be determined by SMT4, since it's a completely independent unit of compute. Hence allow the powerpc architecture to override the default cpu_smt_mask() to point to the SMT4 cores in SMT8 mode. This will ensure the scheduler is always given the right information. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200807074517.27957-1-srikar@linux.vnet.ibm.com --- include/linux/topology.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 608fa4aadf0e..ad03df1cc266 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -198,7 +198,7 @@ static inline int cpu_to_mem(int cpu) #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif -#ifdef CONFIG_SCHED_SMT +#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); -- cgit v1.2.3 From aabf59432c51be174994ecfe280f75ac139b5550 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczyński Date: Sun, 6 Sep 2020 22:39:49 +0000 Subject: fs: Remove duplicated flag O_NDELAY occurring twice in VALID_OPEN_FLAGS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The O_NDELAY flag occurs twice in the VALID_OPEN_FLAGS definition, this change removes the duplicate. There is no change to the functionality. Note that the flags O_NONBLOCK and O_NDELAY are not duplicates, as the values of these flags are platform dependent, and on platforms like Sparc O_NONBLOCK and O_NDELAY are not the same. This has been done that way to maintain ABI compatibility with Solaris since the Sparc port was first introduced. This change resolves the following Coccinelle warning: include/linux/fcntl.h:11:13-21: duplicated argument to & or | Signed-off-by: Krzysztof Wilczyński Signed-off-by: Al Viro --- include/linux/fcntl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index 7bcdcf4f6ab2..921e750843e6 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -8,7 +8,7 @@ /* List of all valid flags for the open/openat flags argument: */ #define VALID_OPEN_FLAGS \ (O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \ - O_APPEND | O_NDELAY | O_NONBLOCK | O_NDELAY | __O_SYNC | O_DSYNC | \ + O_APPEND | O_NDELAY | O_NONBLOCK | __O_SYNC | O_DSYNC | \ FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \ O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE) -- cgit v1.2.3 From ba3a86e47232ad9f76160929f33ac9c64e4d0567 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 14 Sep 2020 15:44:37 -0700 Subject: rcu-tasks: Fix grace-period/unlock race in RCU Tasks Trace The more intense grace-period processing resulting from the 50x RCU Tasks Trace grace-period speedups exposed the following race condition: o Task A running on CPU 0 executes rcu_read_lock_trace(), entering a read-side critical section. o When Task A eventually invokes rcu_read_unlock_trace() to exit its read-side critical section, this function notes that the ->trc_reader_special.s flag is zero and will therefore set ->trc_reader_nesting to zero using WRITE_ONCE(). But before that happens...
o The RCU Tasks Trace grace-period kthread running on some other CPU interrogates Task A, but this fails because this task is currently running. This kthread therefore sends an IPI to CPU 0. o CPU 0 receives the IPI, and thus invokes trc_read_check_handler(). Because Task A has not yet cleared its ->trc_reader_nesting counter, this function sees that Task A is still within its read-side critical section. This function therefore sets the ->trc_reader_nesting.b.need_qs flag, AKA the .need_qs flag. Except that Task A has already checked the .need_qs flag, which is part of the ->trc_reader_special.s flag. The .need_qs flag therefore remains set until Task A's next rcu_read_unlock_trace(). o Task A now invokes synchronize_rcu_tasks_trace(), which cannot start a new grace period until the current grace period completes. And thus cannot return until after that time. But Task A's .need_qs flag is still set, which prevents the current grace period from completing. And because Task A is blocked, it will never execute rcu_read_unlock_trace() until its call to synchronize_rcu_tasks_trace() returns. We are therefore deadlocked. This race is improbable, but 80 hours of rcutorture made it happen twice. The race was possible before the grace-period speedup, but roughly 50x less probable. Several thousand hours of rcutorture would have been necessary to have a reasonable chance of making this happen before this 50x speedup. This commit therefore eliminates this deadlock by setting ->trc_reader_nesting to a large negative number before checking the .need_qs and zeroing (or decrementing with respect to its initial value) ->trc_reader_nesting. For its part, the IPI handler's trc_read_check_handler() function adds a check for negative values, deferring evaluation of the task in this case. Taken together, these changes avoid this deadlock scenario. Fixes: 276c410448db ("rcu-tasks: Split ->trc_reader_need_end") Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Cc: # 5.7.x Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index d9015aac78c6..a6a6a3acab5a 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -50,6 +50,7 @@ static inline void rcu_read_lock_trace(void) struct task_struct *t = current; WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); + barrier(); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers @@ -72,6 +73,9 @@ static inline void rcu_read_unlock_trace(void) rcu_lock_release(&rcu_trace_lock_map); nesting = READ_ONCE(t->trc_reader_nesting) - 1; + barrier(); // Critical section before disabling. + // Disable IPI-based setting of .need_qs. + WRITE_ONCE(t->trc_reader_nesting, INT_MIN); if (likely(!READ_ONCE(t->trc_reader_special.s)) || nesting) { WRITE_ONCE(t->trc_reader_nesting, nesting); return; // We assume shallow reader nesting. -- cgit v1.2.3 From 639bf4415cadff4c18e13aa5cb0dba2d443e3aa7 Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Thu, 17 Sep 2020 12:02:21 +0300 Subject: net/mlx5: Refactor query port speed functions The functions mlx5_query_port_link_width_oper and mlx5_query_port_ib_proto_oper are always called together, so combine them to a new function called mlx5_query_port_oper to avoid duplication. 
And since mlx5i_get_port_settings is the same as mlx5_query_port_oper, remove it. According to the IB spec, link_width_oper and ib_proto_oper should be u16 and not u8 as currently written, so cast the values as preparation for a cross-RDMA patch that will fix the type for all drivers in the RDMA subsystem. Fixes: ada68c31ba9c ("net/mlx5: Introduce a new header file for physical port functions") Signed-off-by: Aharon Landau Reviewed-by: Michael Guralnik Signed-off-by: Leon Romanovsky --- include/linux/mlx5/port.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 2d45a6af52a4..4d33ae0c2d97 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -133,10 +133,9 @@ enum mlx5e_connector_type { int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps); int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, int ptys_size, int proto_mask, u8 local_port); -int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev, - u8 *link_width_oper, u8 local_port); -int mlx5_query_port_ib_proto_oper(struct mlx5_core_dev *dev, - u8 *proto_oper, u8 local_port); + +int mlx5_query_ib_port_oper(struct mlx5_core_dev *dev, u16 *link_width_oper, + u16 *proto_oper, u8 local_port); void mlx5_toggle_port_link(struct mlx5_core_dev *dev); int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, enum mlx5_port_status status); -- cgit v1.2.3 From e27014bdb47eb435f78573685f4196c07329f1f7 Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Thu, 17 Sep 2020 12:02:22 +0300 Subject: RDMA/mlx5: Delete duplicated mlx5_ptys_width enum Combine the two identical enums to avoid duplication. Signed-off-by: Aharon Landau Reviewed-by: Michael Guralnik Signed-off-by: Leon Romanovsky --- include/linux/mlx5/port.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 4d33ae0c2d97..23edd2db4803 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -125,6 +125,14 @@ enum mlx5e_connector_type { MLX5E_CONNECTOR_TYPE_NUMBER, }; +enum mlx5_ptys_width { + MLX5_PTYS_WIDTH_1X = 1 << 0, + MLX5_PTYS_WIDTH_2X = 1 << 1, + MLX5_PTYS_WIDTH_4X = 1 << 2, + MLX5_PTYS_WIDTH_8X = 1 << 3, + MLX5_PTYS_WIDTH_12X = 1 << 4, +}; + #define MLX5E_PROT_MASK(link_mode) (1 << link_mode) #define MLX5_GET_ETH_PROTO(reg, out, ext, field) \ (ext ? MLX5_GET(reg, out, ext_##field) : \ -- cgit v1.2.3 From 877c1a5f79c6984bbe3f2924234c08e2f4f1acd5 Mon Sep 17 00:00:00 2001 From: Tuan Phan Date: Thu, 6 Aug 2020 14:57:34 -0700 Subject: PCI/ACPI: Add Ampere Altra SOC MCFG quirk Ampere Altra SOC supports only 32-bit ECAM reads. Add an MCFG quirk for the platform.
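A sketch of what a 32-bit-read-only ECAM accessor looks like (illustrative; modeled on the generic read32 helper rather than copied from the actual pci_32b_read_ops implementation):

static int ecam_read32_only(struct pci_bus *bus, unsigned int devfn,
			    int where, int size, u32 *val)
{
	void __iomem *addr = bus->ops->map_bus(bus, devfn, where & ~0x3);

	if (!addr)
		return PCIBIOS_DEVICE_NOT_FOUND;

	*val = readl(addr);			/* always a full 32-bit read */
	if (size <= 2)				/* extract the requested bytes */
		*val = (*val >> (8 * (where & 3))) & ((1 << (size * 8)) - 1);

	return PCIBIOS_SUCCESSFUL;
}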
Link: https://lore.kernel.org/r/1596751055-12316-1-git-send-email-tuanphan@os.amperecomputing.com Signed-off-by: Tuan Phan Signed-off-by: Bjorn Helgaas --- include/linux/pci-ecam.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 1af5cb02ef7f..033ce74f02e8 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -51,6 +51,7 @@ extern const struct pci_ecam_ops pci_generic_ecam_ops; #if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) extern const struct pci_ecam_ops pci_32b_ops; /* 32-bit accesses only */ +extern const struct pci_ecam_ops pci_32b_read_ops; /* 32-bit read only */ extern const struct pci_ecam_ops hisi_pcie_ops; /* HiSilicon */ extern const struct pci_ecam_ops thunder_pem_ecam_ops; /* Cavium ThunderX 1.x & 2.x */ extern const struct pci_ecam_ops pci_thunder_ecam_ops; /* Cavium ThunderX 1.x */ -- cgit v1.2.3 From 9d8feb460adb90e89e64101ce2c1cfcd548a6d83 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Mon, 31 Aug 2020 10:01:48 +0300 Subject: RDMA/mlx5: Add sw_owner_v2 bit capability Added sw_owner_v2 which will be enabled for future devices, replacing sw_owner bit. Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index de1ffb4804d6..3061ceebbaf3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -420,7 +420,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_1a[0x2]; u8 ipsec_encrypt[0x1]; u8 ipsec_decrypt[0x1]; - u8 reserved_at_1e[0x2]; + u8 sw_owner_v2[0x1]; + u8 reserved_at_1f[0x1]; u8 termination_table_raw_traffic[0x1]; u8 reserved_at_21[0x1]; -- cgit v1.2.3 From a748c6975dea325da540610c2ba9b5f332c603e6 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:05 +0200 Subject: bpf: propagate poke descriptors to subprograms Previously, there was no need for poke descriptors being present in subprogram's bpf_prog_aux struct since tailcalls were simply not allowed in them. Each subprog is JITed independently so in order to enable JITing subprograms that use tailcalls, do the following: - in fixup_bpf_calls() store the index of tailcall insn onto the generated poke descriptor, - in case when insn patching occurs, adjust the tailcall insn idx from bpf_patch_insn_data, - then in jit_subprogs() check whether the given poke descriptor belongs to the current subprog by checking if that previously stored absolute index of tail call insn is in the scope of the insns of given subprog, - update the insn->imm with new poke descriptor slot so that while JITing the proper poke descriptor will be grabbed This way each of the main program's poke descriptors are distributed across the subprograms poke descriptor array, so main program's descriptors can be untracked out of the prog array map. Add also subprog's aux struct to the BPF map poke_progs list by calling on it map_poke_track(). In case of any error, call the map_poke_untrack() on subprog's aux structs that have already been registered to prog array map. 
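The distribution step can be pictured as follows (a simplified sketch, not the exact jit_subprogs() code; subprog_start and subprog_end stand for the subprogram's instruction bounds):

static void distribute_poke_descs(struct bpf_prog *prog,
				  u32 subprog_start, u32 subprog_end)
{
	u32 j;

	for (j = 0; j < prog->aux->size_poke_tab; j++) {
		struct bpf_jit_poke_descriptor *poke = &prog->aux->poke_tab[j];

		/* A descriptor belongs to this subprog only if the stored
		 * tail call insn index lies inside the subprog's insns. */
		if (poke->insn_idx < subprog_start ||
		    poke->insn_idx >= subprog_end)
			continue;

		/* here: copy the descriptor into the subprog's aux and
		 * register it with the prog array map via map_poke_track() */
	}
}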
Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5dcce0364634..a23e5eb728c8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -707,6 +707,7 @@ struct bpf_jit_poke_descriptor { bool ip_stable; u8 adj_off; u16 reason; + u32 insn_idx; }; /* reg_type info for ctx arguments */ -- cgit v1.2.3 From cf71b174d3464c7dc22f86f25d629a8d9d5c3519 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:06 +0200 Subject: bpf: rename poke descriptor's 'ip' member to 'tailcall_target' Reflect the actual purpose of poke->ip and rename it to poke->tailcall_target so that it will not be confused with another poke target that will be introduced in the next commit. While at it, do the same thing with poke->ip_stable - rename it to poke->tailcall_target_stable. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a23e5eb728c8..f3790c9cf542 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -697,14 +697,14 @@ enum bpf_jit_poke_reason { /* Descriptor of pokes pointing /into/ the JITed image. */ struct bpf_jit_poke_descriptor { - void *ip; + void *tailcall_target; union { struct { struct bpf_map *map; u32 key; } tail_call; }; - bool ip_stable; + bool tailcall_target_stable; u8 adj_off; u16 reason; u32 insn_idx; -- cgit v1.2.3 From 32b313ce9db54991a053da66883289e99d6ad820 Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 17 Sep 2020 23:30:27 +0200 Subject: PCI: endpoint: Use "NULL" instead of "0" as a NULL pointer When returning a NULL pointer, use "NULL" instead of "0".
Fixes sparse warning given by executing "make C=2 drivers/pci/": CHECK drivers/pci/endpoint/pci-epc-core.c drivers/pci/endpoint/pci-epc-core.c: note: in included file: ./include/linux/pci-ep-cfs.h:22:16: warning: Using plain integer as NULL pointer CHECK drivers/pci/endpoint/pci-epf-core.c drivers/pci/endpoint/pci-epf-core.c: note: in included file: ./include/linux/pci-ep-cfs.h:31:16: warning: Using plain integer as NULL pointer Link: https://lore.kernel.org/r/80895f7465719edb3aa259e907acc4bc3217945c.1600378209.git.gustavo.pimentel@synopsys.com Reported-by: Bjorn Helgaas Signed-off-by: Gustavo Pimentel Signed-off-by: Bjorn Helgaas Cc: Kishon Vijay Abraham I Cc: Joao Pinto --- include/linux/pci-ep-cfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-ep-cfs.h b/include/linux/pci-ep-cfs.h index f42b0fd4b4bc..662881335c7e 100644 --- a/include/linux/pci-ep-cfs.h +++ b/include/linux/pci-ep-cfs.h @@ -19,7 +19,7 @@ void pci_ep_cfs_remove_epf_group(struct config_group *group); #else static inline struct config_group *pci_ep_cfs_add_epc_group(const char *name) { - return 0; + return NULL; } static inline void pci_ep_cfs_remove_epc_group(struct config_group *group) @@ -28,7 +28,7 @@ static inline void pci_ep_cfs_remove_epc_group(struct config_group *group) static inline struct config_group *pci_ep_cfs_add_epf_group(const char *name) { - return 0; + return NULL; } static inline void pci_ep_cfs_remove_epf_group(struct config_group *group) -- cgit v1.2.3 From 2492c205d2bbbc01f5c9e49fffe4b2e633c33f38 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 17 Sep 2020 10:19:10 +0800 Subject: netdev: Remove unused functions There are no callers in the tree, so remove them. Signed-off-by: YueHaibing Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 157e0242e9ee..909b1fbb0481 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4677,16 +4677,6 @@ int netdev_class_create_file_ns(const struct class_attribute *class_attr, void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const void *ns); -static inline int netdev_class_create_file(const struct class_attribute *class_attr) -{ - return netdev_class_create_file_ns(class_attr, NULL); -} - -static inline void netdev_class_remove_file(const struct class_attribute *class_attr) -{ - netdev_class_remove_file_ns(class_attr, NULL); -} - extern const struct kobj_ns_type_operations net_ns_type_operations; const char *netdev_drivername(const struct net_device *dev); -- cgit v1.2.3 From 7f6e4312e15a5c370e84eaa685879b6bdcc717e4 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:07 +0200 Subject: bpf: Limit caller's stack depth 256 for subprogs with tailcalls Protect against potential stack overflow that might happen when bpf2bpf calls get combined with tailcalls. Limit the caller's stack depth for such a case to 256 so that the worst case scenario results in an 8k stack size (32, which is the tail call limit, * 256 = 8k).
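Conceptually, the added verifier check boils down to the following (a simplified sketch with placeholder variable names, not the exact check_max_stack_depth() code):

/* A chain containing a tail call may recurse up to the tail call limit
 * of 32, so cap the per-chain stack: 32 * 256 = 8k worst case. */
if (subprog_has_tail_call && caller_stack_depth >= 256)
	return -EACCES;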
Suggested-by: Alexei Starovoitov Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 53c7bd568c5d..5026b75db972 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -358,6 +358,7 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ + bool has_tail_call; }; /* single container for all structs -- cgit v1.2.3 From ebf7d1f508a73871acf3b2bfbfa1323a477acdb3 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:08 +0200 Subject: bpf, x64: rework pro/epilogue and tailcall handling in JIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit serves two things: 1) it optimizes BPF prologue/epilogue generation 2) it makes possible to have tailcalls within BPF subprogram Both points are related to each other since without 1), 2) could not be achieved. In [1], Alexei says: "The prologue will look like: nop5 xor eax,eax  // two new bytes if bpf_tail_call() is used in this // function push rbp mov rbp, rsp sub rsp, rounded_stack_depth push rax // zero init tail_call counter variable number of push rbx,r13,r14,r15 Then bpf_tail_call will pop variable number rbx,.. and final 'pop rax' Then 'add rsp, size_of_current_stack_frame' jmp to next function and skip over 'nop5; xor eax,eax; push rpb; mov rbp, rsp' This way new function will set its own stack size and will init tail call counter with whatever value the parent had. If next function doesn't use bpf_tail_call it won't have 'xor eax,eax'. Instead it would need to have 'nop2' in there." Implement that suggestion. Since the layout of stack is changed, tail call counter handling can not rely anymore on popping it to rbx just like it have been handled for constant prologue case and later overwrite of rbx with actual value of rbx pushed to stack. Therefore, let's use one of the register (%rcx) that is considered to be volatile/caller-saved and pop the value of tail call counter in there in the epilogue. Drop the BUILD_BUG_ON in emit_prologue and in emit_bpf_tail_call_indirect where instruction layout is not constant anymore. Introduce new poke target, 'tailcall_bypass' to poke descriptor that is dedicated for skipping the register pops and stack unwind that are generated right before the actual jump to target program. For case when the target program is not present, BPF program will skip the pop instructions and nop5 dedicated for jmpq $target. An example of such state when only R6 of callee saved registers is used by program: ffffffffc0513aa1: e9 0e 00 00 00 jmpq 0xffffffffc0513ab4 ffffffffc0513aa6: 5b pop %rbx ffffffffc0513aa7: 58 pop %rax ffffffffc0513aa8: 48 81 c4 00 00 00 00 add $0x0,%rsp ffffffffc0513aaf: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) ffffffffc0513ab4: 48 89 df mov %rbx,%rdi When target program is inserted, the jump that was there to skip pops/nop5 will become the nop5, so CPU will go over pops and do the actual tailcall. One might ask why there simply can not be pushes after the nop5? In the following example snippet: ffffffffc037030c: 48 89 fb mov %rdi,%rbx (...) 
ffffffffc0370332: 5b pop %rbx ffffffffc0370333: 58 pop %rax ffffffffc0370334: 48 81 c4 00 00 00 00 add $0x0,%rsp ffffffffc037033b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) ffffffffc0370340: 48 81 ec 00 00 00 00 sub $0x0,%rsp ffffffffc0370347: 50 push %rax ffffffffc0370348: 53 push %rbx ffffffffc0370349: 48 89 df mov %rbx,%rdi ffffffffc037034c: e8 f7 21 00 00 callq 0xffffffffc0372548 There is the bpf2bpf call (at ffffffffc037034c) right after the tailcall and jump target is not present. ctx is in %rbx register and BPF subprogram that we will call into on ffffffffc037034c is relying on it, e.g. it will pick ctx from there. Such code layout is therefore broken as we would overwrite the content of %rbx with the value that was pushed on the prologue. That is the reason for the 'bypass' approach. Special care needs to be taken during the install/update/remove of tailcall target. In case when target program is not present, the CPU must not execute the pop instructions that precede the tailcall. To address that, the following states can be defined: A nop, unwind, nop B nop, unwind, tail C skip, unwind, nop D skip, unwind, tail A is forbidden (lead to incorrectness). The state transitions between tailcall install/update/remove will work as follows: First install tail call f: C->D->B(f) * poke the tailcall, after that get rid of the skip Update tail call f to f': B(f)->B(f') * poke the tailcall (poke->tailcall_target) and do NOT touch the poke->tailcall_bypass Remove tail call: B(f')->C(f') * poke->tailcall_bypass is poked back to jump, then we wait the RCU grace period so that other programs will finish its execution and after that we are safe to remove the poke->tailcall_target Install new tail call (f''): C(f')->D(f'')->B(f''). * same as first step This way CPU can never be exposed to "unwind, tail" state. Last but not least, when tailcalls get mixed with bpf2bpf calls, it would be possible to encounter the endless loop due to clearing the tailcall counter if for example we would use the tailcall3-like from BPF selftests program that would be subprogram-based, meaning the tailcall would be present within the BPF subprogram. This test, broken down to particular steps, would do: entry -> set tailcall counter to 0, bump it by 1, tailcall to func0 func0 -> call subprog_tail (we are NOT skipping the first 11 bytes of prologue and this subprogram has a tailcall, therefore we clear the counter...) subprog -> do the same thing as entry and then loop forever. To address this, the idea is to go through the call chain of bpf2bpf progs and look for a tailcall presence throughout whole chain. If we saw a single tail call then each node in this call chain needs to be marked as a subprog that can reach the tailcall. We would later feed the JIT with this info and: - set eax to 0 only when tailcall is reachable and this is the entry prog - if tailcall is reachable but there's no tailcall in insns of currently JITed prog then push rax anyway, so that it will be possible to propagate further down the call chain - finally if tailcall is reachable, then we need to precede the 'call' insn with mov rax, [rbp - (stack_depth + 8)] Tail call related cases from test_verifier kselftest are also working fine. Sample BPF programs that utilize tail calls (sockex3, tracex5) work properly as well. 
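In rough pseudo-C, the prologue decision described above looks like this (simplified; is_main_prog is a placeholder and this is not the exact emit_prologue() code):

if (tail_call_reachable && is_main_prog)
	EMIT2(0x31, 0xC0);	/* xor eax, eax: init the tail call counter */
else
	EMIT2(0x66, 0x90);	/* nop2: keep the prologue layout constant */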
[1]: https://lore.kernel.org/bpf/20200517043227.2gpq22ifoq37ogst@ast-mbp.dhcp.thefacebook.com/ Suggested-by: Alexei Starovoitov Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/linux/bpf_verifier.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f3790c9cf542..d7c5a6ed87e3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -698,6 +698,8 @@ enum bpf_jit_poke_reason { /* Descriptor of pokes pointing /into/ the JITed image. */ struct bpf_jit_poke_descriptor { void *tailcall_target; + void *tailcall_bypass; + void *bypass_addr; union { struct { struct bpf_map *map; @@ -738,6 +740,7 @@ struct bpf_prog_aux { bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; bool sleepable; + bool tail_call_reachable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5026b75db972..fbc964526ba3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -359,6 +359,7 @@ struct bpf_subprog_info { u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ bool has_tail_call; + bool tail_call_reachable; }; /* single container for all structs -- cgit v1.2.3 From 09b28d76eac48e922dc293da1aa2b2b85c32aeee Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 17 Sep 2020 19:09:18 -0700 Subject: bpf: Add abnormal return checks. LD_[ABS|IND] instructions may return from the function early. bpf_tail_call pseudo instruction is either fallthrough or return. Allow them in the subprograms only when subprograms are BTF annotated and have scalar return types. Allow ld_abs and tail_call in the main program even if it calls into subprograms. In the past that was not ok to do for ld_abs, since it was JITed with special exit sequence. Since bpf_gen_ld_abs() was introduced the ld_abs looks like normal exit insn from JIT point of view, so it's safe to allow them in the main program. Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index fbc964526ba3..2bb48a2c4d08 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -360,6 +360,7 @@ struct bpf_subprog_info { u16 stack_depth; /* max. stack depth used by this function */ bool has_tail_call; bool tail_call_reachable; + bool has_ld_abs; }; /* single container for all structs -- cgit v1.2.3 From 37050e3ab0b3f02819e3d70ab01d97addb810b28 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Tue, 8 Sep 2020 12:03:02 -0700 Subject: ieee80211: redefine S1G bits with GENMASK The S1G capability fields were defined by ORing BITS() together, and expecting a custom macro to use the _SHIFT definitions. Use the Linux kernel GENMASK for the definitions now, and FIELD_{GET,PREP} to access the fields in the future. Take the chance to rename eg. S1G_CAPAB_B0 to the more compact S1G_CAP0. 
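With the GENMASK-based definitions (see the diff below), fields are read with FIELD_GET() from <linux/bitfield.h>; a small sketch (the capab octet array is a hypothetical S1G capabilities field):

static u8 s1g_supported_width(const u8 *capab)
{
	return FIELD_GET(S1G_CAP0_SUPP_CH_WIDTH, capab[0]);
}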
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200908190323.15814-2-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 156 +++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c47f43e65a2f..53fba39d4ba6 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2330,84 +2330,84 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) } /* S1G Capabilities Information field */ -#define S1G_CAPAB_B0_S1G_LONG BIT(0) -#define S1G_CAPAB_B0_SGI_1MHZ BIT(1) -#define S1G_CAPAB_B0_SGI_2MHZ BIT(2) -#define S1G_CAPAB_B0_SGI_4MHZ BIT(3) -#define S1G_CAPAB_B0_SGI_8MHZ BIT(4) -#define S1G_CAPAB_B0_SGI_16MHZ BIT(5) -#define S1G_CAPAB_B0_SUPP_CH_WIDTH_MASK (BIT(6) | BIT(7)) -#define S1G_CAPAB_B0_SUPP_CH_WIDTH_SHIFT 6 - -#define S1G_CAPAB_B1_RX_LDPC BIT(0) -#define S1G_CAPAB_B1_TX_STBC BIT(1) -#define S1G_CAPAB_B1_RX_STBC BIT(2) -#define S1G_CAPAB_B1_SU_BFER BIT(3) -#define S1G_CAPAB_B1_SU_BFEE BIT(4) -#define S1G_CAPAB_B1_BFEE_STS_MASK (BIT(5) | BIT(6) | BIT(7)) -#define S1G_CAPAB_B1_BFEE_STS_SHIFT 5 - -#define S1G_CAPAB_B2_SOUNDING_DIMENSIONS_MASK (BIT(0) | BIT(1) | BIT(2)) -#define S1G_CAPAB_B2_SOUNDING_DIMENSIONS_SHIFT 0 -#define S1G_CAPAB_B2_MU_BFER BIT(3) -#define S1G_CAPAB_B2_MU_BFEE BIT(4) -#define S1G_CAPAB_B2_PLUS_HTC_VHT BIT(5) -#define S1G_CAPAB_B2_TRAVELING_PILOT_MASK (BIT(6) | BIT(7)) -#define S1G_CAPAB_B2_TRAVELING_PILOT_SHIFT 6 - -#define S1G_CAPAB_B3_RD_RESPONDER BIT(0) -#define S1G_CAPAB_B3_HT_DELAYED_BA BIT(1) -#define S1G_CAPAB_B3_MAX_MPDU_LEN BIT(2) -#define S1G_CAPAB_B3_MAX_AMPDU_LEN_EXP_MASK (BIT(3) | BIT(4)) -#define S1G_CAPAB_B3_MAX_AMPDU_LEN_EXP_SHIFT 3 -#define S1G_CAPAB_B3_MIN_MPDU_START_MASK (BIT(5) | BIT(6) | BIT(7)) -#define S1G_CAPAB_B3_MIN_MPDU_START_SHIFT 5 - -#define S1G_CAPAB_B4_UPLINK_SYNC BIT(0) -#define S1G_CAPAB_B4_DYNAMIC_AID BIT(1) -#define S1G_CAPAB_B4_BAT BIT(2) -#define S1G_CAPAB_B4_TIME_ADE BIT(3) -#define S1G_CAPAB_B4_NON_TIM BIT(4) -#define S1G_CAPAB_B4_GROUP_AID BIT(5) -#define S1G_CAPAB_B4_STA_TYPE_MASK (BIT(6) | BIT(7)) -#define S1G_CAPAB_B4_STA_TYPE_SHIFT 6 - -#define S1G_CAPAB_B5_CENT_AUTH_CONTROL BIT(0) -#define S1G_CAPAB_B5_DIST_AUTH_CONTROL BIT(1) -#define S1G_CAPAB_B5_AMSDU BIT(2) -#define S1G_CAPAB_B5_AMPDU BIT(3) -#define S1G_CAPAB_B5_ASYMMETRIC_BA BIT(4) -#define S1G_CAPAB_B5_FLOW_CONTROL BIT(5) -#define S1G_CAPAB_B5_SECTORIZED_BEAM_MASK (BIT(6) | BIT(7)) -#define S1G_CAPAB_B5_SECTORIZED_BEAM_SHIFT 6 - -#define S1G_CAPAB_B6_OBSS_MITIGATION BIT(0) -#define S1G_CAPAB_B6_FRAGMENT_BA BIT(1) -#define S1G_CAPAB_B6_NDP_PS_POLL BIT(2) -#define S1G_CAPAB_B6_RAW_OPERATION BIT(3) -#define S1G_CAPAB_B6_PAGE_SLICING BIT(4) -#define S1G_CAPAB_B6_TXOP_SHARING_IMP_ACK BIT(5) -#define S1G_CAPAB_B6_VHT_LINK_ADAPT_MASK (BIT(6) | BIT(7)) -#define S1G_CAPAB_B6_VHT_LINK_ADAPT_SHIFT 6 - -#define S1G_CAPAB_B7_TACK_AS_PS_POLL BIT(0) -#define S1G_CAPAB_B7_DUP_1MHZ BIT(1) -#define S1G_CAPAB_B7_MCS_NEGOTIATION BIT(2) -#define S1G_CAPAB_B7_1MHZ_CTL_RESPONSE_PREAMBLE BIT(3) -#define S1G_CAPAB_B7_NDP_BFING_REPORT_POLL BIT(4) -#define S1G_CAPAB_B7_UNSOLICITED_DYN_AID BIT(5) -#define S1G_CAPAB_B7_SECTOR_TRAINING_OPERATION BIT(6) -#define S1G_CAPAB_B7_TEMP_PS_MODE_SWITCH BIT(7) - -#define S1G_CAPAB_B8_TWT_GROUPING BIT(0) -#define S1G_CAPAB_B8_BDT BIT(1) -#define S1G_CAPAB_B8_COLOR_MASK (BIT(2) | BIT(3) | BIT(4)) -#define S1G_CAPAB_B8_COLOR_SHIFT 2 -#define S1G_CAPAB_B8_TWT_REQUEST 
BIT(5) -#define S1G_CAPAB_B8_TWT_RESPOND BIT(6) -#define S1G_CAPAB_B8_PV1_FRAME BIT(7) - -#define S1G_CAPAB_B9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) +#define S1G_CAP0_S1G_LONG BIT(0) +#define S1G_CAP0_SGI_1MHZ BIT(1) +#define S1G_CAP0_SGI_2MHZ BIT(2) +#define S1G_CAP0_SGI_4MHZ BIT(3) +#define S1G_CAP0_SGI_8MHZ BIT(4) +#define S1G_CAP0_SGI_16MHZ BIT(5) +#define S1G_CAP0_SUPP_CH_WIDTH GENMASK(7, 6) + +#define S1G_SUPP_CH_WIDTH_2 0 +#define S1G_SUPP_CH_WIDTH_4 1 +#define S1G_SUPP_CH_WIDTH_8 2 +#define S1G_SUPP_CH_WIDTH_16 3 +#define S1G_SUPP_CH_WIDTH_MAX(cap) ((1 << FIELD_GET(S1G_CAP0_SUPP_CH_WIDTH, \ + cap[0])) << 1) + +#define S1G_CAP1_RX_LDPC BIT(0) +#define S1G_CAP1_TX_STBC BIT(1) +#define S1G_CAP1_RX_STBC BIT(2) +#define S1G_CAP1_SU_BFER BIT(3) +#define S1G_CAP1_SU_BFEE BIT(4) +#define S1G_CAP1_BFEE_STS GENMASK(7, 5) + +#define S1G_CAP2_SOUNDING_DIMENSIONS GENMASK(2, 0) +#define S1G_CAP2_MU_BFER BIT(3) +#define S1G_CAP2_MU_BFEE BIT(4) +#define S1G_CAP2_PLUS_HTC_VHT BIT(5) +#define S1G_CAP2_TRAVELING_PILOT GENMASK(7, 6) + +#define S1G_CAP3_RD_RESPONDER BIT(0) +#define S1G_CAP3_HT_DELAYED_BA BIT(1) +#define S1G_CAP3_MAX_MPDU_LEN BIT(2) +#define S1G_CAP3_MAX_AMPDU_LEN_EXP GENMASK(4, 3) +#define S1G_CAP3_MIN_MPDU_START GENMASK(7, 5) + +#define S1G_CAP4_UPLINK_SYNC BIT(0) +#define S1G_CAP4_DYNAMIC_AID BIT(1) +#define S1G_CAP4_BAT BIT(2) +#define S1G_CAP4_TIME_ADE BIT(3) +#define S1G_CAP4_NON_TIM BIT(4) +#define S1G_CAP4_GROUP_AID BIT(5) +#define S1G_CAP4_STA_TYPE GENMASK(7, 6) + +#define S1G_CAP5_CENT_AUTH_CONTROL BIT(0) +#define S1G_CAP5_DIST_AUTH_CONTROL BIT(1) +#define S1G_CAP5_AMSDU BIT(2) +#define S1G_CAP5_AMPDU BIT(3) +#define S1G_CAP5_ASYMMETRIC_BA BIT(4) +#define S1G_CAP5_FLOW_CONTROL BIT(5) +#define S1G_CAP5_SECTORIZED_BEAM GENMASK(7, 6) + +#define S1G_CAP6_OBSS_MITIGATION BIT(0) +#define S1G_CAP6_FRAGMENT_BA BIT(1) +#define S1G_CAP6_NDP_PS_POLL BIT(2) +#define S1G_CAP6_RAW_OPERATION BIT(3) +#define S1G_CAP6_PAGE_SLICING BIT(4) +#define S1G_CAP6_TXOP_SHARING_IMP_ACK BIT(5) +#define S1G_CAP6_VHT_LINK_ADAPT GENMASK(7, 6) + +#define S1G_CAP7_TACK_AS_PS_POLL BIT(0) +#define S1G_CAP7_DUP_1MHZ BIT(1) +#define S1G_CAP7_MCS_NEGOTIATION BIT(2) +#define S1G_CAP7_1MHZ_CTL_RESPONSE_PREAMBLE BIT(3) +#define S1G_CAP7_NDP_BFING_REPORT_POLL BIT(4) +#define S1G_CAP7_UNSOLICITED_DYN_AID BIT(5) +#define S1G_CAP7_SECTOR_TRAINING_OPERATION BIT(6) +#define S1G_CAP7_TEMP_PS_MODE_SWITCH BIT(7) + +#define S1G_CAP8_TWT_GROUPING BIT(0) +#define S1G_CAP8_BDT BIT(1) +#define S1G_CAP8_COLOR GENMASK(4, 2) +#define S1G_CAP8_TWT_REQUEST BIT(5) +#define S1G_CAP8_TWT_RESPOND BIT(6) +#define S1G_CAP8_PV1_FRAME BIT(7) + +#define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) + +#define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ BIT(0) +#define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 -- cgit v1.2.3 From 6d1349c769ea28543bdde20a658cbc93c3bc936d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 18 Sep 2020 16:45:50 -0400 Subject: [PATCH] reduce boilerplate in fsid handling Get rid of boilerplate in most of ->statfs() instances... 
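For instance, a ->statfs() of a block-device-backed filesystem can fill f_fsid with the helper added below instead of open-coding the two u32 halves (the filesystem name and magic below are hypothetical):

static int foo_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev);

	buf->f_type = FOO_SUPER_MAGIC;	/* hypothetical magic */
	buf->f_bsize = dentry->d_sb->s_blocksize;
	buf->f_fsid = u64_to_fsid(id);
	return 0;
}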
Signed-off-by: Al Viro --- include/linux/statfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/statfs.h b/include/linux/statfs.h index fac4356ea1bf..20f695b90aab 100644 --- a/include/linux/statfs.h +++ b/include/linux/statfs.h @@ -45,4 +45,9 @@ struct kstatfs { struct dentry; extern int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid); +static inline __kernel_fsid_t u64_to_fsid(u64 v) +{ + return (__kernel_fsid_t){.val = {(u32)v, (u32)(v>>32)}}; +} + #endif -- cgit v1.2.3 From 3753d9779038ab011e01b949253492aaa37bf57a Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Thu, 17 Sep 2020 22:08:32 -0700 Subject: net: fix build without CONFIG_SYSCTL definition Earlier commit 316cdaa1158a ("net: add option to not create fall-back tunnels in root-ns as well") removed the CONFIG_SYSCTL to enable the kernel-commandline to work. However, this variable gets defined only when CONFIG_SYSCTL option is selected. With this change the behavior would default to creating fall-back tunnels in all namespaces when CONFIG_SYSCTL is not selected and the kernel commandline option will be ignored. Fixes: 316cdaa1158a ("net: add option to not create fall-back tunnels in root-ns as well") Signed-off-by: Mahesh Bandewar Reported-by: Randy Dunlap Reported-by: kernel test robot Acked-by: Randy Dunlap # build-tested Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 909b1fbb0481..fef0eb96cf69 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -634,8 +634,9 @@ extern int sysctl_devconf_inherit_init_net; */ static inline bool net_has_fallback_tunnels(const struct net *net) { - return (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1) || - !sysctl_fb_tunnels_only_for_init_net; + return !IS_ENABLED(CONFIG_SYSCTL) || + !sysctl_fb_tunnels_only_for_init_net || + (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1); } static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) -- cgit v1.2.3 From 6d23d831e9bd0b1d2bcd9a1ecdc6ac8e6d162c36 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Fri, 18 Sep 2020 17:48:01 +0800 Subject: ptp_qoriq: support FIPER3 The FIPER3 (fixed interval period pulse generator) is supported on DPAA2 and ENETC network controller hardware. This patch is to support it in ptp_qoriq driver. Signed-off-by: Yangbo Lu Acked-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/linux/fsl/ptp_qoriq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h index 884b8f8ca06d..01acebe37fab 100644 --- a/include/linux/fsl/ptp_qoriq.h +++ b/include/linux/fsl/ptp_qoriq.h @@ -136,6 +136,7 @@ struct ptp_qoriq_registers { #define DEFAULT_TMR_PRSC 2 #define DEFAULT_FIPER1_PERIOD 1000000000 #define DEFAULT_FIPER2_PERIOD 1000000000 +#define DEFAULT_FIPER3_PERIOD 1000000000 struct ptp_qoriq { void __iomem *base; @@ -147,6 +148,7 @@ struct ptp_qoriq { struct dentry *debugfs_root; struct device *dev; bool extts_fifo_support; + bool fiper3_support; int irq; int phc_index; u32 tclk_period; /* nanoseconds */ @@ -155,6 +157,7 @@ struct ptp_qoriq { u32 cksel; u32 tmr_fiper1; u32 tmr_fiper2; + u32 tmr_fiper3; u32 (*read)(unsigned __iomem *addr); void (*write)(unsigned __iomem *addr, u32 val); }; -- cgit v1.2.3 From bbed0bbdddaf46260aeb1a8910a3b32941e321a2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Sep 2020 03:10:30 +0300 Subject: net: dsa: tag_8021q: add VLANs to the master interface too The whole purpose of tag_8021q is to send VLAN-tagged traffic to the CPU, from which the driver can decode the source port and switch id. Currently this only works if the VLAN filtering on the master is disabled. Change that by explicitly adding code to tag_8021q.c to add the VLANs corresponding to the tags to the filter of the master interface. Because we now need to call vlan_vid_add, then we also need to hold the RTNL mutex. Propagate that requirement to the callers of dsa_8021q_setup and modify the existing call sites as appropriate. Note that one call path, sja1105_best_effort_vlan_filtering_set -> sja1105_vlan_filtering -> sja1105_setup_8021q_tagging, was already holding this lock. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 2b003ae9fb38..88cd72dfa4e0 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -31,6 +31,8 @@ struct dsa_8021q_context { const struct dsa_8021q_ops *ops; struct dsa_switch *ds; struct list_head crosschip_links; + /* EtherType of RX VID, used for filtering on master interface */ + __be16 proto; }; #define DSA_8021Q_N_SUBVLAN 8 -- cgit v1.2.3 From 49347755a84052fd1317c3897c7cea954c8f89fe Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 16 Sep 2020 00:34:53 +0200 Subject: can: include: fix spelling mistakes This patch fixes spelling erros found by "codespell" in the include/linux/can subtree. 
Link: https://lore.kernel.org/r/20200915223527.1417033-4-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/core.h | 2 +- include/linux/can/dev.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index e20a0cd09ba5..7da9f1f82e8e 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -2,7 +2,7 @@ /* * linux/can/core.h * - * Protoypes and definitions for CAN protocol modules using the PF_CAN core + * Prototypes and definitions for CAN protocol modules using the PF_CAN core * * Authors: Oliver Hartkopp * Urs Thuermann diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 5e3d45525bd3..516892566ac9 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -108,7 +108,7 @@ static inline bool can_skb_headroom_valid(struct net_device *dev, skb->ip_summed = CHECKSUM_UNNECESSARY; - /* preform proper loopback on capable devices */ + /* perform proper loopback on capable devices */ if (dev->flags & IFF_ECHO) skb->pkt_type = PACKET_LOOPBACK; else -- cgit v1.2.3 From 80a71815d8cd1be6481ad16fad3167f095045a06 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 16 Sep 2020 00:35:01 +0200 Subject: can: dev: can_put_echo_skb(): propagate error in case of errors The function can_put_echo_skb() can fail for several reasons. It may fail due to OOM, but when it fails it's usually due to locking problems in the driver. In order to help developing and debugging of new drivers propagate error value in case of errors. Link: https://lore.kernel.org/r/20200915223527.1417033-12-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 516892566ac9..ed0482b2f4b2 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -201,8 +201,8 @@ void can_bus_off(struct net_device *dev); void can_change_state(struct net_device *dev, struct can_frame *cf, enum can_state tx_state, enum can_state rx_state); -void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, - unsigned int idx); +int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, + unsigned int idx); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr); unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx); -- cgit v1.2.3 From 728fc9ff73d3f25220f6b8a52aaf063ec51ef294 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 16 Sep 2020 00:35:22 +0200 Subject: can: rx-offload: can_rx_offload_add_manual(): add new initialization function This patch adds a new initialization function: can_rx_offload_add_manual() It should be used to add support rx-offload to a driver, if the callback mechanism should not be used. Use e.g. can_rx_offload_queue_sorted() to queue skbs into rx-offload. 
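A short usage sketch of the manual mode; the driver structure, the NAPI weight and the can_rx_offload_queue_sorted() signature are assumptions for illustration and are not taken from this patch:

    /* assumed driver private data embedding the rx-offload state */
    struct foo_priv {
            struct can_rx_offload offload;
            struct net_device *ndev;
    };

    static int foo_register_offload(struct foo_priv *priv)
    {
            /* no mailbox_read callback: skbs are queued manually from the IRQ */
            return can_rx_offload_add_manual(priv->ndev, &priv->offload,
                                             16 /* weight, illustrative */);
    }

    static void foo_rx_one(struct foo_priv *priv, struct sk_buff *skb, u32 ts)
    {
            /* sorted insertion keeps frames ordered by hardware timestamp */
            if (can_rx_offload_queue_sorted(&priv->offload, skb, ts))
                    priv->ndev->stats.rx_fifo_errors++;
    }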
Link: https://lore.kernel.org/r/20200915223527.1417033-33-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/rx-offload.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index 1b78a0cfb615..f1b38088b765 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -35,6 +35,9 @@ int can_rx_offload_add_timestamp(struct net_device *dev, int can_rx_offload_add_fifo(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight); +int can_rx_offload_add_manual(struct net_device *dev, + struct can_rx_offload *offload, + unsigned int weight); int can_rx_offload_irq_offload_timestamp(struct can_rx_offload *offload, u64 reg); int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload); -- cgit v1.2.3 From b1d4dc15b2f430a4f541ab6c91e63a71cf230b7d Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Tue, 15 Sep 2020 09:38:18 +0800 Subject: i2c: Switch to using the new API kobj_to_dev() Switch to using the new API kobj_to_dev(). Signed-off-by: Tian Tao Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index fc55ea41d323..56622658b215 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -344,7 +344,7 @@ const struct i2c_device_id *i2c_match_id(const struct i2c_device_id *id, static inline struct i2c_client *kobj_to_i2c_client(struct kobject *kobj) { - struct device * const dev = container_of(kobj, struct device, kobj); + struct device * const dev = kobj_to_dev(kobj); return to_i2c_client(dev); } -- cgit v1.2.3 From 1138ce1cf60954d1c0e2d7b4eba5b4df5813fd86 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 18:31:30 -0700 Subject: sunrpc: fix duplicated word in Change "time time" to "time expiry_time" to match the field name. Signed-off-by: Randy Dunlap Cc: "J. Bruce Fields" Cc: Chuck Lever Cc: Trond Myklebust Cc: Anna Schumaker Cc: linux-nfs@vger.kernel.org Signed-off-by: Anna Schumaker --- include/linux/sunrpc/cache.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 10891b70fc7b..d0965e2997b0 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -45,7 +45,8 @@ */ struct cache_head { struct hlist_node cache_list; - time64_t expiry_time; /* After time time, don't use the data */ + time64_t expiry_time; /* After time expiry_time, don't use + * the data */ time64_t last_refresh; /* If CACHE_PENDING, this is when upcall was * sent, else this is when update was * received, though it is alway set to -- cgit v1.2.3 From 0bdd4cea12a9fd79a7eb7de8493a5fef54d0eea6 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Tue, 7 Jul 2020 21:50:12 +0200 Subject: Replace HTTP links with HTTPS ones: NFS, SUNRPC, and LOCKD clients Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. 
Klimov Signed-off-by: Anna Schumaker --- include/linux/sunrpc/bc_xprt.h | 2 +- include/linux/sunrpc/msg_prot.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index d796058cdff2..f07c334c599f 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -4,7 +4,7 @@ NetApp provides this source code under the GPL v2 License. The GPL v2 license is available at -http://opensource.org/licenses/gpl-license.php. +https://opensource.org/licenses/gpl-license.php. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index bea40d9f03a1..43f854487539 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -143,7 +143,7 @@ typedef __be32 rpc_fraghdr; /* * Well-known netids. See: * - * http://www.iana.org/assignments/rpc-netids/rpc-netids.xhtml + * https://www.iana.org/assignments/rpc-netids/rpc-netids.xhtml */ #define RPCBIND_NETID_UDP "udp" #define RPCBIND_NETID_TCP "tcp" -- cgit v1.2.3 From c754e137f55e075d6b6ad9b866c32e9aad260a83 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 18 Sep 2020 15:29:59 -0400 Subject: pNFS/flexfiles: Be consistent about mirror index types A mirror index is always of type u32. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9408f3252c8e..69cb46f7b8d2 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1611,8 +1611,8 @@ struct nfs_pgio_header { __u64 mds_offset; /* Filelayout dense stripe */ struct nfs_page_array page_array; struct nfs_client *ds_clp; /* pNFS data server */ - int ds_commit_idx; /* ds index if ds_clp is set */ - int pgio_mirror_idx;/* mirror index in pgio layer */ + u32 ds_commit_idx; /* ds index if ds_clp is set */ + u32 pgio_mirror_idx;/* mirror index in pgio layer */ }; struct nfs_mds_commit_info { -- cgit v1.2.3 From 12856e7acde4702b7c3238c15fcba86ff6aa507f Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Thu, 10 Sep 2020 10:59:55 -0400 Subject: PCI/IOV: Mark VFs as not implementing PCI_COMMAND_MEMORY For VFs, the Memory Space Enable bit in the Command Register is hard-wired to 0. Add a new bit to signify devices where the Command Register Memory Space Enable bit does not control the device's response to MMIO accesses. 
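As a sketch of how a caller that gates MMIO access on the Memory Space Enable bit could consult the new flag (the function name is illustrative; the actual consumer is the vfio-pci code referenced below):

    static bool example_mmio_enabled(struct pci_dev *pdev)
    {
            u16 cmd;

            /* VFs hard-wire PCI_COMMAND_MEMORY to 0, so the bit says nothing */
            if (pdev->no_command_memory)
                    return true;

            pci_read_config_word(pdev, PCI_COMMAND, &cmd);
            return cmd & PCI_COMMAND_MEMORY;
    }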
Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory") Signed-off-by: Matthew Rosato Acked-by: Bjorn Helgaas Signed-off-by: Alex Williamson --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 835530605c0d..3ff723124ca7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -445,6 +445,7 @@ struct pci_dev { unsigned int is_probed:1; /* Device probing in progress */ unsigned int link_active_reporting:1;/* Device capable of reporting link active */ unsigned int no_vf_scan:1; /* Don't scan for VFs after IOV enablement */ + unsigned int no_command_memory:1; /* No PCI_COMMAND_MEMORY */ pci_dev_flags_t dev_flags; atomic_t enable_cnt; /* pci_enable_device has been called */ -- cgit v1.2.3 From 2af30f115d6957f372ce3096c7198763ff253d97 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:17 +0100 Subject: btf: Make btf_set_contains take a const pointer bsearch doesn't modify the contents of the array, so we can take a const pointer. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200921121227.255763-2-lmb@cloudflare.com --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d7c5a6ed87e3..0478b20d335b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1905,6 +1905,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); struct btf_id_set; -bool btf_id_set_contains(struct btf_id_set *set, u32 id); +bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From 27774b7073b5d520c80f1fcb8e9993fc139f21bd Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:19 +0100 Subject: btf: Add BTF_ID_LIST_SINGLE macro Add a convenience macro that allows defining a BTF ID list with a single item. This lets us cut down on repetitive macros. Suggested-by: Andrii Nakryiko Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-4-lmb@cloudflare.com --- include/linux/btf_ids.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 210b086188a3..57890b357f85 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -76,6 +76,13 @@ extern u32 name[]; #define BTF_ID_LIST_GLOBAL(name) \ __BTF_ID_LIST(name, globl) +/* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with + * a single entry. + */ +#define BTF_ID_LIST_SINGLE(name, prefix, typename) \ + BTF_ID_LIST(name) \ + BTF_ID(prefix, typename) + /* * The BTF_ID_UNUSED macro defines 4 zero bytes. 
* It's used when we want to define 'unused' entry @@ -140,6 +147,7 @@ extern struct btf_id_set name; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; #define BTF_SET_END(name) -- cgit v1.2.3 From 9436ef6e862b9ca22e5b12f87b106e07d5af4cae Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:20 +0100 Subject: bpf: Allow specifying a BTF ID per argument in function protos Function prototypes using ARG_PTR_TO_BTF_ID currently use two ways to signal which BTF IDs are acceptable. First, bpf_func_proto.btf_id is an array of IDs, one for each argument. This array is only accessed up to the highest numbered argument that uses ARG_PTR_TO_BTF_ID and may therefore be less than five arguments long. It usually points at a BTF_ID_LIST. Second, check_btf_id is a function pointer that is called by the verifier if present. It gets the actual BTF ID of the register, and the argument number we're currently checking. It turns out that the only user check_arg_btf_id ignores the argument, and is simply used to check whether the BTF ID has a struct sock_common at it's start. Replace both of these mechanisms with an explicit BTF ID for each argument in a function proto. Thanks to btf_struct_ids_match this is very flexible: check_arg_btf_id can be replaced by requiring struct sock_common. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-5-lmb@cloudflare.com --- include/linux/bpf.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0478b20d335b..87b0d5dcc1ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -326,12 +326,16 @@ struct bpf_func_proto { }; enum bpf_arg_type arg_type[5]; }; - int *btf_id; /* BTF ids of arguments */ - bool (*check_btf_id)(u32 btf_id, u32 arg); /* if the argument btf_id is - * valid. Often used if more - * than one btf id is permitted - * for this argument. - */ + union { + struct { + u32 *arg1_btf_id; + u32 *arg2_btf_id; + u32 *arg3_btf_id; + u32 *arg4_btf_id; + u32 *arg5_btf_id; + }; + u32 *arg_btf_id[5]; + }; int *ret_btf_id; /* return value btf_id */ bool (*allowed)(const struct bpf_prog *prog); }; @@ -1385,8 +1389,6 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 *next_btf_id); bool btf_struct_ids_match(struct bpf_verifier_log *log, int off, u32 id, u32 need_type_id); -int btf_resolve_helper_id(struct bpf_verifier_log *log, - const struct bpf_func_proto *fn, int); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, -- cgit v1.2.3 From f79e7ea571732a6e16f15c6e2f000c347e2d7431 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:27 +0100 Subject: bpf: Use a table to drive helper arg type checks The mapping between bpf_arg_type and bpf_reg_type is encoded in a big hairy if statement that is hard to follow. The debug output also leaves to be desired: if a reg_type doesn't match we only print one of the options, instead printing all the valid ones. Convert the if statement into a table which is then used to drive type checking. 
If none of the reg_types match we print all options, e.g.: R2 type=rdonly_buf expected=fp, pkt, pkt_meta, map_value Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-12-lmb@cloudflare.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 87b0d5dcc1ff..fc5c901c7542 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -292,6 +292,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ + __BPF_ARG_TYPE_MAX, }; /* type of values returned from helper functions */ -- cgit v1.2.3 From 92ec804f3dbf0d986f8e10850bfff14f316d7aaf Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 21 Sep 2020 15:10:53 -0700 Subject: net: phy: bcm7xxx: Add an entry for BCM72113 BCM72113 features a 28nm integrated EPHY, add an entry to the driver for it. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6ad4c000661a..d0bd226d6bd9 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -30,6 +30,7 @@ #define PHY_ID_BCM57780 0x03625d90 #define PHY_ID_BCM89610 0x03625cd0 +#define PHY_ID_BCM72113 0x35905310 #define PHY_ID_BCM7250 0xae025280 #define PHY_ID_BCM7255 0xae025120 #define PHY_ID_BCM7260 0xae025190 -- cgit v1.2.3 From e1ac11859a057ddcf7d6219bd090c7483541767d Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:15 +0300 Subject: net: bridge: add src field to br_ip Add a new src field to struct br_ip which will be used to lookup S, G entries. When SSM option is added we will enable full br_ip lookups. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 6479a38e52fa..4fb9c4954f3a 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -18,6 +18,12 @@ struct br_ip { __be32 ip4; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr ip6; +#endif + } src; + union { + __be32 ip4; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr ip6; #endif } u; __be16 proto; -- cgit v1.2.3 From eab3227b1240bdcc06c0a01a3fc5bfd2bc12f406 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:17 +0300 Subject: net: bridge: mcast: rename br_ip's u member to dst Since now we have src in br_ip, u no longer makes sense so rename it to dst. No functional changes. v2: fix build with CONFIG_BATMAN_ADV_MCAST CC: Marek Lindner CC: Simon Wunderlich CC: Antonio Quartulli CC: Sven Eckelmann CC: b.a.t.m.a.n@lists.open-mesh.org Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 4fb9c4954f3a..556caed00258 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -25,7 +25,7 @@ struct br_ip { #if IS_ENABLED(CONFIG_IPV6) struct in6_addr ip6; #endif - } u; + } dst; __be16 proto; __u16 vid; }; -- cgit v1.2.3 From 39097ab66dbe89b16438dd6d178d49e75cb922ed Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 23 Sep 2020 00:29:02 +0200 Subject: net: phy: Fixup kernel doc Add missing parameter documentation, or fixup wrong parameter names. Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio.h | 3 ++- include/linux/phy.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 3a88b699b758..dbd69b3d170b 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -306,7 +306,7 @@ static inline u32 linkmode_adv_to_mii_10gbt_adv_t(unsigned long *advertising) /** * mii_10gbt_stat_mod_linkmode_lpa_t * @advertising: target the linkmode advertisement settings - * @adv: value of the C45 10GBASE-T AN STATUS register + * @lpa: value of the C45 10GBASE-T AN STATUS register * * A small helper function that translates C45 10GBASE-T AN STATUS register bits * to linkmode advertisement settings. Other bits in advertising aren't changed. @@ -371,6 +371,7 @@ struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); /** * mdio_module_driver() - Helper macro for registering mdio drivers + * @_mdio_driver: driver to register * * Helper macro for MDIO drivers which do not do anything special in module * init/exit. Each module may only use this macro once, and calling it diff --git a/include/linux/phy.h b/include/linux/phy.h index 3a09d2bf69ea..4901b2637059 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1566,8 +1566,9 @@ static inline int mdiobus_register_board_info(const struct mdio_board_info *i, /** - * module_phy_driver() - Helper macro for registering PHY drivers + * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register + * @__count: Numbers of members in array * * Helper macro for PHY drivers which do not do anything special in module * init/exit. Each module may only use this macro once, and calling it -- cgit v1.2.3 From 4069a572d423b73919ae40a500dfc4b10f8a6f32 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 23 Sep 2020 00:29:03 +0200 Subject: net: phy: Document core PHY structures Add kerneldoc for the core PHY data structures, a few inline functions and exported functions which are not already documented. v2 Typos g/phy/PHY/s Signed-off-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/phy.h | 423 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 292 insertions(+), 131 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 4901b2637059..eb3cb1a98b45 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -82,7 +82,39 @@ extern const int phy_10gbit_features_array[1]; #define PHY_POLL_CABLE_TEST 0x00000004 #define MDIO_DEVICE_IS_PHY 0x80000000 -/* Interface Mode definitions */ +/** + * enum phy_interface_t - Interface Mode definitions + * + * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch + * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined + * @PHY_INTERFACE_MODE_MII: Median-independent interface + * @PHY_INTERFACE_MODE_GMII: Gigabit median-independent interface + * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface + * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface + * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface + * @PHY_INTERFACE_MODE_RMII: Reduced Media Independent Interface + * @PHY_INTERFACE_MODE_RGMII: Reduced gigabit media-independent interface + * @PHY_INTERFACE_MODE_RGMII_ID: RGMII with Internal RX+TX delay + * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay + * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal RX delay + * @PHY_INTERFACE_MODE_RTBI: Reduced TBI + * @PHY_INTERFACE_MODE_SMII: ??? MII + * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface + * @PHY_INTERFACE_MODE_XLGMII:40 gigabit media-independent interface + * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax + * @PHY_INTERFACE_MODE_QSGMII: Quad SGMII + * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII + * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX + * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX + * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI + * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface + * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR + * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII + * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN + * @PHY_INTERFACE_MODE_MAX: Book keeping + * + * Describes the interface between the MAC and PHY. + */ typedef enum { PHY_INTERFACE_MODE_NA, PHY_INTERFACE_MODE_INTERNAL, @@ -116,8 +148,8 @@ typedef enum { } phy_interface_t; /** - * phy_supported_speeds - return all speeds currently supported by a phy device - * @phy: The phy device to return supported speeds of. + * phy_supported_speeds - return all speeds currently supported by a PHY device + * @phy: The PHY device to return supported speeds of. * @speeds: buffer to store supported speeds in. * @size: size of speeds buffer. * @@ -134,9 +166,9 @@ unsigned int phy_supported_speeds(struct phy_device *phy, * phy_modes - map phy_interface_t enum to device tree binding of phy-mode * @interface: enum phy_interface_t value * - * Description: maps 'enum phy_interface_t' defined in this file + * Description: maps enum &phy_interface_t defined in this file * into the device tree binding of 'phy-mode', so that Ethernet - * device driver can get phy interface from device tree. + * device driver can get PHY interface from device tree. */ static inline const char *phy_modes(phy_interface_t interface) { @@ -215,6 +247,14 @@ struct sfp_bus; struct sfp_upstream_ops; struct sk_buff; +/** + * struct mdio_bus_stats - Statistics counters for MDIO busses + * @transfers: Total number of transfers, i.e. 
@writes + @reads + * @errors: Number of MDIO transfers that returned an error + * @writes: Number of write transfers + * @reads: Number of read transfers + * @syncp: Synchronisation for incrementing statistics + */ struct mdio_bus_stats { u64_stats_t transfers; u64_stats_t errors; @@ -224,7 +264,15 @@ struct mdio_bus_stats { struct u64_stats_sync syncp; }; -/* Represents a shared structure between different phydev's in the same +/** + * struct phy_package_shared - Shared information in PHY packages + * @addr: Common PHY address used to combine PHYs in one package + * @refcnt: Number of PHYs connected to this shared data + * @flags: Initialization of PHY package + * @priv_size: Size of the shared private data @priv + * @priv: Driver private data shared across a PHY package + * + * Represents a shared structure between different phydev's in the same * package, for example a quad PHY. See phy_package_join() and * phy_package_leave(). */ @@ -247,7 +295,14 @@ struct phy_package_shared { #define PHY_SHARED_F_INIT_DONE 0 #define PHY_SHARED_F_PROBE_DONE 1 -/* +/** + * struct mii_bus - Represents an MDIO bus + * + * @owner: Who owns this device + * @name: User friendly name for this MDIO device, or driver name + * @id: Unique identifier for this bus, typical from bus hierarchy + * @priv: Driver private data + * * The Bus class for PHYs. Devices which provide access to * PHYs should register using this structure */ @@ -256,49 +311,58 @@ struct mii_bus { const char *name; char id[MII_BUS_ID_SIZE]; void *priv; + /** @read: Perform a read transfer on the bus */ int (*read)(struct mii_bus *bus, int addr, int regnum); + /** @write: Perform a write transfer on the bus */ int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); + /** @reset: Perform a reset of the bus */ int (*reset)(struct mii_bus *bus); + + /** @stats: Statistic counters per device on the bus */ struct mdio_bus_stats stats[PHY_MAX_ADDR]; - /* - * A lock to ensure that only one thing can read/write + /** + * @mdio_lock: A lock to ensure that only one thing can read/write * the MDIO bus at a time */ struct mutex mdio_lock; + /** @parent: Parent device of this bus */ struct device *parent; + /** @state: State of bus structure */ enum { MDIOBUS_ALLOCATED = 1, MDIOBUS_REGISTERED, MDIOBUS_UNREGISTERED, MDIOBUS_RELEASED, } state; + + /** @dev: Kernel device representation */ struct device dev; - /* list of all PHYs on bus */ + /** @mdio_map: list of all MDIO devices on bus */ struct mdio_device *mdio_map[PHY_MAX_ADDR]; - /* PHY addresses to be ignored when probing */ + /** @phy_mask: PHY addresses to be ignored when probing */ u32 phy_mask; - /* PHY addresses to ignore the TA/read failure */ + /** @phy_ignore_ta_mask: PHY addresses to ignore the TA/read failure */ u32 phy_ignore_ta_mask; - /* - * An array of interrupts, each PHY's interrupt at the index + /** + * @irq: An array of interrupts, each PHY's interrupt at the index * matching its address */ int irq[PHY_MAX_ADDR]; - /* GPIO reset pulse width in microseconds */ + /** @reset_delay_us: GPIO reset pulse width in microseconds */ int reset_delay_us; - /* GPIO reset deassert delay in microseconds */ + /** @reset_post_delay_us: GPIO reset deassert delay in microseconds */ int reset_post_delay_us; - /* RESET GPIO descriptor pointer */ + /** @reset_gpiod: Reset GPIO descriptor pointer */ struct gpio_desc *reset_gpiod; - /* bus capabilities, used for probing */ + /** @probe_capabilities: bus capabilities, used for probing */ enum { MDIOBUS_NO_CAP = 0, MDIOBUS_C22, @@ -306,15 +370,22 @@ 
struct mii_bus { MDIOBUS_C22_C45, } probe_capabilities; - /* protect access to the shared element */ + /** @shared_lock: protect access to the shared element */ struct mutex shared_lock; - /* shared state across different PHYs */ + /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) -struct mii_bus *mdiobus_alloc_size(size_t); +struct mii_bus *mdiobus_alloc_size(size_t size); + +/** + * mdiobus_alloc - Allocate an MDIO bus structure + * + * The internal state of the MDIO bus will be set of MDIOBUS_ALLOCATED ready + * for the driver to register the bus. + */ static inline struct mii_bus *mdiobus_alloc(void) { return mdiobus_alloc_size(0); @@ -341,40 +412,41 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr); #define PHY_INTERRUPT_DISABLED false #define PHY_INTERRUPT_ENABLED true -/* PHY state machine states: +/** + * enum phy_state - PHY state machine states: * - * DOWN: PHY device and driver are not ready for anything. probe + * @PHY_DOWN: PHY device and driver are not ready for anything. probe * should be called if and only if the PHY is in this state, * given that the PHY device exists. - * - PHY driver probe function will set the state to READY + * - PHY driver probe function will set the state to @PHY_READY * - * READY: PHY is ready to send and receive packets, but the + * @PHY_READY: PHY is ready to send and receive packets, but the * controller is not. By default, PHYs which do not implement * probe will be set to this state by phy_probe(). * - start will set the state to UP * - * UP: The PHY and attached device are ready to do work. + * @PHY_UP: The PHY and attached device are ready to do work. * Interrupts should be started here. - * - timer moves to NOLINK or RUNNING + * - timer moves to @PHY_NOLINK or @PHY_RUNNING * - * NOLINK: PHY is up, but not currently plugged in. - * - irq or timer will set RUNNING if link comes back - * - phy_stop moves to HALTED + * @PHY_NOLINK: PHY is up, but not currently plugged in. + * - irq or timer will set @PHY_RUNNING if link comes back + * - phy_stop moves to @PHY_HALTED * - * RUNNING: PHY is currently up, running, and possibly sending + * @PHY_RUNNING: PHY is currently up, running, and possibly sending * and/or receiving packets - * - irq or timer will set NOLINK if link goes down - * - phy_stop moves to HALTED + * - irq or timer will set @PHY_NOLINK if link goes down + * - phy_stop moves to @PHY_HALTED * - * CABLETEST: PHY is performing a cable test. Packet reception/sending + * @PHY_CABLETEST: PHY is performing a cable test. Packet reception/sending * is not expected to work, carrier will be indicated as down. PHY will be * poll once per second, or on interrupt for it current state. * Once complete, move to UP to restart the PHY. - * - phy_stop aborts the running test and moves to HALTED + * - phy_stop aborts the running test and moves to @PHY_HALTED * - * HALTED: PHY is up, but no polling or interrupts are done. Or + * @PHY_HALTED: PHY is up, but no polling or interrupts are done. Or * PHY is in an error state. 
- * - phy_start moves to UP + * - phy_start moves to @PHY_UP */ enum phy_state { PHY_DOWN = 0, @@ -403,34 +475,67 @@ struct phy_c45_device_ids { struct macsec_context; struct macsec_ops; -/* phy_device: An instance of a PHY +/** + * struct phy_device - An instance of a PHY * - * drv: Pointer to the driver for this PHY instance - * phy_id: UID for this device found during discovery - * c45_ids: 802.3-c45 Device Identifers if is_c45. - * is_c45: Set to true if this phy uses clause 45 addressing. - * is_internal: Set to true if this phy is internal to a MAC. - * is_pseudo_fixed_link: Set to true if this phy is an Ethernet switch, etc. - * is_gigabit_capable: Set to true if PHY supports 1000Mbps - * has_fixups: Set to true if this phy has fixups/quirks. - * suspended: Set to true if this phy has been suspended successfully. - * suspended_by_mdio_bus: Set to true if this phy was suspended by MDIO bus. - * sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. - * loopback_enabled: Set true if this phy has been loopbacked successfully. - * downshifted_rate: Set true if link speed has been downshifted. - * state: state of the PHY for management purposes - * dev_flags: Device-specific flags used by the PHY driver. - * irq: IRQ number of the PHY's interrupt (-1 if none) - * phy_timer: The timer for handling the state machine - * sfp_bus_attached: flag indicating whether the SFP bus has been attached - * sfp_bus: SFP bus attached to this PHY's fiber port - * attached_dev: The attached enet driver's device instance ptr - * adjust_link: Callback for the enet controller to respond to - * changes in the link state. - * macsec_ops: MACsec offloading ops. + * @mdio: MDIO bus this PHY is on + * @drv: Pointer to the driver for this PHY instance + * @phy_id: UID for this device found during discovery + * @c45_ids: 802.3-c45 Device Identifiers if is_c45. + * @is_c45: Set to true if this PHY uses clause 45 addressing. + * @is_internal: Set to true if this PHY is internal to a MAC. + * @is_pseudo_fixed_link: Set to true if this PHY is an Ethernet switch, etc. + * @is_gigabit_capable: Set to true if PHY supports 1000Mbps + * @has_fixups: Set to true if this PHY has fixups/quirks. + * @suspended: Set to true if this PHY has been suspended successfully. + * @suspended_by_mdio_bus: Set to true if this PHY was suspended by MDIO bus. + * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. + * @loopback_enabled: Set true if this PHY has been loopbacked successfully. + * @downshifted_rate: Set true if link speed has been downshifted. + * @state: State of the PHY for management purposes + * @dev_flags: Device-specific flags used by the PHY driver. + * @irq: IRQ number of the PHY's interrupt (-1 if none) + * @phy_timer: The timer for handling the state machine + * @phylink: Pointer to phylink instance for this PHY + * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached + * @sfp_bus: SFP bus attached to this PHY's fiber port + * @attached_dev: The attached enet driver's device instance ptr + * @adjust_link: Callback for the enet controller to respond to changes: in the + * link state. + * @phy_link_change: Callback for phylink for notification of link change + * @macsec_ops: MACsec offloading ops. 
* - * speed, duplex, pause, supported, advertising, lp_advertising, - * and autoneg are used like in mii_if_info + * @speed: Current link speed + * @duplex: Current duplex + * @pause: Current pause + * @asym_pause: Current asymmetric pause + * @supported: Combined MAC/PHY supported linkmodes + * @advertising: Currently advertised linkmodes + * @adv_old: Saved advertised while power saving for WoL + * @lp_advertising: Current link partner advertised linkmodes + * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited + * @autoneg: Flag autoneg being used + * @link: Current link state + * @autoneg_complete: Flag auto negotiation of the link has completed + * @mdix: Current crossover + * @mdix_ctrl: User setting of crossover + * @interrupts: Flag interrupts have been enabled + * @interface: enum phy_interface_t value + * @skb: Netlink message for cable diagnostics + * @nest: Netlink nest used for cable diagnostics + * @ehdr: nNtlink header for cable diagnostics + * @phy_led_triggers: Array of LED triggers + * @phy_num_led_triggers: Number of triggers in @phy_led_triggers + * @led_link_trigger: LED trigger for link up/down + * @last_triggered: last LED trigger for link speed + * @master_slave_set: User requested master/slave configuration + * @master_slave_get: Current master/slave advertisement + * @master_slave_state: Current master/slave configuration + * @mii_ts: Pointer to time stamper callbacks + * @lock: Mutex for serialization access to PHY + * @state_queue: Work queue for state machine + * @shared: Pointer to private data shared by phys in one package + * @priv: Pointer to driver private data * * interrupts currently only supports enabled or disabled, * but could be changed in the future to support enabling @@ -550,9 +655,18 @@ struct phy_device { #define to_phy_device(d) container_of(to_mdio_device(d), \ struct phy_device, mdio) -/* A structure containing possible configuration parameters +/** + * struct phy_tdr_config - Configuration of a TDR raw test + * + * @first: Distance for first data collection point + * @last: Distance for last data collection point + * @step: Step between data collection points + * @pair: Bitmap of cable pairs to collect data for + * + * A structure containing possible configuration parameters * for a TDR cable test. The driver does not need to implement * all the parameters, but should report what is actually used. + * All distances are in centimeters. */ struct phy_tdr_config { u32 first; @@ -562,18 +676,20 @@ struct phy_tdr_config { }; #define PHY_PAIR_ALL -1 -/* struct phy_driver: Driver structure for a particular PHY type +/** + * struct phy_driver - Driver structure for a particular PHY type * - * driver_data: static driver data - * phy_id: The result of reading the UID registers of this PHY + * @mdiodrv: Data common to all MDIO devices + * @phy_id: The result of reading the UID registers of this PHY * type, and ANDing them with the phy_id_mask. 
This driver * only works for PHYs with IDs which match this field - * name: The friendly name of this PHY type - * phy_id_mask: Defines the important bits of the phy_id - * features: A mandatory list of features (speed, duplex, etc) + * @name: The friendly name of this PHY type + * @phy_id_mask: Defines the important bits of the phy_id + * @features: A mandatory list of features (speed, duplex, etc) * supported by this PHY - * flags: A bitfield defining certain other features this PHY + * @flags: A bitfield defining certain other features this PHY * supports (like interrupts) + * @driver_data: Static driver data * * All functions are optional. If config_aneg or read_status * are not implemented, the phy core uses the genphy versions. @@ -592,151 +708,178 @@ struct phy_driver { u32 flags; const void *driver_data; - /* - * Called to issue a PHY software reset + /** + * @soft_reset: Called to issue a PHY software reset */ int (*soft_reset)(struct phy_device *phydev); - /* - * Called to initialize the PHY, + /** + * @config_init: Called to initialize the PHY, * including after a reset */ int (*config_init)(struct phy_device *phydev); - /* - * Called during discovery. Used to set + /** + * @probe: Called during discovery. Used to set * up device-specific structures, if any */ int (*probe)(struct phy_device *phydev); - /* - * Probe the hardware to determine what abilities it has. - * Should only set phydev->supported. + /** + * @get_features: Probe the hardware to determine what + * abilities it has. Should only set phydev->supported. */ int (*get_features)(struct phy_device *phydev); /* PHY Power Management */ + /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); + /** @resume: Resume the hardware, restoring state if needed */ int (*resume)(struct phy_device *phydev); - /* - * Configures the advertisement and resets + /** + * @config_aneg: Configures the advertisement and resets * autonegotiation if phydev->autoneg is on, * forces the speed to the current settings in phydev * if phydev->autoneg is off */ int (*config_aneg)(struct phy_device *phydev); - /* Determines the auto negotiation result */ + /** @aneg_done: Determines the auto negotiation result */ int (*aneg_done)(struct phy_device *phydev); - /* Determines the negotiated speed and duplex */ + /** @read_status: Determines the negotiated speed and duplex */ int (*read_status)(struct phy_device *phydev); - /* Clears any pending interrupts */ + /** @ack_interrupt: Clears any pending interrupts */ int (*ack_interrupt)(struct phy_device *phydev); - /* Enables or disables interrupts */ + /** @config_intr: Enables or disables interrupts */ int (*config_intr)(struct phy_device *phydev); - /* - * Checks if the PHY generated an interrupt. + /** + * @did_interrupt: Checks if the PHY generated an interrupt. * For multi-PHY devices with shared PHY interrupt pin * Set interrupt bits have to be cleared. */ int (*did_interrupt)(struct phy_device *phydev); - /* Override default interrupt handling */ + /** @handle_interrupt: Override default interrupt handling */ irqreturn_t (*handle_interrupt)(struct phy_device *phydev); - /* Clears up any memory if needed */ + /** @remove: Clears up any memory if needed */ void (*remove)(struct phy_device *phydev); - /* Returns true if this is a suitable driver for the given - * phydev. If NULL, matching is based on phy_id and - * phy_id_mask. + /** + * @match_phy_device: Returns true if this is a suitable + * driver for the given phydev. 
If NULL, matching is based on + * phy_id and phy_id_mask. */ int (*match_phy_device)(struct phy_device *phydev); - /* Some devices (e.g. qnap TS-119P II) require PHY register changes to - * enable Wake on LAN, so set_wol is provided to be called in the - * ethernet driver's set_wol function. */ + /** + * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY + * register changes to enable Wake on LAN, so set_wol is + * provided to be called in the ethernet driver's set_wol + * function. + */ int (*set_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); - /* See set_wol, but for checking whether Wake on LAN is enabled. */ + /** + * @get_wol: See set_wol, but for checking whether Wake on LAN + * is enabled. + */ void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); - /* - * Called to inform a PHY device driver when the core is about to - * change the link state. This callback is supposed to be used as - * fixup hook for drivers that need to take action when the link - * state changes. Drivers are by no means allowed to mess with the + /** + * @link_change_notify: Called to inform a PHY device driver + * when the core is about to change the link state. This + * callback is supposed to be used as fixup hook for drivers + * that need to take action when the link state + * changes. Drivers are by no means allowed to mess with the * PHY device structure in their implementations. */ void (*link_change_notify)(struct phy_device *dev); - /* - * Phy specific driver override for reading a MMD register. - * This function is optional for PHY specific drivers. When - * not provided, the default MMD read function will be used - * by phy_read_mmd(), which will use either a direct read for - * Clause 45 PHYs or an indirect read for Clause 22 PHYs. - * devnum is the MMD device number within the PHY device, - * regnum is the register within the selected MMD device. + /** + * @read_mmd: PHY specific driver override for reading a MMD + * register. This function is optional for PHY specific + * drivers. When not provided, the default MMD read function + * will be used by phy_read_mmd(), which will use either a + * direct read for Clause 45 PHYs or an indirect read for + * Clause 22 PHYs. devnum is the MMD device number within the + * PHY device, regnum is the register within the selected MMD + * device. */ int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum); - /* - * Phy specific driver override for writing a MMD register. - * This function is optional for PHY specific drivers. When - * not provided, the default MMD write function will be used - * by phy_write_mmd(), which will use either a direct write for - * Clause 45 PHYs, or an indirect write for Clause 22 PHYs. - * devnum is the MMD device number within the PHY device, - * regnum is the register within the selected MMD device. - * val is the value to be written. + /** + * @write_mmd: PHY specific driver override for writing a MMD + * register. This function is optional for PHY specific + * drivers. When not provided, the default MMD write function + * will be used by phy_write_mmd(), which will use either a + * direct write for Clause 45 PHYs, or an indirect write for + * Clause 22 PHYs. devnum is the MMD device number within the + * PHY device, regnum is the register within the selected MMD + * device. val is the value to be written. 
*/ int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum, u16 val); + /** @read_page: Return the current PHY register page number */ int (*read_page)(struct phy_device *dev); + /** @write_page: Set the current PHY register page number */ int (*write_page)(struct phy_device *dev, int page); - /* Get the size and type of the eeprom contained within a plug-in - * module */ + /** + * @module_info: Get the size and type of the eeprom contained + * within a plug-in module + */ int (*module_info)(struct phy_device *dev, struct ethtool_modinfo *modinfo); - /* Get the eeprom information from the plug-in module */ + /** + * @module_eeprom: Get the eeprom information from the plug-in + * module + */ int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data); - /* Start a cable test */ + /** @cable_test_start: Start a cable test */ int (*cable_test_start)(struct phy_device *dev); - /* Start a raw TDR cable test */ + /** @cable_test_tdr_start: Start a raw TDR cable test */ int (*cable_test_tdr_start)(struct phy_device *dev, const struct phy_tdr_config *config); - /* Once per second, or on interrupt, request the status of the - * test. + /** + * @cable_test_get_status: Once per second, or on interrupt, + * request the status of the test. */ int (*cable_test_get_status)(struct phy_device *dev, bool *finished); - /* Get statistics from the phy using ethtool */ + /* Get statistics from the PHY using ethtool */ + /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); + /** @get_strings: Names of the statistic counters */ void (*get_strings)(struct phy_device *dev, u8 *data); + /** @get_stats: Return the statistic counter values */ void (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); /* Get and Set PHY tunables */ + /** @get_tunable: Return the value of a tunable */ int (*get_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, void *data); + /** @set_tunable: Set the value of a tunable */ int (*set_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, const void *data); + /** @set_loopback: Set the loopback mood of the PHY */ int (*set_loopback)(struct phy_device *dev, bool enable); + /** @get_sqi: Get the signal quality indication */ int (*get_sqi)(struct phy_device *dev); + /** @get_sqi_max: Get the maximum signal quality indication */ int (*get_sqi_max)(struct phy_device *dev); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ @@ -890,6 +1033,24 @@ static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); +/** + * phy_read_mmd_poll_timeout - Periodically poll a PHY register until a + * condition is met or a timeout occurs + * + * @phydev: The phy_device struct + * @devaddr: The MMD to read from + * @regnum: The register on the MMD to read + * @val: Variable to read the register into + * @cond: Break condition (usually involving @val) + * @sleep_us: Maximum time to sleep between reads in us (0 + * tight-loops). Should be less than ~20ms since usleep_range + * is used (see Documentation/timers/timers-howto.rst). + * @timeout_us: Timeout in us, 0 means never timeout + * @sleep_before_read: if it is true, sleep @sleep_us before read. + * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @args is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. 
+ */ #define phy_read_mmd_poll_timeout(phydev, devaddr, regnum, val, cond, \ sleep_us, timeout_us, sleep_before_read) \ ({ \ @@ -1161,7 +1322,7 @@ static inline bool phy_is_internal(struct phy_device *phydev) /** * phy_interface_mode_is_rgmii - Convenience function for testing if a * PHY interface mode is RGMII (all variants) - * @mode: the phy_interface_t enum + * @mode: the &phy_interface_t enum */ static inline bool phy_interface_mode_is_rgmii(phy_interface_t mode) { @@ -1170,11 +1331,11 @@ static inline bool phy_interface_mode_is_rgmii(phy_interface_t mode) }; /** - * phy_interface_mode_is_8023z() - does the phy interface mode use 802.3z + * phy_interface_mode_is_8023z() - does the PHY interface mode use 802.3z * negotiation * @mode: one of &enum phy_interface_t * - * Returns true if the phy interface mode uses the 16-bit negotiation + * Returns true if the PHY interface mode uses the 16-bit negotiation * word as defined in 802.3z. (See 802.3-2015 37.2.1 Config_Reg encoding) */ static inline bool phy_interface_mode_is_8023z(phy_interface_t mode) @@ -1193,7 +1354,7 @@ static inline bool phy_interface_is_rgmii(struct phy_device *phydev) return phy_interface_mode_is_rgmii(phydev->interface); }; -/* +/** * phy_is_pseudo_fixed_link - Convenience function for testing if this * PHY is the CPU port facing side of an Ethernet switch, or similar. * @phydev: the phy_device struct -- cgit v1.2.3 From 76bd5c016ef49683d626a48748ef1764aaf8ba63 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 14 Sep 2020 17:05:08 -0400 Subject: NFSv4: make cache consistency bitmask dynamic Client uses static bitmask for GETATTR on CLOSE/WRITE/DELEGRETURN and ignores the fact that it might have some attributes marked invalid in its cache. Compared to v3 where all attributes are retrieved in postop attributes, v4's cache is frequently out of sync and leads to standalone GETATTRs being sent to the server. Instead, in addition to the minimum cache consistency attributes also check cache_validity and adjust the GETATTR request accordingly. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 69cb46f7b8d2..0599efd57eb9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -525,7 +525,7 @@ struct nfs_closeargs { struct nfs_seqid * seqid; fmode_t fmode; u32 share_access; - const u32 * bitmask; + u32 * bitmask; struct nfs4_layoutreturn_args *lr_args; }; @@ -608,7 +608,7 @@ struct nfs4_delegreturnargs { struct nfs4_sequence_args seq_args; const struct nfs_fh *fhandle; const nfs4_stateid *stateid; - const u32 * bitmask; + u32 * bitmask; struct nfs4_layoutreturn_args *lr_args; }; @@ -648,7 +648,7 @@ struct nfs_pgio_args { union { unsigned int replen; /* used by read */ struct { - const u32 * bitmask; /* used by write */ + u32 * bitmask; /* used by write */ enum nfs3_stable_how stable; /* used by write */ }; }; -- cgit v1.2.3 From b5b6775d72e8662b5ea7f892bf06db4831e2c1aa Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 23 Sep 2020 18:41:22 +0300 Subject: of: add of_mdio_find_device() api Add a helper function which finds the mdio_device structure given a device tree node. This is helpful for finding the PCS device based on a DTS node but managing it as a mdio_device instead of a phy_device. Signed-off-by: Russell King Signed-off-by: Ioana Ciornei Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/of_mdio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 1efb88d9f892..cfe8c607a628 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -17,6 +17,7 @@ bool of_mdiobus_child_is_phy(struct device_node *child); int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np); int devm_of_mdiobus_register(struct device *dev, struct mii_bus *mdio, struct device_node *np); +struct mdio_device *of_mdio_find_device(struct device_node *np); struct phy_device *of_phy_find_device(struct device_node *phy_np); struct phy_device * of_phy_connect(struct net_device *dev, struct device_node *phy_np, @@ -74,6 +75,11 @@ static inline int of_mdiobus_register(struct mii_bus *mdio, struct device_node * return mdiobus_register(mdio); } +static inline struct mdio_device *of_mdio_find_device(struct device_node *np) +{ + return NULL; +} + static inline struct phy_device *of_phy_find_device(struct device_node *phy_np) { return NULL; -- cgit v1.2.3 From 5f6e560c2dd5be3ae446f20ae97263cbfa309630 Mon Sep 17 00:00:00 2001 From: Dennis YC Hsieh Date: Tue, 7 Jul 2020 23:45:07 +0800 Subject: soc: mediatek: cmdq: add write_s function add write_s function in cmdq helper functions which writes value contains in internal register to address with large dma access support. Signed-off-by: Dennis YC Hsieh Link: https://lore.kernel.org/r/1594136714-11650-3-git-send-email-dennis-yc.hsieh@mediatek.com Signed-off-by: Matthias Brugger --- include/linux/mailbox/mtk-cmdq-mailbox.h | 1 + include/linux/soc/mediatek/mtk-cmdq.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h index 05eea1aef5aa..1f76cfedb16d 100644 --- a/include/linux/mailbox/mtk-cmdq-mailbox.h +++ b/include/linux/mailbox/mtk-cmdq-mailbox.h @@ -60,6 +60,7 @@ enum cmdq_code { CMDQ_CODE_JUMP = 0x10, CMDQ_CODE_WFE = 0x20, CMDQ_CODE_EOC = 0x40, + CMDQ_CODE_WRITE_S = 0x90, CMDQ_CODE_LOGIC = 0xa0, }; diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index 2249ecaf77e4..9b0c57a0063d 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -12,6 +12,8 @@ #include #define CMDQ_NO_TIMEOUT 0xffffffffu +#define CMDQ_ADDR_HIGH(addr) ((u32)(((addr) >> 16) & GENMASK(31, 0))) +#define CMDQ_ADDR_LOW(addr) ((u16)(addr) | BIT(1)) struct cmdq_pkt; @@ -102,6 +104,23 @@ int cmdq_pkt_write(struct cmdq_pkt *pkt, u8 subsys, u16 offset, u32 value); int cmdq_pkt_write_mask(struct cmdq_pkt *pkt, u8 subsys, u16 offset, u32 value, u32 mask); +/** + * cmdq_pkt_write_s() - append write_s command to the CMDQ packet + * @pkt: the CMDQ packet + * @high_addr_reg_idx: internal register ID which contains high address of pa + * @addr_low: low address of pa + * @src_reg_idx: the CMDQ internal register ID which cache source value + * + * Return: 0 for success; else the error code is returned + * + * Support write value to physical address without subsys. Use CMDQ_ADDR_HIGH() + * to get high address and call cmdq_pkt_assign() to assign value into internal + * reg. Also use CMDQ_ADDR_LOW() to get low address for addr_low parameter when + * call to this function. 
+ */ +int cmdq_pkt_write_s(struct cmdq_pkt *pkt, u16 high_addr_reg_idx, + u16 addr_low, u16 src_reg_idx); + /** * cmdq_pkt_wfe() - append wait for event command to the CMDQ packet * @pkt: the CMDQ packet -- cgit v1.2.3 From 11c7842d41c82eb3551a0606ccba89ac33318b62 Mon Sep 17 00:00:00 2001 From: Dennis YC Hsieh Date: Tue, 7 Jul 2020 23:45:08 +0800 Subject: soc: mediatek: cmdq: add write_s_mask function add write_s_mask function in cmdq helper functions which writes value contains in internal register to address with mask and large dma access support. Signed-off-by: Dennis YC Hsieh Link: https://lore.kernel.org/r/1594136714-11650-4-git-send-email-dennis-yc.hsieh@mediatek.com Signed-off-by: Matthias Brugger --- include/linux/mailbox/mtk-cmdq-mailbox.h | 1 + include/linux/soc/mediatek/mtk-cmdq.h | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h index 1f76cfedb16d..90d1d8e64412 100644 --- a/include/linux/mailbox/mtk-cmdq-mailbox.h +++ b/include/linux/mailbox/mtk-cmdq-mailbox.h @@ -61,6 +61,7 @@ enum cmdq_code { CMDQ_CODE_WFE = 0x20, CMDQ_CODE_EOC = 0x40, CMDQ_CODE_WRITE_S = 0x90, + CMDQ_CODE_WRITE_S_MASK = 0x91, CMDQ_CODE_LOGIC = 0xa0, }; diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index 9b0c57a0063d..53230341bf94 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -121,6 +121,24 @@ int cmdq_pkt_write_mask(struct cmdq_pkt *pkt, u8 subsys, int cmdq_pkt_write_s(struct cmdq_pkt *pkt, u16 high_addr_reg_idx, u16 addr_low, u16 src_reg_idx); +/** + * cmdq_pkt_write_s_mask() - append write_s with mask command to the CMDQ packet + * @pkt: the CMDQ packet + * @high_addr_reg_idx: internal register ID which contains high address of pa + * @addr_low: low address of pa + * @src_reg_idx: the CMDQ internal register ID which cache source value + * @mask: the specified target address mask, use U32_MAX if no need + * + * Return: 0 for success; else the error code is returned + * + * Support write value to physical address without subsys. Use CMDQ_ADDR_HIGH() + * to get high address and call cmdq_pkt_assign() to assign value into internal + * reg. Also use CMDQ_ADDR_LOW() to get low address for addr_low parameter when + * call to this function. + */ +int cmdq_pkt_write_s_mask(struct cmdq_pkt *pkt, u16 high_addr_reg_idx, + u16 addr_low, u16 src_reg_idx, u32 mask); + /** * cmdq_pkt_wfe() - append wait for event command to the CMDQ packet * @pkt: the CMDQ packet -- cgit v1.2.3 From d3b04aab06fbc33ddea15725f3ff1667c9717929 Mon Sep 17 00:00:00 2001 From: Dennis YC Hsieh Date: Tue, 7 Jul 2020 23:45:09 +0800 Subject: soc: mediatek: cmdq: add read_s function Add read_s function in cmdq helper functions which support read value from register or dma physical address into gce internal register. 
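Taken together with the write_s kernel-doc above, this implies a three-step usage pattern for the *_s helpers. A minimal sketch of that pattern follows (illustrative only, not part of the patches; the packet, the target DMA address and the two GCE internal register indices are assumptions):

	/* assumed locals: pkt, pa (dma_addr_t), addr_reg and val_reg (u16) */
	int err;

	/* 1) cache the high bits of the address in an internal register */
	err = cmdq_pkt_assign(pkt, addr_reg, CMDQ_ADDR_HIGH(pa));
	/* 2) write the value cached in val_reg to pa */
	if (!err)
		err = cmdq_pkt_write_s(pkt, addr_reg, CMDQ_ADDR_LOW(pa), val_reg);
	/* 3) read the word at pa back into val_reg */
	if (!err)
		err = cmdq_pkt_read_s(pkt, addr_reg, CMDQ_ADDR_LOW(pa), val_reg);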
Signed-off-by: Dennis YC Hsieh Link: https://lore.kernel.org/r/1594136714-11650-5-git-send-email-dennis-yc.hsieh@mediatek.com Signed-off-by: Matthias Brugger --- include/linux/mailbox/mtk-cmdq-mailbox.h | 1 + include/linux/soc/mediatek/mtk-cmdq.h | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h index 90d1d8e64412..efbd8a9eb2d1 100644 --- a/include/linux/mailbox/mtk-cmdq-mailbox.h +++ b/include/linux/mailbox/mtk-cmdq-mailbox.h @@ -60,6 +60,7 @@ enum cmdq_code { CMDQ_CODE_JUMP = 0x10, CMDQ_CODE_WFE = 0x20, CMDQ_CODE_EOC = 0x40, + CMDQ_CODE_READ_S = 0x80, CMDQ_CODE_WRITE_S = 0x90, CMDQ_CODE_WRITE_S_MASK = 0x91, CMDQ_CODE_LOGIC = 0xa0, diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index 53230341bf94..cd7ec714344e 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -104,6 +104,18 @@ int cmdq_pkt_write(struct cmdq_pkt *pkt, u8 subsys, u16 offset, u32 value); int cmdq_pkt_write_mask(struct cmdq_pkt *pkt, u8 subsys, u16 offset, u32 value, u32 mask); +/* + * cmdq_pkt_read_s() - append read_s command to the CMDQ packet + * @pkt: the CMDQ packet + * @high_addr_reg_idx: internal register ID which contains high address of pa + * @addr_low: low address of pa + * @reg_idx: the CMDQ internal register ID to cache read data + * + * Return: 0 for success; else the error code is returned + */ +int cmdq_pkt_read_s(struct cmdq_pkt *pkt, u16 high_addr_reg_idx, u16 addr_low, + u16 reg_idx); + /** * cmdq_pkt_write_s() - append write_s command to the CMDQ packet * @pkt: the CMDQ packet -- cgit v1.2.3 From 1af43fce813ebd74c76d080beb261603bd0853e1 Mon Sep 17 00:00:00 2001 From: Dennis YC Hsieh Date: Tue, 7 Jul 2020 23:45:10 +0800 Subject: soc: mediatek: cmdq: add write_s value function add write_s function in cmdq helper functions which writes a constant value to address with large dma access support. 
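The constant-value variants added here and in the following patch reuse the same high/low address split; a short sketch (again with an assumed packet, DMA address and internal register index):

	/* write the immediate 0x1234 to pa; no source register is involved */
	err = cmdq_pkt_assign(pkt, addr_reg, CMDQ_ADDR_HIGH(pa));
	if (!err)
		err = cmdq_pkt_write_s_value(pkt, addr_reg, CMDQ_ADDR_LOW(pa),
					     0x1234);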
Signed-off-by: Dennis YC Hsieh Link: https://lore.kernel.org/r/1594136714-11650-6-git-send-email-dennis-yc.hsieh@mediatek.com Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-cmdq.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index cd7ec714344e..ae73e10da274 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -151,6 +151,19 @@ int cmdq_pkt_write_s(struct cmdq_pkt *pkt, u16 high_addr_reg_idx, int cmdq_pkt_write_s_mask(struct cmdq_pkt *pkt, u16 high_addr_reg_idx, u16 addr_low, u16 src_reg_idx, u32 mask); +/** + * cmdq_pkt_write_s_value() - append write_s command to the CMDQ packet which + * write value to a physical address + * @pkt: the CMDQ packet + * @high_addr_reg_idx: internal register ID which contains high address of pa + * @addr_low: low address of pa + * @value: the specified target value + * + * Return: 0 for success; else the error code is returned + */ +int cmdq_pkt_write_s_value(struct cmdq_pkt *pkt, u8 high_addr_reg_idx, + u16 addr_low, u32 value); + /** * cmdq_pkt_wfe() - append wait for event command to the CMDQ packet * @pkt: the CMDQ packet -- cgit v1.2.3 From 88a2ffc48d5bc85119ef7961df12369dcd53b4d2 Mon Sep 17 00:00:00 2001 From: Dennis YC Hsieh Date: Tue, 7 Jul 2020 23:45:11 +0800 Subject: soc: mediatek: cmdq: add write_s_mask value function add write_s_mask_value function in cmdq helper functions which writes a constant value to address with mask and large dma access support. Signed-off-by: Dennis YC Hsieh Link: https://lore.kernel.org/r/1594136714-11650-7-git-send-email-dennis-yc.hsieh@mediatek.com Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-cmdq.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index ae73e10da274..d9390d76ee14 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -164,6 +164,21 @@ int cmdq_pkt_write_s_mask(struct cmdq_pkt *pkt, u16 high_addr_reg_idx, int cmdq_pkt_write_s_value(struct cmdq_pkt *pkt, u8 high_addr_reg_idx, u16 addr_low, u32 value); +/** + * cmdq_pkt_write_s_mask_value() - append write_s command with mask to the CMDQ + * packet which write value to a physical + * address + * @pkt: the CMDQ packet + * @high_addr_reg_idx: internal register ID which contains high address of pa + * @addr_low: low address of pa + * @value: the specified target value + * @mask: the specified target mask + * + * Return: 0 for success; else the error code is returned + */ +int cmdq_pkt_write_s_mask_value(struct cmdq_pkt *pkt, u8 high_addr_reg_idx, + u16 addr_low, u32 value, u32 mask); + /** * cmdq_pkt_wfe() - append wait for event command to the CMDQ packet * @pkt: the CMDQ packet -- cgit v1.2.3 From 946f1792d3d7942acfbc6afa9a733f608f4622d6 Mon Sep 17 00:00:00 2001 From: Dennis YC Hsieh Date: Tue, 7 Jul 2020 23:45:12 +0800 Subject: soc: mediatek: cmdq: add jump function Add jump function so that client can jump to any address which contains instruction. 
Signed-off-by: Dennis YC Hsieh Link: https://lore.kernel.org/r/1594136714-11650-8-git-send-email-dennis-yc.hsieh@mediatek.com Signed-off-by: Matthias Brugger --- include/linux/soc/mediatek/mtk-cmdq.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index d9390d76ee14..34354e952f60 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -252,6 +252,17 @@ int cmdq_pkt_poll_mask(struct cmdq_pkt *pkt, u8 subsys, */ int cmdq_pkt_assign(struct cmdq_pkt *pkt, u16 reg_idx, u32 value); +/** + * cmdq_pkt_jump() - Append jump command to the CMDQ packet, ask GCE + * to execute an instruction that change current thread PC to + * a physical address which should contains more instruction. + * @pkt: the CMDQ packet + * @addr: physical address of target instruction buffer + * + * Return: 0 for success; else the error code is returned + */ +int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr); + /** * cmdq_pkt_finalize() - Append EOC and jump command to pkt. * @pkt: the CMDQ packet -- cgit v1.2.3 From 23c22299cd290409c6b78f57c42b64f8dfb6dd92 Mon Sep 17 00:00:00 2001 From: Dennis YC Hsieh Date: Tue, 7 Jul 2020 23:45:13 +0800 Subject: soc: mediatek: cmdq: add clear option in cmdq_pkt_wfe api Add clear parameter to let client decide if event should be clear to 0 after GCE receive it. Signed-off-by: Dennis YC Hsieh Acked-by: Chun-Kuang Hu Link: https://lore.kernel.org/r/1594136714-11650-9-git-send-email-dennis-yc.hsieh@mediatek.com [mb: fix commit message] Signed-off-by: Matthias Brugger --- include/linux/mailbox/mtk-cmdq-mailbox.h | 3 +-- include/linux/soc/mediatek/mtk-cmdq.h | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h index efbd8a9eb2d1..d5a983d65f05 100644 --- a/include/linux/mailbox/mtk-cmdq-mailbox.h +++ b/include/linux/mailbox/mtk-cmdq-mailbox.h @@ -28,8 +28,7 @@ * bit 16-27: update value * bit 31: 1 - update, 0 - no update */ -#define CMDQ_WFE_OPTION (CMDQ_WFE_UPDATE | CMDQ_WFE_WAIT | \ - CMDQ_WFE_WAIT_VALUE) +#define CMDQ_WFE_OPTION (CMDQ_WFE_WAIT | CMDQ_WFE_WAIT_VALUE) /** cmdq event maximum */ #define CMDQ_MAX_EVENT 0x3ff diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index 34354e952f60..960704d75994 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -182,11 +182,12 @@ int cmdq_pkt_write_s_mask_value(struct cmdq_pkt *pkt, u8 high_addr_reg_idx, /** * cmdq_pkt_wfe() - append wait for event command to the CMDQ packet * @pkt: the CMDQ packet - * @event: the desired event type to "wait and CLEAR" + * @event: the desired event type to wait + * @clear: clear event or not after event arrive * * Return: 0 for success; else the error code is returned */ -int cmdq_pkt_wfe(struct cmdq_pkt *pkt, u16 event); +int cmdq_pkt_wfe(struct cmdq_pkt *pkt, u16 event, bool clear); /** * cmdq_pkt_clear_event() - append clear event command to the CMDQ packet -- cgit v1.2.3 From 1df8f55a37bd286a3d40192980050bc3d7d78887 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:03:50 -0700 Subject: bpf: Enable bpf_skc_to_* sock casting helper to networking prog type There is a constant need to add more fields into the bpf_tcp_sock for the bpf programs running at tc, sock_ops...etc. 
A current workaround could be to use bpf_probe_read_kernel(). However, other than making another helper call for reading each field and missing CO-RE, it is also not as intuitive to use as directly reading "tp->lsndtime" for example. While already having perfmon cap to do bpf_probe_read_kernel(), it will be much easier if the bpf prog can directly read from the tcp_sock. This patch tries to do that by using the existing casting-helpers bpf_skc_to_*() whose func_proto returns a btf_id. For example, the func_proto of bpf_skc_to_tcp_sock returns the btf_id of the kernel "struct tcp_sock". These helpers are also added to is_ptr_cast_function(). It ensures the returning reg (BPF_REF_0) will also carries the ref_obj_id. That will keep the ref-tracking works properly. The bpf_skc_to_* helpers are made available to most of the bpf prog types in filter.c. The bpf_skc_to_* helpers will be limited by perfmon cap. This patch adds a ARG_PTR_TO_BTF_ID_SOCK_COMMON. The helper accepting this arg can accept a btf-id-ptr (PTR_TO_BTF_ID + &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]) or a legacy-ctx-convert-skc-ptr (PTR_TO_SOCK_COMMON). The bpf_skc_to_*() helpers are changed to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will accept pointer obtained from skb->sk. Instead of specifying both arg_type and arg_btf_id in the same func_proto which is how the current ARG_PTR_TO_BTF_ID does, the arg_btf_id of the new ARG_PTR_TO_BTF_ID_SOCK_COMMON is specified in the compatible_reg_types[] in verifier.c. The reason is the arg_btf_id is always the same. Discussion in this thread: https://lore.kernel.org/bpf/20200922070422.1917351-1-kafai@fb.com/ The ARG_PTR_TO_BTF_ID_ part gives a clear expectation that the helper is expecting a PTR_TO_BTF_ID which could be NULL. This is the same behavior as the existing helper taking ARG_PTR_TO_BTF_ID. The _SOCK_COMMON part means the helper is also expecting the legacy SOCK_COMMON pointer. By excluding the _OR_NULL part, the bpf prog cannot call helper with a literal NULL which doesn't make sense in most cases. e.g. bpf_skc_to_tcp_sock(NULL) will be rejected. All PTR_TO_*_OR_NULL reg has to do a NULL check first before passing into the helper or else the bpf prog will be rejected. This behavior is nothing new and consistent with the current expectation during bpf-prog-load. [ ARG_PTR_TO_BTF_ID_SOCK_COMMON will be used to replace ARG_PTR_TO_SOCK* of other existing helpers later such that those existing helpers can take the PTR_TO_BTF_ID returned by the bpf_skc_to_*() helpers. The only special case is bpf_sk_lookup_assign() which can accept a literal NULL ptr. It has to be handled specially in another follow up patch if there is a need (e.g. by renaming ARG_PTR_TO_SOCKET_OR_NULL to ARG_PTR_TO_BTF_ID_SOCK_COMMON_OR_NULL). ] [ When converting the older helpers that take ARG_PTR_TO_SOCK* in the later patch, if the kernel does not support BTF, ARG_PTR_TO_BTF_ID_SOCK_COMMON will behave like ARG_PTR_TO_SOCK_COMMON because no reg->type could have PTR_TO_BTF_ID in this case. It is not a concern for the newer-btf-only helper like the bpf_skc_to_*() here though because these helpers must require BTF vmlinux to begin with. 
] Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200925000350.3855720-1-kafai@fb.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fc5c901c7542..d0937f1d2980 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -292,6 +292,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ + ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ __BPF_ARG_TYPE_MAX, }; -- cgit v1.2.3 From 403217f30418c808600e3e8e345413ba5cc15676 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 17 Aug 2020 12:53:06 -0400 Subject: SUNRPC/NFSD: Implement xdr_reserve_space_vec() Reserving space for a large READ payload requires special handling when reserving space in the xdr buffer pages. One problem we can have is use of the scratch buffer, which is used to get a pointer to a contiguous region of data up to PAGE_SIZE. When using the scratch buffer, calls to xdr_commit_encode() shift the data to it's proper alignment in the xdr buffer. If we've reserved several pages in a vector, then this could potentially invalidate earlier pointers and result in incorrect READ data being sent to the client. I get around this by looking at the amount of space left in the current page, and never reserve more than that for each entry in the read vector. This lets us place data directly where it needs to go in the buffer pages. Signed-off-by: Anna Schumaker Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/xdr.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 5a6a81b7cd9f..6613d96a3029 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -234,6 +234,8 @@ typedef int (*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr, extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst); extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); +extern int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, + size_t nbytes); extern void xdr_commit_encode(struct xdr_stream *xdr); extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len); extern int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen); -- cgit v1.2.3 From afbe7973173a7ce0a68af8b33e44c967582297be Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 24 Sep 2020 12:30:20 -0400 Subject: tracepoints: Add helper to test if tracepoint is enabled in a header As tracepoints are discouraged from being added in a header because it can cause side effects if other tracepoints are in headers, as well as bloat the kernel as the trace_() function is not a small inline, the common workaround is to add a function call that calls a wrapper function in a C file that then calls the tracepoint. But as function calls add overhead, this function should only be called when the tracepoint in question is enabled. To get around this overhead, a static_branch can be used to only have the tracepoint wrapper get called when the tracepoint is enabled. 
Add a tracepoint_enabled(tp) macro that gets passed the name of the tracepoint, and this becomes a static_branch that is enabled when the tracepoint is enabled and is a nop when the tracepoint is disabled. Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint-defs.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index b29950a19205..60625973faaf 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -48,4 +48,38 @@ struct bpf_raw_event_map { u32 writable_size; } __aligned(32); +/* + * If a tracepoint needs to be called from a header file, it is not + * recommended to call it directly, as tracepoints in header files + * may cause side-effects and bloat the kernel. Instead, use + * tracepoint_enabled() to test if the tracepoint is enabled, then if + * it is, call a wrapper function defined in a C file that will then + * call the tracepoint. + * + * For "trace_foo_bar()", you would need to create a wrapper function + * in a C file to call trace_foo_bar(): + * void do_trace_foo_bar(args) { trace_foo_bar(args); } + * Then in the header file, declare the tracepoint: + * DECLARE_TRACEPOINT(foo_bar); + * And call your wrapper: + * static inline void some_inlined_function() { + * [..] + * if (tracepoint_enabled(foo_bar)) + * do_trace_foo_bar(args); + * [..] + * } + * + * Note: tracepoint_enabled(foo_bar) is equivalent to trace_foo_bar_enabled() + * but is safe to have in headers, where trace_foo_bar_enabled() is not. + */ +#define DECLARE_TRACEPOINT(tp) \ + extern struct tracepoint __tracepoint_##tp + +#ifdef CONFIG_TRACEPOINTS +# define tracepoint_enabled(tp) \ + static_key_false(&(__tracepoint_##tp).key) +#else +# define tracepoint_enabled(tracepoint) false +#endif + #endif -- cgit v1.2.3 From c65fc2276f0f022f5ad4a84658add2b28cff7227 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 24 Sep 2020 12:43:46 -0400 Subject: mm/page_ref: Convert the open coded tracepoint enabled to the new helper As more use cases of checking if a tracepoint is enabled in a header are coming to fruition, a helper macro, tracepoint_enabled(), has been added to check if a tracepoint is enabled or not, and can be used with minimal header requirements (avoid "include hell"). Convert the page_ref logic over to the new helper macro. Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: "Kirill A. 
Shutemov" Cc: Sergey Senozhatsky Cc: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- include/linux/page_ref.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index d27701199a4d..f3318f34fc54 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -7,13 +7,13 @@ #include #include -extern struct tracepoint __tracepoint_page_ref_set; -extern struct tracepoint __tracepoint_page_ref_mod; -extern struct tracepoint __tracepoint_page_ref_mod_and_test; -extern struct tracepoint __tracepoint_page_ref_mod_and_return; -extern struct tracepoint __tracepoint_page_ref_mod_unless; -extern struct tracepoint __tracepoint_page_ref_freeze; -extern struct tracepoint __tracepoint_page_ref_unfreeze; +DECLARE_TRACEPOINT(page_ref_set); +DECLARE_TRACEPOINT(page_ref_mod); +DECLARE_TRACEPOINT(page_ref_mod_and_test); +DECLARE_TRACEPOINT(page_ref_mod_and_return); +DECLARE_TRACEPOINT(page_ref_mod_unless); +DECLARE_TRACEPOINT(page_ref_freeze); +DECLARE_TRACEPOINT(page_ref_unfreeze); #ifdef CONFIG_DEBUG_PAGE_REF @@ -24,7 +24,7 @@ extern struct tracepoint __tracepoint_page_ref_unfreeze; * * See trace_##name##_enabled(void) in include/linux/tracepoint.h */ -#define page_ref_tracepoint_active(t) static_key_false(&(t).key) +#define page_ref_tracepoint_active(t) tracepoint_enabled(t) extern void __page_ref_set(struct page *page, int v); extern void __page_ref_mod(struct page *page, int v); @@ -75,7 +75,7 @@ static inline int page_count(struct page *page) static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_refcount, v); - if (page_ref_tracepoint_active(__tracepoint_page_ref_set)) + if (page_ref_tracepoint_active(page_ref_set)) __page_ref_set(page, v); } @@ -91,14 +91,14 @@ static inline void init_page_count(struct page *page) static inline void page_ref_add(struct page *page, int nr) { atomic_add(nr, &page->_refcount); - if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, nr); } static inline void page_ref_sub(struct page *page, int nr) { atomic_sub(nr, &page->_refcount); - if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, -nr); } @@ -106,7 +106,7 @@ static inline int page_ref_sub_return(struct page *page, int nr) { int ret = atomic_sub_return(nr, &page->_refcount); - if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) + if (page_ref_tracepoint_active(page_ref_mod_and_return)) __page_ref_mod_and_return(page, -nr, ret); return ret; } @@ -114,14 +114,14 @@ static inline int page_ref_sub_return(struct page *page, int nr) static inline void page_ref_inc(struct page *page) { atomic_inc(&page->_refcount); - if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, 1); } static inline void page_ref_dec(struct page *page) { atomic_dec(&page->_refcount); - if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, -1); } @@ -129,7 +129,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) { int ret = atomic_sub_and_test(nr, &page->_refcount); - if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) + if (page_ref_tracepoint_active(page_ref_mod_and_test)) __page_ref_mod_and_test(page, -nr, ret); return ret; 
} @@ -138,7 +138,7 @@ static inline int page_ref_inc_return(struct page *page) { int ret = atomic_inc_return(&page->_refcount); - if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) + if (page_ref_tracepoint_active(page_ref_mod_and_return)) __page_ref_mod_and_return(page, 1, ret); return ret; } @@ -147,7 +147,7 @@ static inline int page_ref_dec_and_test(struct page *page) { int ret = atomic_dec_and_test(&page->_refcount); - if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) + if (page_ref_tracepoint_active(page_ref_mod_and_test)) __page_ref_mod_and_test(page, -1, ret); return ret; } @@ -156,7 +156,7 @@ static inline int page_ref_dec_return(struct page *page) { int ret = atomic_dec_return(&page->_refcount); - if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) + if (page_ref_tracepoint_active(page_ref_mod_and_return)) __page_ref_mod_and_return(page, -1, ret); return ret; } @@ -165,7 +165,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u) { int ret = atomic_add_unless(&page->_refcount, nr, u); - if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless)) + if (page_ref_tracepoint_active(page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); return ret; } @@ -174,7 +174,7 @@ static inline int page_ref_freeze(struct page *page, int count) { int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); - if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze)) + if (page_ref_tracepoint_active(page_ref_freeze)) __page_ref_freeze(page, count, ret); return ret; } @@ -185,7 +185,7 @@ static inline void page_ref_unfreeze(struct page *page, int count) VM_BUG_ON(count == 0); atomic_set_release(&page->_refcount, count); - if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze)) + if (page_ref_tracepoint_active(page_ref_unfreeze)) __page_ref_unfreeze(page, count); } -- cgit v1.2.3 From e0f9956a3862b32ad73869a8e52a33c84aafa46f Mon Sep 17 00:00:00 2001 From: "Chuah, Kim Tatt" Date: Fri, 25 Sep 2020 17:40:41 +0800 Subject: net: stmmac: Add option for VLAN filter fail queue enable Add option in plat_stmmacenet_data struct to enable VLAN Filter Fail Queuing. This option allows packets that fail VLAN filter to be routed to a specific Rx queue when Receive All is also set. When this option is enabled: - Enable VFFQ only when entering promiscuous mode, because Receive All will pass up all rx packets that failed address filtering (similar to promiscuous mode). - VLAN-promiscuous mode is never entered to allow rx packet to fail VLAN filters and get routed to selected VFFQ Rx queue. Reviewed-by: Voon Weifeng Reviewed-by: Ong Boon Leong Signed-off-by: Chuah, Kim Tatt Signed-off-by: Ong Boon Leong Signed-off-by: David S. Miller --- include/linux/stmmac.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index bd964c31d333..00e83c877496 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -198,5 +198,7 @@ struct plat_stmmacenet_data { int mac_port_sel_speed; bool en_tx_lpi_clockgating; int has_xgmac; + bool vlan_fail_q_en; + u8 vlan_fail_q; }; #endif -- cgit v1.2.3 From ba5f4cfeac77fca981b199ec7f2396a3616e5216 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 24 Sep 2020 12:58:40 -0700 Subject: bpf: Add comment to document BTF type PTR_TO_BTF_ID_OR_NULL The meaning of PTR_TO_BTF_ID_OR_NULL differs slightly from other types denoted with the *_OR_NULL type. 
For example the types PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL can be used for branch analysis because the type PTR_TO_SOCKET is guaranteed to _not_ have a null value. In contrast PTR_TO_BTF_ID and BTF_TO_BTF_ID_OR_NULL have slightly different meanings. A PTR_TO_BTF_TO_ID may be a pointer to NULL value, but it is safe to read this pointer in the program context because the program context will handle any faults. The fallout is for PTR_TO_BTF_ID the verifier can assume reads are safe, but can not use the type in branch analysis. Additionally, authors need to be extra careful when passing PTR_TO_BTF_ID into helpers. In general helpers consuming type PTR_TO_BTF_ID will need to assume it may be null. Seeing the above is not obvious to readers without the back knowledge lets add a comment in the type definition. Editorial comment, as networking and tracing programs get closer and more tightly merged we may need to consider a new type that we can ensure is non-null for branch analysis and also passing into helpers. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer --- include/linux/bpf.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0937f1d2980..79902325bef8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -383,8 +383,22 @@ enum bpf_reg_type { PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ - PTR_TO_BTF_ID, /* reg points to kernel struct */ - PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ + /* PTR_TO_BTF_ID points to a kernel struct that does not need + * to be null checked by the BPF program. This does not imply the + * pointer is _not_ null and in practice this can easily be a null + * pointer when reading pointer chains. The assumption is program + * context will handle null pointer dereference typically via fault + * handling. The verifier must keep this in mind and can make no + * assumptions about null or non-null when doing branch analysis. + * Further, when passed into helpers the helpers can not, without + * additional context, assume the value is non-null. + */ + PTR_TO_BTF_ID, + /* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not + * been checked for null. Used primarily to inform the verifier + * an explicit null check is required for this struct. + */ + PTR_TO_BTF_ID_OR_NULL, PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ -- cgit v1.2.3 From da9aa5d96bfe49e903ce2bc01cfb8a776c2619e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 26 Sep 2020 09:03:57 +0200 Subject: fs: remove vfs_statx_fd vfs_statx_fd is only used to implement vfs_fstat. Remove vfs_statx_fd and just implement vfs_fstat directly. 
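Since this log is limited to include/linux, the fs/stat.c side of the change is not shown; a rough sketch of what a direct implementation looks like (the body in the actual patch may differ in detail):

	int vfs_fstat(int fd, struct kstat *stat)
	{
		struct fd f;
		int error;

		f = fdget_raw(fd);
		if (!f.file)
			return -EBADF;
		error = vfs_getattr(&f.file->f_path, stat,
				    STATX_BASIC_STATS, 0);
		fdput(f);
		return error;
	}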
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e019ea2f1347..f100d9f711a3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3163,7 +3163,7 @@ extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); extern int vfs_statx(int, const char __user *, int, struct kstat *, u32); -extern int vfs_statx_fd(unsigned int, struct kstat *, u32, unsigned int); +int vfs_fstat(int fd, struct kstat *stat); static inline int vfs_stat(const char __user *filename, struct kstat *stat) { @@ -3181,11 +3181,6 @@ static inline int vfs_fstatat(int dfd, const char __user *filename, return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, stat, STATX_BASIC_STATS); } -static inline int vfs_fstat(int fd, struct kstat *stat) -{ - return vfs_statx_fd(fd, stat, STATX_BASIC_STATS, 0); -} - extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); -- cgit v1.2.3 From 0b2c6693b4220595e9cff95d829d5d5bc5d544dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 26 Sep 2020 09:03:58 +0200 Subject: fs: implement vfs_stat and vfs_lstat in terms of vfs_fstatat Go through vfs_fstatat instead of duplicating the *stat to statx mapping three times. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index f100d9f711a3..b43c9ad7c3c2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3165,21 +3165,19 @@ extern int iterate_dir(struct file *, struct dir_context *); extern int vfs_statx(int, const char __user *, int, struct kstat *, u32); int vfs_fstat(int fd, struct kstat *stat); -static inline int vfs_stat(const char __user *filename, struct kstat *stat) +static inline int vfs_fstatat(int dfd, const char __user *filename, + struct kstat *stat, int flags) { - return vfs_statx(AT_FDCWD, filename, AT_NO_AUTOMOUNT, + return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, stat, STATX_BASIC_STATS); } -static inline int vfs_lstat(const char __user *name, struct kstat *stat) +static inline int vfs_stat(const char __user *filename, struct kstat *stat) { - return vfs_statx(AT_FDCWD, name, AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT, - stat, STATX_BASIC_STATS); + return vfs_fstatat(AT_FDCWD, filename, stat, 0); } -static inline int vfs_fstatat(int dfd, const char __user *filename, - struct kstat *stat, int flags) +static inline int vfs_lstat(const char __user *name, struct kstat *stat) { - return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, - stat, STATX_BASIC_STATS); + return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); } extern const char *vfs_get_link(struct dentry *, struct delayed_call *); -- cgit v1.2.3 From 09f1bde4017e9c34749da2918b3926799c77bce8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 26 Sep 2020 09:03:59 +0200 Subject: fs: move vfs_fstatat out of line This allows to keep vfs_statx static in fs/stat.c to prepare for the following changes. 
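The out-of-line version presumably mirrors the inline being removed from fs.h below, e.g.:

	/* fs/stat.c -- sketch, mirroring the former inline */
	int vfs_fstatat(int dfd, const char __user *filename,
			struct kstat *stat, int flags)
	{
		return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
				 stat, STATX_BASIC_STATS);
	}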
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b43c9ad7c3c2..1ac68236bc09 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3162,15 +3162,10 @@ extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); -extern int vfs_statx(int, const char __user *, int, struct kstat *, u32); +int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, + int flags); int vfs_fstat(int fd, struct kstat *stat); -static inline int vfs_fstatat(int dfd, const char __user *filename, - struct kstat *stat, int flags) -{ - return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, - stat, STATX_BASIC_STATS); -} static inline int vfs_stat(const char __user *filename, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, filename, stat, 0); -- cgit v1.2.3 From f2d077ff1b5c17008cff5dc27e7356a694e55462 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 26 Sep 2020 09:04:01 +0200 Subject: fs: remove KSTAT_QUERY_FLAGS KSTAT_QUERY_FLAGS expands to AT_STATX_SYNC_TYPE, which itself already is a mask. Remove the double name, especially given that the prefix is a little confusing vs the normal AT_* flags. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/stat.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stat.h b/include/linux/stat.h index 56614af83d4a..fff27e603814 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -19,8 +19,6 @@ #include #include -#define KSTAT_QUERY_FLAGS (AT_STATX_SYNC_TYPE) - struct kstat { u32 result_mask; /* What fields the user got */ umode_t mode; -- cgit v1.2.3 From e622129569963fa3bab976685443756e5da70da9 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Sun, 27 Sep 2020 16:01:50 +0800 Subject: ptp: add stub function for ptp_get_msgtype() Added the missing stub function for ptp_get_msgtype(). Fixes: 036c508ba95e ("ptp: Add generic ptp message type function") Reported-by: Randy Dunlap Signed-off-by: Yangbo Lu Acked-by: Randy Dunlap # build-tested Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index 8437307cca8c..c6487b7ab026 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -134,5 +134,13 @@ static inline struct ptp_header *ptp_parse_header(struct sk_buff *skb, { return NULL; } +static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, + unsigned int type) +{ + /* The return is meaningless. The stub function would not be + * executed since no available header from ptp_parse_header. + */ + return 0; +} #endif #endif /* _PTP_CLASSIFY_H_ */ -- cgit v1.2.3 From f2d10ff4a903813df767a4b56b651a26b938df06 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Sun, 27 Sep 2020 22:15:29 +0100 Subject: kgdb: Honour the kprobe blocklist when setting breakpoints Currently kgdb has absolutely no safety rails in place to discourage or prevent a user from placing a breakpoint in dangerous places such as the debugger's own trap entry/exit and other places where it is not safe to take synchronous traps. 
Introduce a new config symbol KGDB_HONOUR_BLOCKLIST and modify the default implementation of kgdb_validate_break_address() so that we use the kprobe blocklist to prohibit instrumentation of critical functions if the config symbol is set. The config symbol dependencies are set to ensure that the blocklist will be enabled by default if we enable KGDB and are compiling for an architecture where we HAVE_KPROBES. Suggested-by: Peter Zijlstra Reviewed-by: Douglas Anderson Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/r/20200927211531.1380577-2-daniel.thompson@linaro.org Signed-off-by: Daniel Thompson --- include/linux/kgdb.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 477b8b7c908f..0d6cf64c8bb1 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_HAVE_ARCH_KGDB #include #endif @@ -335,6 +336,23 @@ extern int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, atomic_t *snd_rdy); extern void gdbstub_exit(int status); +/* + * kgdb and kprobes both use the same (kprobe) blocklist (which makes sense + * given they are both typically hooked up to the same trap meaning on most + * architectures one cannot be used to debug the other) + * + * However on architectures where kprobes is not (yet) implemented we permit + * breakpoints everywhere rather than blocking everything by default. + */ +static inline bool kgdb_within_blocklist(unsigned long addr) +{ +#ifdef CONFIG_KGDB_HONOUR_BLOCKLIST + return within_kprobe_blacklist(addr); +#else + return false; +#endif +} + extern int kgdb_single_step; extern atomic_t kgdb_active; #define in_dbg_master() \ -- cgit v1.2.3 From d2b7588a47de8322891de38ec14d15105d66cb1e Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:04 -0700 Subject: nl80211: support S1G capability overrides in assoc NL80211_ATTR_S1G_CAPABILITY can be passed along with NL80211_ATTR_S1G_CAPABILITY_MASK to NL80211_CMD_ASSOCIATE to indicate S1G capabilities which should override the hardware capabilities in eg. the association request. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-4-thomas@adapt-ip.com [johannes: always require both attributes together, commit message] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 53fba39d4ba6..f71cffa18176 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2330,6 +2330,8 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) } /* S1G Capabilities Information field */ +#define IEEE80211_S1G_CAPABILITY_LEN 15 + #define S1G_CAP0_S1G_LONG BIT(0) #define S1G_CAP0_SGI_1MHZ BIT(1) #define S1G_CAP0_SGI_2MHZ BIT(2) -- cgit v1.2.3 From 9eaffe5078ca0808603cdd15c4eaf0106a996f3a Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:06 -0700 Subject: cfg80211: convert S1G beacon to scan results The S1G beacon is an extension frame as opposed to management frame for the regular beacon. This means we may have to occasionally cast the frame buffer to a different header type. Luckily this isn't too bad as scan results mostly only care about the IEs. 
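A sketch of the kind of cast this implies, using the helpers added in the hunk below to find where the IEs start (the helper name and the skb handling are assumptions, not taken from the patch):

	/* assumes <linux/ieee80211.h> and an skb whose data is an S1G beacon */
	static const u8 *s1g_beacon_ies(struct sk_buff *skb)
	{
		const struct ieee80211_ext *ext = (const void *)skb->data;

		/* short beacons carry the next TBTT before the elements */
		if (ieee80211_is_s1g_short_beacon(ext->frame_control))
			return ext->u.s1g_short_beacon.variable;
		return ext->u.s1g_beacon.variable;
	}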
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-6-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f71cffa18176..1ce0b37441b9 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -151,6 +151,9 @@ #define IEEE80211_ANO_NETTYPE_WILD 15 +/* bits unique to S1G beacon */ +#define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 + /* control extension - for IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTL_EXT */ #define IEEE80211_CTL_EXT_POLL 0x2000 #define IEEE80211_CTL_EXT_SPR 0x3000 @@ -553,6 +556,28 @@ static inline bool ieee80211_is_s1g_beacon(__le16 fc) cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON); } +/** + * ieee80211_next_tbtt_present - check if IEEE80211_FTYPE_EXT && + * IEEE80211_STYPE_S1G_BEACON && IEEE80211_S1G_BCN_NEXT_TBTT + * @fc: frame control bytes in little-endian byteorder + */ +static inline bool ieee80211_next_tbtt_present(__le16 fc) +{ + return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) == + cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON) && + fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT); +} + +/** + * ieee80211_is_s1g_short_beacon - check if next tbtt present bit is set. Only + * true for S1G beacons when they're short. + * @fc: frame control bytes in little-endian byteorder + */ +static inline bool ieee80211_is_s1g_short_beacon(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && ieee80211_next_tbtt_present(fc); +} + /** * ieee80211_is_atim - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ATIM * @fc: frame control bytes in little-endian byteorder @@ -1034,6 +1059,13 @@ struct ieee80211_ext { u8 change_seq; u8 variable[0]; } __packed s1g_beacon; + struct { + u8 sa[ETH_ALEN]; + __le32 timestamp; + u8 change_seq; + u8 next_tbtt[3]; + u8 variable[0]; + } __packed s1g_short_beacon; } u; } __packed __aligned(2); -- cgit v1.2.3 From 80ca25711380c8eabe51eed875ca9432b4f8939e Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:09 -0700 Subject: cfg80211: handle Association Response from S1G STA The sending STA type is implicit based on beacon or probe response content. If sending STA was an S1G STA, adjust the Information Element location accordingly. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-9-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 1ce0b37441b9..d0fda8424118 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1100,6 +1100,11 @@ struct ieee80211_mgmt { /* followed by Supported rates */ u8 variable[0]; } __packed assoc_resp, reassoc_resp; + struct { + __le16 capab_info; + __le16 status_code; + u8 variable[0]; + } __packed s1g_assoc_resp, s1g_reassoc_resp; struct { __le16 capab_info; __le16 listen_interval; -- cgit v1.2.3 From 05d109576a36fd498e5db2d905eb50c7dd844b83 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:10 -0700 Subject: mac80211: encode listen interval for S1G S1G allows listen interval up to 2^14 * 10000 beacon intervals. In order to do this listen interval needs a scaling factor applied to the lower 14 bits. Calculate this and properly encode the listen interval for S1G STAs. 
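Using the USF/UI masks added below, the scaling could look roughly like this (a sketch assuming the scale factors 1/10/1000/10000 from the table referenced just below, rounding down, and a made-up helper name):

	static u16 s1g_encode_listen_int(unsigned int li)
	{
		static const unsigned int scale[] = { 1, 10, 1000, 10000 };
		unsigned int usf;

		/* pick the smallest scale that lets the UI fit in 14 bits */
		for (usf = 0; usf < ARRAY_SIZE(scale) - 1; usf++)
			if (li / scale[usf] <= IEEE80211_MAX_UI)
				break;

		return FIELD_PREP(LISTEN_INT_USF, usf) |
		       FIELD_PREP(LISTEN_INT_UI,
				  min_t(unsigned int, li / scale[usf],
					IEEE80211_MAX_UI));
	}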
See IEEE802.11ah-2016 Table 9-44a for reference. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-10-thomas@adapt-ip.com [move listen_int_usf into function using it] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d0fda8424118..7b6af47dd279 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2448,6 +2448,13 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ BIT(0) #define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) + +#define LISTEN_INT_USF GENMASK(15, 14) +#define LISTEN_INT_UI GENMASK(13, 0) + +#define IEEE80211_MAX_USF FIELD_MAX(LISTEN_INT_USF) +#define IEEE80211_MAX_UI FIELD_MAX(LISTEN_INT_UI) + /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 -- cgit v1.2.3 From 1d00ce807efaa0ee3a96de7801be042a06d35873 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:15 -0700 Subject: mac80211: support S1G association The changes required for associating in S1G are: - apply S1G BSS channel info before assoc - mark all S1G STAs as QoS STAs - include and parse AID request element - handle new Association Response format - don't fail assoc if supported rates element is missing Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-15-thomas@adapt-ip.com [pass skb to ieee80211_add_aid_request_ie(), remove unused variable 'bss'] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 7b6af47dd279..f2f56b287aed 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -987,6 +987,25 @@ enum ieee80211_vht_opmode_bits { IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF = 0x80, }; +/** + * enum ieee80211_s1g_chanwidth + * These are defined in IEEE802.11-2016ah Table 10-20 + * as BSS Channel Width + * + * @IEEE80211_S1G_CHANWIDTH_1MHZ: 1MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_2MHZ: 2MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_4MHZ: 4MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_8MHZ: 8MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_16MHZ: 16MHz operating channel + */ +enum ieee80211_s1g_chanwidth { + IEEE80211_S1G_CHANWIDTH_1MHZ = 0, + IEEE80211_S1G_CHANWIDTH_2MHZ = 1, + IEEE80211_S1G_CHANWIDTH_4MHZ = 3, + IEEE80211_S1G_CHANWIDTH_8MHZ = 7, + IEEE80211_S1G_CHANWIDTH_16MHZ = 15, +}; + #define WLAN_SA_QUERY_TR_ID_LEN 2 #define WLAN_MEMBERSHIP_LEN 8 #define WLAN_USER_POSITION_LEN 16 @@ -2854,6 +2873,8 @@ enum ieee80211_eid { WLAN_EID_REDUCED_NEIGHBOR_REPORT = 201, + WLAN_EID_AID_REQUEST = 210, + WLAN_EID_AID_RESPONSE = 211, WLAN_EID_S1G_BCN_COMPAT = 213, WLAN_EID_S1G_SHORT_BCN_INTERVAL = 214, WLAN_EID_S1G_CAPABILITIES = 217, -- cgit v1.2.3 From f5bec330e3010450daeb5cb6a94a4a7c54afa306 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Mon, 28 Sep 2020 00:28:11 -0700 Subject: nl80211: extend support to config spatial reuse parameter set Allow the user to configure below Spatial Reuse Parameter Set element. 
* Non-SRG OBSS PD Max Offset * SRG BSS Color Bitmap * SRG Partial BSSID Bitmap Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1601278091-20313-2-git-send-email-rmanohar@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f2f56b287aed..770408b2fdaf 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2350,8 +2350,11 @@ ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) } /* HE Spatial Reuse defines */ -#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT 0x4 -#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT 0x8 +#define IEEE80211_HE_SPR_PSR_DISALLOWED BIT(0) +#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED BIT(1) +#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT BIT(2) +#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT BIT(3) +#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED BIT(4) /* * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size -- cgit v1.2.3 From ef24f97daac4d9450c956ab165d8337c2feca0e9 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:51:57 +0200 Subject: mtd: rawnand: Separate the ECC engine type and the ECC byte placement The use of "syndrome" placement should not be encoded in the ECC engine mode/type. Create a "placement" field in NAND chip and change all occurrences of the NAND_ECC_HW_SYNDROME enumeration to be just NAND_ECC_HW and possibly a placement entry like NAND_ECC_PLACEMENT_INTERLEAVED. Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-10-miquel.raynal@bootlin.com --- include/linux/mtd/rawnand.h | 6 ++++-- include/linux/platform_data/mtd-davinci.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 10bbfbf4ad7f..cfd75a12f802 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -304,6 +304,7 @@ static const struct nand_ecc_caps __name = { \ /** * struct nand_ecc_ctrl - Control structure for ECC * @mode: ECC mode + * @placement: OOB bytes placement * @algo: ECC algorithm * @steps: number of ECC steps per page * @size: data bytes per ECC step @@ -331,7 +332,7 @@ static const struct nand_ecc_caps __name = { \ * controller and always return contiguous in-band and * out-of-band data even if they're not stored * contiguously on the NAND chip (e.g. - * NAND_ECC_HW_SYNDROME interleaves in-band and + * NAND_ECC_PLACEMENT_INTERLEAVED interleaves in-band and * out-of-band data). * @write_page_raw: function to write a raw page without ECC. This function * should hide the specific layout used by the ECC @@ -339,7 +340,7 @@ static const struct nand_ecc_caps __name = { \ * in-band and out-of-band data. ECC controller is * responsible for doing the appropriate transformations * to adapt to its specific layout (e.g. - * NAND_ECC_HW_SYNDROME interleaves in-band and + * NAND_ECC_PLACEMENT_INTERLEAVED interleaves in-band and * out-of-band data). 
* @read_page: function to read a page according to the ECC generator * requirements; returns maximum number of bitflips corrected in @@ -356,6 +357,7 @@ static const struct nand_ecc_caps __name = { \ */ struct nand_ecc_ctrl { enum nand_ecc_mode mode; + enum nand_ecc_placement placement; enum nand_ecc_algo algo; int steps; int size; diff --git a/include/linux/platform_data/mtd-davinci.h b/include/linux/platform_data/mtd-davinci.h index 03e92c71b3fa..6e2b252a4ce6 100644 --- a/include/linux/platform_data/mtd-davinci.h +++ b/include/linux/platform_data/mtd-davinci.h @@ -69,6 +69,7 @@ struct davinci_nand_pdata { /* platform_data */ * using it with large page chips. */ enum nand_ecc_mode ecc_mode; + enum nand_ecc_placement ecc_placement; u8 ecc_bits; /* e.g. NAND_BUSWIDTH_16 */ -- cgit v1.2.3 From bace41f80f65dc4ba13c892bac783e7e81847379 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:51:58 +0200 Subject: mtd: rawnand: Use the new ECC engine type enumeration Mechanical switch from the legacy "mode" enumeration to the new "engine type" enumeration in drivers and board files. The device tree parsing is also updated to return the new enumeration from the old strings. Signed-off-by: Miquel Raynal Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-11-miquel.raynal@bootlin.com --- include/linux/mtd/rawnand.h | 4 ++-- include/linux/platform_data/mtd-davinci.h | 8 ++++---- include/linux/platform_data/mtd-nand-s3c2410.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index cfd75a12f802..967b616c50df 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -303,7 +303,7 @@ static const struct nand_ecc_caps __name = { \ /** * struct nand_ecc_ctrl - Control structure for ECC - * @mode: ECC mode + * @engine_type: ECC engine type * @placement: OOB bytes placement * @algo: ECC algorithm * @steps: number of ECC steps per page @@ -356,7 +356,7 @@ static const struct nand_ecc_caps __name = { \ * @write_oob: function to write chip OOB data */ struct nand_ecc_ctrl { - enum nand_ecc_mode mode; + enum nand_ecc_engine_type engine_type; enum nand_ecc_placement placement; enum nand_ecc_algo algo; int steps; diff --git a/include/linux/platform_data/mtd-davinci.h b/include/linux/platform_data/mtd-davinci.h index 6e2b252a4ce6..dd474dd44848 100644 --- a/include/linux/platform_data/mtd-davinci.h +++ b/include/linux/platform_data/mtd-davinci.h @@ -60,15 +60,15 @@ struct davinci_nand_pdata { /* platform_data */ struct mtd_partition *parts; unsigned nr_parts; - /* none == NAND_ECC_NONE (strongly *not* advised!!) - * soft == NAND_ECC_SOFT - * else == NAND_ECC_HW, according to ecc_bits + /* none == NAND_ECC_ENGINE_TYPE_NONE (strongly *not* advised!!) + * soft == NAND_ECC_ENGINE_TYPE_SOFT + * else == NAND_ECC_ENGINE_TYPE_ON_HOST, according to ecc_bits * * All DaVinci-family chips support 1-bit hardware ECC. * Newer ones also support 4-bit ECC, but are awkward * using it with large page chips. 
*/ - enum nand_ecc_mode ecc_mode; + enum nand_ecc_engine_type engine_type; enum nand_ecc_placement ecc_placement; u8 ecc_bits; diff --git a/include/linux/platform_data/mtd-nand-s3c2410.h b/include/linux/platform_data/mtd-nand-s3c2410.h index 08675b16f9e1..25390fc3e795 100644 --- a/include/linux/platform_data/mtd-nand-s3c2410.h +++ b/include/linux/platform_data/mtd-nand-s3c2410.h @@ -49,7 +49,7 @@ struct s3c2410_platform_nand { unsigned int ignore_unset_ecc:1; - enum nand_ecc_mode ecc_mode; + enum nand_ecc_engine_type engine_type; int nr_sets; struct s3c2410_nand_set *sets; -- cgit v1.2.3 From d193792a26c216cb7db3cf12300c9414990fa603 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:51:59 +0200 Subject: mtd: nand: Create a helper to extract the ECC configuration Despite its current name, the eccreq field actually encodes both the NAND requirements and the final ECC configuration. That works fine when using on-die ECC since those 2 concepts match perfectly, but it starts being a problem as soon as we use on-host ECC engines, where we're not guaranteed to have a perfect match. Let's hide the ECC configuration access behind a helper so we can later split those 2 concepts. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-12-miquel.raynal@bootlin.com --- include/linux/mtd/nand.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 8cf5bdbea782..9cbb41a5541c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -523,6 +523,16 @@ nanddev_get_memorg(struct nand_device *nand) return &nand->memorg; } +/** + * nanddev_get_ecc_conf() - Extract the ECC configuration from a NAND device + * @nand: NAND device + */ +static inline const struct nand_ecc_props * +nanddev_get_ecc_conf(struct nand_device *nand) +{ + return &nand->eccreq; +} + int nanddev_init(struct nand_device *nand, const struct nand_ops *ops, struct module *owner); void nanddev_cleanup(struct nand_device *nand); -- cgit v1.2.3 From 3316c8e3ad1fcaeefd4ffa93587dd78fb24e8afa Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:52:01 +0200 Subject: mtd: nand: Create helpers to set/extract the ECC requirements Despite its current name, the eccreq field actually encodes both the NAND requirements and the final ECC configuration. That works fine when using on-die ECC since those 2 concepts match perfectly, but it starts being a problem as soon as we use on-host ECC engines, where we're not guaranteed to have a perfect match. Let's hide the ECC requirements access behind helpers so we can later split those 2 concepts. As the structures have not been clarified yet, these helpers access the same internal variable as nanddev_get_ecc_conf() for now. 
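As an illustration of the new accessors (not part of this patch), a detection routine could advertise the chip requirements through the helper instead of touching the field directly; this assumes struct nand_ecc_props carries the usual strength/step_size members:

	/* Hypothetical vendor/detection code: the chip needs 4 bits of
	 * correction per 512-byte ECC step.
	 */
	static void example_set_chip_ecc_reqs(struct nand_device *nand)
	{
		struct nand_ecc_props reqs = {
			.strength = 4,
			.step_size = 512,
		};

		nanddev_set_ecc_requirements(nand, &reqs);
	}

A controller driver would later read the values back with nanddev_get_ecc_requirements(nand) rather than dereferencing nand->eccreq, which keeps it insulated from the upcoming split between requirements and final configuration.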
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-14-miquel.raynal@bootlin.com --- include/linux/mtd/nand.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 9cbb41a5541c..348fb2ad4d90 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -533,6 +533,30 @@ nanddev_get_ecc_conf(struct nand_device *nand) return &nand->eccreq; } +/** + * nanddev_get_ecc_requirements() - Extract the ECC requirements from a NAND + * device + * @nand: NAND device + */ +static inline const struct nand_ecc_props * +nanddev_get_ecc_requirements(struct nand_device *nand) +{ + return &nand->eccreq; +} + +/** + * nanddev_set_ecc_requirements() - Assign the ECC requirements of a NAND + * device + * @nand: NAND device + * @reqs: Requirements + */ +static inline void +nanddev_set_ecc_requirements(struct nand_device *nand, + const struct nand_ecc_props *reqs) +{ + nand->eccreq = *reqs; +} + int nanddev_init(struct nand_device *nand, const struct nand_ops *ops, struct module *owner); void nanddev_cleanup(struct nand_device *nand); -- cgit v1.2.3 From 93ef92f6f42275e3d6070b1c5020bfca0e614fff Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:52:03 +0200 Subject: mtd: nand: Use the new generic ECC object Embed a generic NAND ECC high-level object in the nand_device structure to carry all the ECC engine configuration/data. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-16-miquel.raynal@bootlin.com --- include/linux/mtd/nand.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 348fb2ad4d90..697ea2474a7c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -301,7 +301,7 @@ struct nand_ecc { * struct nand_device - NAND device * @mtd: MTD instance attached to the NAND device * @memorg: memory layout - * @eccreq: ECC requirements + * @ecc: NAND ECC object attached to the NAND device * @rowconv: position to row address converter * @bbt: bad block table info * @ops: NAND operations attached to the NAND device @@ -309,8 +309,8 @@ struct nand_ecc { * Generic NAND object. Specialized NAND layers (raw NAND, SPI NAND, OneNAND) * should declare their own NAND object embedding a nand_device struct (that's * how inheritance is done). - * struct_nand_device->memorg and struct_nand_device->eccreq should be filled - * at device detection time to reflect the NAND device + * struct_nand_device->memorg and struct_nand_device->ecc.requirements should + * be filled at device detection time to reflect the NAND device * capabilities/requirements. Once this is done nanddev_init() can be called. 
* It will take care of converting NAND information into MTD ones, which means * the specialized NAND layers should never manually tweak @@ -319,7 +319,7 @@ struct nand_ecc { struct nand_device { struct mtd_info mtd; struct nand_memory_organization memorg; - struct nand_ecc_props eccreq; + struct nand_ecc ecc; struct nand_row_converter rowconv; struct nand_bbt bbt; const struct nand_ops *ops; @@ -530,7 +530,7 @@ nanddev_get_memorg(struct nand_device *nand) static inline const struct nand_ecc_props * nanddev_get_ecc_conf(struct nand_device *nand) { - return &nand->eccreq; + return &nand->ecc.ctx.conf; } /** @@ -541,7 +541,7 @@ nanddev_get_ecc_conf(struct nand_device *nand) static inline const struct nand_ecc_props * nanddev_get_ecc_requirements(struct nand_device *nand) { - return &nand->eccreq; + return &nand->ecc.requirements; } /** @@ -554,7 +554,7 @@ static inline void nanddev_set_ecc_requirements(struct nand_device *nand, const struct nand_ecc_props *reqs) { - nand->eccreq = *reqs; + nand->ecc.requirements = *reqs; } int nanddev_init(struct nand_device *nand, const struct nand_ops *ops, -- cgit v1.2.3 From 1b4d60ec162f82ea29a2e7a907b5c6cc9f926321 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 25 Sep 2020 13:54:29 -0700 Subject: bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint Add .test_run for raw_tracepoint. Also, introduce a new feature that runs the target program on a specific CPU. This is achieved by a new flag in bpf_attr.test, BPF_F_TEST_RUN_ON_CPU. When this flag is set, the program is triggered on cpu with id bpf_attr.test.cpu. This feature is needed for BPF programs that handle perf_event and other percpu resources, as the program can access these resource locally. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200925205432.1777-2-songliubraving@fb.com --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 79902325bef8..db6dcdee7933 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1396,6 +1396,9 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); -- cgit v1.2.3 From efc68158c429f37d87fd02ee9a26913c78546fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 25 Sep 2020 23:25:01 +0200 Subject: bpf: change logging calls from verbose() to bpf_log() and use log pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for moving code around, change a bunch of references to env->log (and the verbose() logging helper) to use bpf_log() and a direct pointer to struct bpf_verifier_log. While we're touching the function signature, mark the 'prog' argument to bpf_check_type_match() as const. Also enhance the bpf_verifier_log_needed() check to handle NULL pointers for the log struct so we can re-use the code with logging disabled. 
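As a side effect, shared helpers can now be called with logging disabled; a minimal sketch (illustrative only, assuming bpf_log() itself bails out when bpf_verifier_log_needed() is false):

	static int example_check(struct bpf_verifier_log *log,
				 const struct btf_type *t)
	{
		if (!btf_type_is_func_proto(t)) {
			/* no-op when log == NULL instead of a NULL dereference */
			bpf_log(log, "attach target is not a function\n");
			return -EINVAL;
		}

		return 0;
	}

Callers that only care about the verdict simply pass log == NULL.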
Acked-by: Andrii Nakryiko Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- include/linux/bpf_verifier.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index db6dcdee7933..5176726f4f03 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1420,7 +1420,7 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); -int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, +int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); struct bpf_prog *bpf_prog_by_id(u32 id); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2bb48a2c4d08..7bc9276c4ef4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -347,8 +347,9 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { - return (log->level && log->ubuf && !bpf_verifier_log_full(log)) || - log->level == BPF_LOG_KERNEL; + return log && + ((log->level && log->ubuf && !bpf_verifier_log_full(log)) || + log->level == BPF_LOG_KERNEL); } #define BPF_MAX_SUBPROGS 256 -- cgit v1.2.3 From f7b12b6fea00988496b7606d4964cd77beef46a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 25 Sep 2020 23:25:02 +0200 Subject: bpf: verifier: refactor check_attach_btf_id() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check_attach_btf_id() function really does three things: 1. It performs a bunch of checks on the program to ensure that the attachment is valid. 2. It stores a bunch of state about the attachment being requested in the verifier environment and struct bpf_prog objects. 3. It allocates a trampoline for the attachment. This patch splits out (1.) and (3.) into separate functions which will perform the checks, but return the computed values instead of directly modifying the environment. This is done in preparation for reusing the checks when the actual attachment is happening, which will allow tracing programs to have multiple (compatible) attachments. This also fixes a bug where a bunch of checks were skipped if a trampoline already existed for the tracing target. 
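The resulting call sequence looks roughly like this (an illustrative sketch built from the new prototypes below, not the exact verifier code):

	struct bpf_attach_target_info tgt_info = {};
	struct bpf_trampoline *tr;
	int err;

	/* 1) pure checks; results are returned in tgt_info */
	err = bpf_check_attach_target(log, prog, tgt_prog, btf_id, &tgt_info);
	if (err)
		return err;

	/* 2) look up or allocate the trampoline for this target */
	tr = bpf_trampoline_get(bpf_trampoline_compute_key(tgt_prog, btf_id),
				&tgt_info);
	if (IS_ERR_OR_NULL(tr))
		return -ENOENT;	/* illustrative error handling */

Step 2 can now also run at attach time, which is what makes multiple (compatible) attachments possible later in the series.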
Fixes: 6ba43b761c41 ("bpf: Attachment verification for BPF_MODIFY_RETURN") Fixes: 1e6c62a88215 ("bpf: Introduce sleepable BPF programs") Acked-by: Andrii Nakryiko Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 ++++++++++++++----- include/linux/bpf_verifier.h | 13 +++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5176726f4f03..b89a30764069 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -606,6 +606,13 @@ struct bpf_trampoline { struct bpf_ksym ksym; }; +struct bpf_attach_target_info { + struct btf_func_model fmodel; + long tgt_addr; + const char *tgt_name; + const struct btf_type *tgt_type; +}; + #define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */ struct bpf_dispatcher_prog { @@ -633,9 +640,10 @@ static __always_inline unsigned int bpf_dispatcher_nop_func( return bpf_func(ctx, insnsi); } #ifdef CONFIG_BPF_JIT -struct bpf_trampoline *bpf_trampoline_lookup(u64 key); int bpf_trampoline_link_prog(struct bpf_prog *prog); int bpf_trampoline_unlink_prog(struct bpf_prog *prog); +struct bpf_trampoline *bpf_trampoline_get(u64 key, + struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ @@ -680,10 +688,6 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); #else -static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key) -{ - return NULL; -} static inline int bpf_trampoline_link_prog(struct bpf_prog *prog) { return -ENOTSUPP; @@ -692,6 +696,11 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) { return -ENOTSUPP; } +static inline struct bpf_trampoline *bpf_trampoline_get(u64 key, + struct bpf_attach_target_info *tgt_info) +{ + return ERR_PTR(-EOPNOTSUPP); +} static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #define DEFINE_BPF_DISPATCHER(name) #define DECLARE_BPF_DISPATCHER(name) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7bc9276c4ef4..363b4f1c562a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -450,4 +450,17 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); int check_ctx_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); +/* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ +static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, + u32 btf_id) +{ + return tgt_prog ? 
(((u64)tgt_prog->aux->id) << 32 | btf_id) : btf_id; +} + +int bpf_check_attach_target(struct bpf_verifier_log *log, + const struct bpf_prog *prog, + const struct bpf_prog *tgt_prog, + u32 btf_id, + struct bpf_attach_target_info *tgt_info); + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From 76654e67f3a01c50dc13dd6dea75e58943413956 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:03 +0100 Subject: bpf: Provide function to get vmlinux BTF information It will be used later for BPF structure display support Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-2-git-send-email-alan.maguire@oracle.com --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b89a30764069..e620a4b1290f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1364,6 +1364,8 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, union bpf_attr __user *uattr); void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +struct btf *bpf_get_btf_vmlinux(void); + /* Map specifics */ struct xdp_buff; struct sk_buff; -- cgit v1.2.3 From 31d0bc81637d8d974a6dad9827b765b4b70c89d7 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:04 +0100 Subject: bpf: Move to generic BTF show support, apply it to seq files/strings generalize the "seq_show" seq file support in btf.c to support a generic show callback of which we support two instances; the current seq file show, and a show with snprintf() behaviour which instead writes the type data to a supplied string. Both classes of show function call btf_type_show() with different targets; the seq file or the string to be written. In the string case we need to track additional data - length left in string to write and length to return that we would have written (a la snprintf). By default show will display type information, field members and their types and values etc, and the information is indented based upon structure depth. Zeroed fields are omitted. Show however supports flags which modify its behaviour: BTF_SHOW_COMPACT - suppress newline/indent. BTF_SHOW_NONAME - suppress show of type and member names. BTF_SHOW_PTR_RAW - do not obfuscate pointer values. BTF_SHOW_UNSAFE - do not copy data to safe buffer before display. BTF_SHOW_ZERO - show zeroed values (by default they are not shown). Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-3-git-send-email-alan.maguire@oracle.com --- include/linux/btf.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index a9af5e7a7ece..d0f5d3c9ec3d 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -13,6 +13,7 @@ struct btf; struct btf_member; struct btf_type; union bpf_attr; +struct btf_show; extern const struct file_operations btf_fops; @@ -46,8 +47,43 @@ int btf_get_info_by_fd(const struct btf *btf, const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size); + +/* + * Options to control show behaviour. + * - BTF_SHOW_COMPACT: no formatting around type information + * - BTF_SHOW_NONAME: no struct/union member names/types + * - BTF_SHOW_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. 
+ * - BTF_SHOW_ZERO: show zero-valued struct/union members; they + * are not displayed by default + * - BTF_SHOW_UNSAFE: skip use of bpf_probe_read() to safely read + * data before displaying it. + */ +#define BTF_SHOW_COMPACT (1ULL << 0) +#define BTF_SHOW_NONAME (1ULL << 1) +#define BTF_SHOW_PTR_RAW (1ULL << 2) +#define BTF_SHOW_ZERO (1ULL << 3) +#define BTF_SHOW_UNSAFE (1ULL << 4) + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); + +/* + * Copy len bytes of string representation of obj of BTF type_id into buf. + * + * @btf: struct btf object + * @type_id: type id of type obj points to + * @obj: pointer to typed data + * @buf: buffer to write to + * @len: maximum length to write to buf + * @flags: show options (see above) + * + * Return: length that would have been/was copied as per snprintf, or + * negative error. + */ +int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, + char *buf, int len, u64 flags); + int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, -- cgit v1.2.3 From c4d0bfb45068d853a478b9067a95969b1886a30f Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:05 +0100 Subject: bpf: Add bpf_snprintf_btf helper A helper is added to support tracing kernel type information in BPF using the BPF Type Format (BTF). Its signature is long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags); struct btf_ptr * specifies - a pointer to the data to be traced - the BTF id of the type of data pointed to - a flags field is provided for future use; these flags are not to be confused with the BTF_F_* flags below that control how the btf_ptr is displayed; the flags member of the struct btf_ptr may be used to disambiguate types in kernel versus module BTF, etc; the main distinction is the flags relate to the type and information needed in identifying it; not how it is displayed. For example a BPF program with a struct sk_buff *skb could do the following: static struct btf_ptr b = { }; b.ptr = skb; b.type_id = __builtin_btf_type_id(struct sk_buff, 1); bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0, 0); Default output looks like this: (struct sk_buff){ .transport_header = (__u16)65535, .mac_header = (__u16)65535, .end = (sk_buff_data_t)192, .head = (unsigned char *)0x000000007524fd8b, .data = (unsigned char *)0x000000007524fd8b, .truesize = (unsigned int)768, .users = (refcount_t){ .refs = (atomic_t){ .counter = (int)1, }, }, } Flags modifying display are as follows: - BTF_F_COMPACT: no formatting around type information - BTF_F_NONAME: no struct/union member names/types - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; equivalent to %px. 
- BTF_F_ZERO: show zero-valued struct/union members; they are not displayed by default Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-4-git-send-email-alan.maguire@oracle.com --- include/linux/bpf.h | 1 + include/linux/btf.h | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e620a4b1290f..768b533ba48e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1822,6 +1822,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; +extern const struct bpf_func_proto bpf_snprintf_btf_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/btf.h b/include/linux/btf.h index d0f5d3c9ec3d..3e5cdc2ba963 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -6,6 +6,7 @@ #include #include +#include #define BTF_TYPE_EMIT(type) ((void)(type *)0) @@ -59,10 +60,10 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, * - BTF_SHOW_UNSAFE: skip use of bpf_probe_read() to safely read * data before displaying it. */ -#define BTF_SHOW_COMPACT (1ULL << 0) -#define BTF_SHOW_NONAME (1ULL << 1) -#define BTF_SHOW_PTR_RAW (1ULL << 2) -#define BTF_SHOW_ZERO (1ULL << 3) +#define BTF_SHOW_COMPACT BTF_F_COMPACT +#define BTF_SHOW_NONAME BTF_F_NONAME +#define BTF_SHOW_PTR_RAW BTF_F_PTR_RAW +#define BTF_SHOW_ZERO BTF_F_ZERO #define BTF_SHOW_UNSAFE (1ULL << 4) void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, -- cgit v1.2.3 From eb411377aed9e27835e77ee0710ee8f4649958f3 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:09 +0100 Subject: bpf: Add bpf_seq_printf_btf helper A helper is added to allow seq file writing of kernel data structures using vmlinux BTF. Its signature is long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags); Flags and struct btf_ptr definitions/use are identical to the bpf_snprintf_btf helper, and the helper returns 0 on success or a negative error value. Suggested-by: Alexei Starovoitov Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-8-git-send-email-alan.maguire@oracle.com --- include/linux/btf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 3e5cdc2ba963..024e16ff7dcc 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -68,6 +68,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); +int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m, u64 flags); /* * Copy len bytes of string representation of obj of BTF type_id into buf. -- cgit v1.2.3 From b4c5f83ae3f3e2b3239751c304e424eace62448b Mon Sep 17 00:00:00 2001 From: Rusaimi Amira Ruslan Date: Mon, 28 Sep 2020 18:12:12 +0800 Subject: stmmac: intel: Adding ref clock 1us tic for LPI cntr Adding reference clock (1us tic) for all LPI timer on Intel platforms. The reference clock is derived from ptp clk. This also enables all LPI counter. 
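A hypothetical glue-driver snippet (not part of this diff) shows the intent; clk_ptp_rate is the existing PTP clock rate field in struct plat_stmmacenet_data:

	/* seed the 1us tic reference from the PTP clock */
	plat->eee_usecs_rate = plat->clk_ptp_rate;

	/* the MAC core is then expected to be programmed with
	 * (eee_usecs_rate / 1000000) - 1 so that the LPI timers and
	 * counters get one tic per microsecond
	 */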
Signed-off-by: Rusaimi Amira Ruslan Signed-off-by: Voon Weifeng Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 00e83c877496..628e28903b8b 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -200,5 +200,6 @@ struct plat_stmmacenet_data { int has_xgmac; bool vlan_fail_q_en; u8 vlan_fail_q; + unsigned int eee_usecs_rate; }; #endif -- cgit v1.2.3 From a5d02e901e6dadd7dd60fafb6448a822a47430ff Mon Sep 17 00:00:00 2001 From: Vaibhav Gupta Date: Fri, 31 Jul 2020 01:14:16 +0530 Subject: PCI/PM: Remove unused pcibios_pm_ops The "struct dev_pm_ops pcibios_pm_ops", declared in include/linux/pci.h and defined in drivers/pci/pci-driver.c, provided arch-specific hooks when a PCI device was doing a hibernate transition. 394216275c7d ("s390: remove broken hibernate / power management support") removed the last use of pcibios_pm_ops, so remove it completely. [bhelgaas: drop unused "error"] Link: https://lore.kernel.org/r/20200730194416.1029509-1-vaibhavgupta40@gmail.com Reported-by: Bjorn Helgaas Signed-off-by: Vaibhav Gupta Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 835530605c0d..c9e169c4e216 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2034,10 +2034,6 @@ int pcibios_alloc_irq(struct pci_dev *dev); void pcibios_free_irq(struct pci_dev *dev); resource_size_t pcibios_default_alignment(void); -#ifdef CONFIG_HIBERNATE_CALLBACKS -extern struct dev_pm_ops pcibios_pm_ops; -#endif - #if defined(CONFIG_PCI_MMCONFIG) || defined(CONFIG_ACPI_MCFG) void __init pci_mmcfg_early_init(void); void __init pci_mmcfg_late_init(void); -- cgit v1.2.3 From 3789af9a13e5561738c0f2114e3a5e22c843ca3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Thu, 30 Jul 2020 21:08:48 +0000 Subject: PCI/PM: Rename pci_dev.d3_delay to d3hot_delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI devices support two variants of the D3 power state: D3hot (main power present) D3cold (main power removed). Previously struct pci_dev contained: unsigned int d3_delay; /* D3->D0 transition time in ms */ unsigned int d3cold_delay; /* D3cold->D0 transition time in ms */ "d3_delay" refers specifically to the D3hot state. Rename it to "d3hot_delay" to avoid ambiguity and align with the ACPI "_DSM for Specifying Device Readiness Durations" in the PCI Firmware spec r3.2, sec 4.6.9. There is no change to the functionality. 
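A typical consumer of the renamed field is a device quirk; an illustrative example with placeholder vendor/device IDs:

	static void quirk_example_d3hot_delay(struct pci_dev *dev)
	{
		/* this device needs extra time for the D3hot -> D0 transition */
		dev->d3hot_delay = 20;	/* ms */
	}
	DECLARE_PCI_FIXUP_FINAL(0xabcd, 0x1234, quirk_example_d3hot_delay);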
Link: https://lore.kernel.org/r/20200730210848.1578826-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index c9e169c4e216..bea1a03faab6 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -373,7 +373,7 @@ struct pci_dev { user sysfs */ unsigned int clear_retrain_link:1; /* Need to clear Retrain Link bit manually */ - unsigned int d3_delay; /* D3->D0 transition time in ms */ + unsigned int d3hot_delay; /* D3hot->D0 transition time in ms */ unsigned int d3cold_delay; /* D3cold->D0 transition time in ms */ #ifdef CONFIG_PCIEASPM -- cgit v1.2.3 From 3aac1ead5eb6b76f3d2b8d111e6de1c2de23fb34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 29 Sep 2020 14:45:50 +0200 Subject: bpf: Move prog->aux->linked_prog and trampoline into bpf_link on attach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for allowing multiple attachments of freplace programs, move the references to the target program and trampoline into the bpf_tracing_link structure when that is created. To do this atomically, introduce a new mutex in prog->aux to protect writing to the two pointers to target prog and trampoline, and rename the members to make it clear that they are related. With this change, it is no longer possible to attach the same tracing program multiple times (detaching in-between), since the reference from the tracing program to the target disappears on the first attach. However, since the next patch will let the caller supply an attach target, that will also make it possible to attach to the same place multiple times. 
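The ownership transfer described above boils down to roughly this pattern (illustrative; the tracing-link field names are assumptions, not verbatim kernel code):

	/* at attach time, move the pending target references from the prog
	 * into the newly created bpf_tracing_link
	 */
	mutex_lock(&prog->aux->dst_mutex);
	link->tgt_prog = prog->aux->dst_prog;
	link->trampoline = prog->aux->dst_trampoline;
	prog->aux->dst_prog = NULL;
	prog->aux->dst_trampoline = NULL;
	mutex_unlock(&prog->aux->dst_mutex);

Once the pointers are cleared, a later attach can supply a fresh target instead of silently reusing a stale one.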
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/160138355059.48470.2503076992210324984.stgit@toke.dk --- include/linux/bpf.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 768b533ba48e..839dd8670a7a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -640,8 +640,8 @@ static __always_inline unsigned int bpf_dispatcher_nop_func( return bpf_func(ctx, insnsi); } #ifdef CONFIG_BPF_JIT -int bpf_trampoline_link_prog(struct bpf_prog *prog); -int bpf_trampoline_unlink_prog(struct bpf_prog *prog); +int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); +int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); @@ -688,11 +688,13 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); #else -static inline int bpf_trampoline_link_prog(struct bpf_prog *prog) +static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, + struct bpf_trampoline *tr) { return -ENOTSUPP; } -static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) +static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog, + struct bpf_trampoline *tr) { return -ENOTSUPP; } @@ -763,7 +765,9 @@ struct bpf_prog_aux { u32 max_rdonly_access; u32 max_rdwr_access; const struct bpf_ctx_arg_aux *ctx_arg_info; - struct bpf_prog *linked_prog; + struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ + struct bpf_prog *dst_prog; + struct bpf_trampoline *dst_trampoline; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ @@ -771,7 +775,6 @@ struct bpf_prog_aux { bool sleepable; bool tail_call_reachable; enum bpf_tramp_prog_type trampoline_prog_type; - struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; -- cgit v1.2.3 From 4a1e7c0c63e02daad751842b7880f9bbcdfb6e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 29 Sep 2020 14:45:51 +0200 Subject: bpf: Support attaching freplace programs to multiple attach points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This enables support for attaching freplace programs to multiple attach points. It does this by amending the UAPI for bpf_link_Create with a target btf ID that can be used to supply the new attachment point along with the target program fd. The target must be compatible with the target that was supplied at program load time. The implementation reuses the checks that were factored out of check_attach_btf_id() to ensure compatibility between the BTF types of the old and new attachment. If these match, a new bpf_tracing_link will be created for the new attach target, allowing multiple attachments to co-exist simultaneously. The code could theoretically support multiple-attach of other types of tracing programs as well, but since I don't have a use case for any of those, there is no API support for doing so. 
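From user space the end result looks roughly like this, assuming the accompanying libbpf helper bpf_program__attach_freplace() from the same series (target function names are placeholders):

	/* one freplace program, two attach points */
	struct bpf_link *l1, *l2;

	l1 = bpf_program__attach_freplace(freplace_prog, tgt_fd_a, "func_a");
	if (libbpf_get_error(l1))
		return -1;

	l2 = bpf_program__attach_freplace(freplace_prog, tgt_fd_b, "func_b");
	if (libbpf_get_error(l2))
		return -1;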
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/160138355169.48470.17165680973640685368.stgit@toke.dk --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 839dd8670a7a..50e5c4b52bd1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -768,6 +768,8 @@ struct bpf_prog_aux { struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; struct bpf_trampoline *dst_trampoline; + enum bpf_prog_type saved_dst_prog_type; + enum bpf_attach_type saved_dst_attach_type; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ -- cgit v1.2.3 From c11171a413385c1a72291cdb4a7435cb189762ab Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 29 Sep 2020 22:25:12 +0200 Subject: net: Add netif_rx_any_context() Quite some drivers make conditional decisions based on in_interrupt() to invoke either netif_rx() or netif_rx_ni(). Conditionals based on in_interrupt() or other variants of preempt count checks in drivers should not exist for various reasons and Linus clearly requested to either split the code pathes or pass an argument to the common functions which provides the context. This is obviously the correct solution, but for some of the affected drivers this needs a major rewrite due to their convoluted structure. As in_interrupt() usage in drivers needs to be phased out, provide netif_rx_any_context() as a stop gap for these drivers. This confines the in_interrupt() conditional to core code which in turn allows to remove the access to this check for driver code and provides one central place to do further modifications once the driver maze is cleaned up. Suggested-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a431c3229cbf..28cfa53daf72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3785,6 +3785,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); +int netif_rx_any_context(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); void netif_receive_skb_list(struct list_head *head); -- cgit v1.2.3 From 714fb2fbe7379d0a413be217194f7af9814f2079 Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Thu, 24 Sep 2020 13:42:11 +0530 Subject: mtd: hyperbus: Provide per device private pointer Provide per device private pointer that can be used by controller drivers to store device specific private data. 
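An illustrative controller setup path (names hypothetical) that makes use of the new pointer:

	struct example_hb_data {
		void __iomem *regs;
		u32 cs;
	};

	static int example_hb_setup(struct hyperbus_device *hbdev)
	{
		struct example_hb_data *data;

		data = devm_kzalloc(hbdev->ctlr->dev, sizeof(*data), GFP_KERNEL);
		if (!data)
			return -ENOMEM;

		hbdev->priv = data;	/* retrieved again from the op callbacks */
		return 0;
	}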
Signed-off-by: Vignesh Raghavendra Link: https://lore.kernel.org/r/20200924081214.16934-2-vigneshr@ti.com --- include/linux/mtd/hyperbus.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/hyperbus.h b/include/linux/mtd/hyperbus.h index 2129f7d3b6eb..d8cb1aec826d 100644 --- a/include/linux/mtd/hyperbus.h +++ b/include/linux/mtd/hyperbus.h @@ -20,6 +20,7 @@ enum hyperbus_memtype { * @mtd: pointer to MTD struct * @ctlr: pointer to HyperBus controller struct * @memtype: type of memory device: HyperFlash or HyperRAM + * @priv: pointer to controller specific per device private data */ struct hyperbus_device { @@ -28,6 +29,7 @@ struct hyperbus_device { struct mtd_info *mtd; struct hyperbus_ctlr *ctlr; enum hyperbus_memtype memtype; + void *priv; }; /** -- cgit v1.2.3 From 1e3b37aab958861a9d0c01ff6dbec96a82743701 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:52:05 +0200 Subject: mtd: rawnand: Use the ECC framework OOB layouts No need to have our own in the raw NAND core. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-18-miquel.raynal@bootlin.com --- include/linux/mtd/rawnand.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 967b616c50df..1bbce6fa1b08 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -14,6 +14,7 @@ #define __LINUX_MTD_RAWNAND_H #include +#include #include #include #include @@ -1156,9 +1157,6 @@ struct nand_chip { void *priv; }; -extern const struct mtd_ooblayout_ops nand_ooblayout_sp_ops; -extern const struct mtd_ooblayout_ops nand_ooblayout_lp_ops; - static inline struct nand_chip *mtd_to_nand(struct mtd_info *mtd) { return container_of(mtd, struct nand_chip, base.mtd); -- cgit v1.2.3 From d7157ff49a5b5845b37b8f2bf31607f0af295ef1 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:52:07 +0200 Subject: mtd: rawnand: Use the ECC framework user input parsing bits Many helpers are generic to all NAND chips, they should not be raw-NAND specific, so use the generic ones. To avoid moving all the raw NAND core "history" into the generic NAND layer, we keep a part of this parsing in the raw NAND core to ensure backward compatibility. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-20-miquel.raynal@bootlin.com --- include/linux/mtd/rawnand.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 1bbce6fa1b08..671e60948deb 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -81,18 +81,6 @@ struct nand_chip; #define NAND_DATA_IFACE_CHECK_ONLY -1 -/* - * Constants for ECC_MODES - */ -enum nand_ecc_mode { - NAND_ECC_INVALID, - NAND_ECC_NONE, - NAND_ECC_SOFT, - NAND_ECC_HW, - NAND_ECC_HW_SYNDROME, - NAND_ECC_ON_DIE, -}; - /* * Constants for Hardware ECC */ -- cgit v1.2.3 From b5156335ac37f186812090795ed27884a76c3266 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 27 Aug 2020 10:52:08 +0200 Subject: mtd: rawnand: Use the NAND framework user_conf object for ECC flags Instead of storing the ECC flags in chip->ecc.options, use nanddev->ecc.user_conf.flags. There is currently only one to save: NAND_ECC_MAXIMIZE. 
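Illustratively, a check that used to look at chip->ecc.options now reads the generic user configuration; the flag name on the generic side is assumed here to be NAND_ECC_MAXIMIZE_STRENGTH (NAND_ECC_MAXIMIZE in the old raw NAND naming):

	struct nand_device *base = &chip->base;

	/* was: if (chip->ecc.options & NAND_ECC_MAXIMIZE) */
	if (base->ecc.user_conf.flags & NAND_ECC_MAXIMIZE_STRENGTH)
		example_pick_strongest_ecc(chip);	/* hypothetical helper */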
Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20200827085208.16276-21-miquel.raynal@bootlin.com --- include/linux/mtd/rawnand.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 671e60948deb..aac07940de09 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -98,7 +98,6 @@ struct nand_chip; * pages and you want to rely on the default implementation. */ #define NAND_ECC_GENERIC_ERASED_CHECK BIT(0) -#define NAND_ECC_MAXIMIZE BIT(1) /* * Option constants for bizarre disfunctionality and real -- cgit v1.2.3 From 92acdc58ab11af66fcaef485433fde61b5e32fac Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2020 17:18:16 +0200 Subject: bpf, net: Rework cookie generator as per-cpu one With its use in BPF, the cookie generator can be called very frequently in particular when used out of cgroup v2 hooks (e.g. connect / sendmsg) and attached to the root cgroup, for example, when used in v1/v2 mixed environments. In particular, when there's a high churn on sockets in the system there can be many parallel requests to the bpf_get_socket_cookie() and bpf_get_netns_cookie() helpers which then cause contention on the atomic counter. As similarly done in f991bd2e1421 ("fs: introduce a per-cpu last_ino allocator"), add a small helper library that both can use for the 64 bit counters. Given this can be called from different contexts, we also need to deal with potential nested calls even though in practice they are considered extremely rare. One idea as suggested by Eric Dumazet was to use a reverse counter for this situation since we don't expect 64 bit overflows anyways; that way, we can avoid bigger gaps in the 64 bit counter space compared to just batch-wise increase. Even on machines with small number of cores (e.g. 4) the cookie generation shrinks from min/max/med/avg (ns) of 22/50/40/38.9 down to 10/35/14/17.3 when run in parallel from multiple CPUs. 
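A new consumer of the helper library would look like this (name hypothetical); as the sock_diag wrapper below shows, callers disable preemption around gen_cookie_next():

	#include <linux/cookie.h>

	DEFINE_COOKIE(example_cookie);

	static u64 example_gen_cookie(void)
	{
		u64 val;

		preempt_disable();
		val = gen_cookie_next(&example_cookie);
		preempt_enable();

		return val;
	}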
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Cc: Eric Dumazet Link: https://lore.kernel.org/bpf/8a80b8d27d3c49f9a14e1d5213c19d8be87d1dc8.1601477936.git.daniel@iogearbox.net --- include/linux/cookie.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sock_diag.h | 14 ++++++++++++- 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 include/linux/cookie.h (limited to 'include/linux') diff --git a/include/linux/cookie.h b/include/linux/cookie.h new file mode 100644 index 000000000000..0c159f585109 --- /dev/null +++ b/include/linux/cookie.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_COOKIE_H +#define __LINUX_COOKIE_H + +#include +#include +#include + +struct pcpu_gen_cookie { + local_t nesting; + u64 last; +} __aligned(16); + +struct gen_cookie { + struct pcpu_gen_cookie __percpu *local; + atomic64_t forward_last ____cacheline_aligned_in_smp; + atomic64_t reverse_last; +}; + +#define COOKIE_LOCAL_BATCH 4096 + +#define DEFINE_COOKIE(name) \ + static DEFINE_PER_CPU(struct pcpu_gen_cookie, __##name); \ + static struct gen_cookie name = { \ + .local = &__##name, \ + .forward_last = ATOMIC64_INIT(0), \ + .reverse_last = ATOMIC64_INIT(0), \ + } + +static __always_inline u64 gen_cookie_next(struct gen_cookie *gc) +{ + struct pcpu_gen_cookie *local = this_cpu_ptr(gc->local); + u64 val; + + if (likely(local_inc_return(&local->nesting) == 1)) { + val = local->last; + if (__is_defined(CONFIG_SMP) && + unlikely((val & (COOKIE_LOCAL_BATCH - 1)) == 0)) { + s64 next = atomic64_add_return(COOKIE_LOCAL_BATCH, + &gc->forward_last); + val = next - COOKIE_LOCAL_BATCH; + } + local->last = ++val; + } else { + val = atomic64_dec_return(&gc->reverse_last); + } + local_dec(&local->nesting); + return val; +} + +#endif /* __LINUX_COOKIE_H */ diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 15fe980a27ea..0b9ecd8cf979 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -25,7 +25,19 @@ void sock_diag_unregister(const struct sock_diag_handler *h); void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); -u64 sock_gen_cookie(struct sock *sk); +u64 __sock_gen_cookie(struct sock *sk); + +static inline u64 sock_gen_cookie(struct sock *sk) +{ + u64 cookie; + + preempt_disable(); + cookie = __sock_gen_cookie(sk); + preempt_enable(); + + return cookie; +} + int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie); void sock_diag_save_cookie(struct sock *sk, __u32 *cookie); -- cgit v1.2.3 From b4ab31414970a7a03a5d55d75083f2c101a30592 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2020 17:18:17 +0200 Subject: bpf: Add redirect_neigh helper as redirect drop-in Add a redirect_neigh() helper as redirect() drop-in replacement for the xmit side. Main idea for the helper is to be very similar in semantics to the latter just that the skb gets injected into the neighboring subsystem in order to let the stack do the work it knows best anyway to populate the L2 addresses of the packet and then hand over to dev_queue_xmit() as redirect() does. 
This solves two bigger items: i) skbs don't need to go up to the stack on the host facing veth ingress side for traffic egressing the container to achieve the same for populating L2 which also has the huge advantage that ii) the skb->sk won't get orphaned in ip_rcv_core() when entering the IP routing layer on the host stack. Given that skb->sk neither gets orphaned when crossing the netns as per 9c4c325252c5 ("skbuff: preserve sock reference when scrubbing the skb.") the helper can then push the skbs directly to the phys device where FQ scheduler can do its work and TCP stack gets proper backpressure given we hold on to skb->sk as long as skb is still residing in queues. With the helper used in BPF data path to then push the skb to the phys device, I observed a stable/consistent TCP_STREAM improvement on veth devices for traffic going container -> host -> host -> container from ~10Gbps to ~15Gbps for a single stream in my test environment. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: David Ahern Acked-by: Martin KaFai Lau Cc: David Ahern Link: https://lore.kernel.org/bpf/f207de81629e1724899b73b8112e0013be782d35.1601477936.git.daniel@iogearbox.net --- include/linux/skbuff.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 04a18e01b362..3d0cf3722bb4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2548,6 +2548,11 @@ static inline int skb_mac_header_was_set(const struct sk_buff *skb) return skb->mac_header != (typeof(skb->mac_header))~0U; } +static inline void skb_unset_mac_header(struct sk_buff *skb) +{ + skb->mac_header = (typeof(skb->mac_header))~0U; +} + static inline void skb_reset_mac_header(struct sk_buff *skb) { skb->mac_header = skb->data - skb->head; -- cgit v1.2.3 From 20c168be684a97b084525906eb7ed017b7f9c0b8 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 30 Sep 2020 12:50:59 +0200 Subject: net: macb: move pdata to private header struct macb_platform_data is only used by macb_pci to register the platform device, move its definition to cadence/macb.h and remove platform_data/macb.h Signed-off-by: Alexandre Belloni Signed-off-by: David S. Miller --- include/linux/platform_data/macb.h | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 include/linux/platform_data/macb.h (limited to 'include/linux') diff --git a/include/linux/platform_data/macb.h b/include/linux/platform_data/macb.h deleted file mode 100644 index aa5b5562d6f7..000000000000 --- a/include/linux/platform_data/macb.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2004-2006 Atmel Corporation - */ -#ifndef __MACB_PDATA_H__ -#define __MACB_PDATA_H__ - -#include - -/** - * struct macb_platform_data - platform data for MACB Ethernet - * @pclk: platform clock - * @hclk: AHB clock - */ -struct macb_platform_data { - struct clk *pclk; - struct clk *hclk; -}; - -#endif /* __MACB_PDATA_H__ */ -- cgit v1.2.3 From 7cd7becdddb00620fb8deb74e6fe4e5a1522ae5a Mon Sep 17 00:00:00 2001 From: sunils Date: Fri, 11 Sep 2020 00:13:26 +0300 Subject: net/mlx5: E-switch, Use PF num in metadata reg c0 Currently only 256 vports can be supported as only 8 bits are reserved for them and 8 bits are reserved for vhca_ids in metadata reg c0. To support more than 256 vports, replace vhca_id with a unique shorter 4-bit PF number which covers upto 16 PF's. Use remaining 12 bits for vports ranging 1-4095. 
This will continue to generate unique metadata even if multiple PCI devices have same switch_id. Signed-off-by: sunils Reviewed-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eswitch.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index c16827eeba9c..b0ae8020f13e 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -74,15 +74,16 @@ bool mlx5_eswitch_reg_c1_loopback_enabled(const struct mlx5_eswitch *esw); bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw); /* Reg C0 usage: - * Reg C0 = < ESW_VHCA_ID_BITS(8) | ESW_VPORT BITS(8) | ESW_CHAIN_TAG(16) > + * Reg C0 = < ESW_PFNUM_BITS(4) | ESW_VPORT BITS(12) | ESW_CHAIN_TAG(16) > * - * Highest 8 bits of the reg c0 is the vhca_id, next 8 bits is vport_num, - * the rest (lowest 16 bits) is left for tc chain tag restoration. - * VHCA_ID + VPORT comprise the SOURCE_PORT matching. + * Highest 4 bits of the reg c0 is the PF_NUM (range 0-15), 12 bits of + * unique non-zero vport id (range 1-4095). The rest (lowest 16 bits) is left + * for tc chain tag restoration. + * PFNUM + VPORT comprise the SOURCE_PORT matching. */ -#define ESW_VHCA_ID_BITS 8 -#define ESW_VPORT_BITS 8 -#define ESW_SOURCE_PORT_METADATA_BITS (ESW_VHCA_ID_BITS + ESW_VPORT_BITS) +#define ESW_VPORT_BITS 12 +#define ESW_PFNUM_BITS 4 +#define ESW_SOURCE_PORT_METADATA_BITS (ESW_PFNUM_BITS + ESW_VPORT_BITS) #define ESW_SOURCE_PORT_METADATA_OFFSET (32 - ESW_SOURCE_PORT_METADATA_BITS) #define ESW_CHAIN_TAG_METADATA_BITS (32 - ESW_SOURCE_PORT_METADATA_BITS) #define ESW_CHAIN_TAG_METADATA_MASK GENMASK(ESW_CHAIN_TAG_METADATA_BITS - 1,\ -- cgit v1.2.3 From 41a7431dbaa37533c3b732cdea425a7b8f2d4162 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 19 Sep 2020 16:04:18 +0200 Subject: power: supply: bq27xxx: add support for TI bq34z100 Add support for new device: the TI bq34z100-G1, a Wide Range Fuel Gauge for Li-Ion, PbA, NiMH, and NiCd batteries. The device shares a lot with other models, although it has its own differences requiring new quirks. This patch was tested on a system equipped with NiMH batteries. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel --- include/linux/power/bq27xxx_battery.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index 987d9652aa4e..111a40d0d3d5 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -32,6 +32,7 @@ enum bq27xxx_chip { BQ27621, BQ27Z561, BQ28Z610, + BQ34Z100, }; struct bq27xxx_device_info; -- cgit v1.2.3 From a4947e84f23474803b62a2759b5808147e4e15f9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 13 Sep 2020 13:29:28 +0300 Subject: overflow: Include header file with SIZE_MAX declaration The various array_size functions use SIZE_MAX define, but missed limits.h causes to failure to compile code that needs overflow.h. 
In file included from drivers/infiniband/core/uverbs_std_types_device.c:6: ./include/linux/overflow.h: In function 'array_size': ./include/linux/overflow.h:258:10: error: 'SIZE_MAX' undeclared (first use in this function) 258 | return SIZE_MAX; | ^~~~~~~~ Fixes: 610b15c50e86 ("overflow.h: Add allocation size calculation helpers") Link: https://lore.kernel.org/r/20200913102928.134985-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/overflow.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 93fcef105061..ff3c48f0abc5 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -3,6 +3,7 @@ #define __LINUX_OVERFLOW_H #include +#include /* * In the fallback code below, we need to compute the minimum and -- cgit v1.2.3 From 4976b718c3551faba2c0616ef55ebeb74db1c5ca Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:44 -0700 Subject: bpf: Introduce pseudo_btf_id Pseudo_btf_id is a type of ld_imm insn that associates a btf_id to a ksym so that further dereferences on the ksym can use the BTF info to validate accesses. Internally, when seeing a pseudo_btf_id ld insn, the verifier reads the btf_id stored in the insn[0]'s imm field and marks the dst_reg as PTR_TO_BTF_ID. The btf_id points to a VAR_KIND, which is encoded in btf_vminux by pahole. If the VAR is not of a struct type, the dst reg will be marked as PTR_TO_MEM instead of PTR_TO_BTF_ID and the mem_size is resolved to the size of the VAR's type. >From the VAR btf_id, the verifier can also read the address of the ksym's corresponding kernel var from kallsyms and use that to fill dst_reg. Therefore, the proper functionality of pseudo_btf_id depends on (1) kallsyms and (2) the encoding of kernel global VARs in pahole, which should be available since pahole v1.18. 
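On the BPF C side, such a load is typically produced by a typed __ksym reference (illustrative only; the variable name is hypothetical and must exist in vmlinux BTF, __ksym comes from bpf_helpers.h):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	extern const int example_kernel_var __ksym;	/* hypothetical kernel global */

	SEC("raw_tp/sys_enter")
	int read_ksym(const void *ctx)
	{
		/* the ld_imm64 for &example_kernel_var carries the BTF id; since
		 * the type is not a struct, the register is marked PTR_TO_MEM
		 * with mem_size = sizeof(int) and the load below is bounds-checked
		 */
		return example_kernel_var;
	}

	char LICENSE[] SEC("license") = "GPL";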
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-2-haoluo@google.com --- include/linux/bpf_verifier.h | 7 +++++++ include/linux/btf.h | 15 +++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 363b4f1c562a..e83ef6f6bf43 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -308,6 +308,13 @@ struct bpf_insn_aux_data { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ }; + struct { + enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ + union { + u32 btf_id; /* btf_id for struct typed var */ + u32 mem_size; /* mem_size for non-struct typed var */ + }; + } btf_var; }; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 024e16ff7dcc..af1244180588 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -145,6 +145,21 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; } +static inline bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +/* union is only a special case of struct: + * all its offsetof(member) == 0 + */ +static inline bool btf_type_is_struct(const struct btf_type *t) +{ + u8 kind = BTF_INFO_KIND(t->info); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); -- cgit v1.2.3 From eaa6bcb71ef6ed3dc18fc525ee7e293b06b4882b Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:47 -0700 Subject: bpf: Introduce bpf_per_cpu_ptr() Add bpf_per_cpu_ptr() to help bpf programs access percpu vars. bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the kernel except that it may return NULL. This happens when the cpu parameter is out of range. So the caller must check the returned value. 
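An illustrative BPF-side use, following the pattern of this series' selftests (same SEC/license skeleton as the sketch above; bpf_prog_active is a percpu int in the kernel):

	extern const int bpf_prog_active __ksym;

	SEC("raw_tp/sys_enter")
	int count_active(const void *ctx)
	{
		__u32 cpu = bpf_get_smp_processor_id();
		const int *active;

		active = bpf_per_cpu_ptr(&bpf_prog_active, cpu);
		if (!active)	/* NULL when cpu is out of range */
			return 0;

		return *active;
	}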
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-5-haoluo@google.com --- include/linux/bpf.h | 4 ++++ include/linux/btf.h | 11 +++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50e5c4b52bd1..9dde15b2479d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -293,6 +293,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ + ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, }; @@ -307,6 +308,7 @@ enum bpf_return_type { RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -405,6 +407,7 @@ enum bpf_reg_type { PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ + PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ }; /* The information passed from prog-specific *_is_valid_access @@ -1828,6 +1831,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/btf.h b/include/linux/btf.h index af1244180588..2bf641829664 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -110,6 +110,11 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, datasec_type, member) \ + for (i = 0, member = btf_type_var_secinfo(datasec_type); \ + i < btf_type_vlen(datasec_type); \ + i++, member++) + static inline bool btf_type_is_ptr(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; @@ -194,6 +199,12 @@ static inline const struct btf_member *btf_type_member(const struct btf_type *t) return (const struct btf_member *)(t + 1); } +static inline const struct btf_var_secinfo *btf_type_var_secinfo( + const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); -- cgit v1.2.3 From 63d9b80dcf2c67bc5ade61cbbaa09d7af21f43f1 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:48 -0700 Subject: bpf: Introducte bpf_this_cpu_ptr() Add bpf_this_cpu_ptr() to help access percpu var on this cpu. This helper always returns a valid pointer, therefore no need to check returned value for NULL. 
Also note that all programs run with preemption disabled, which means that the returned pointer is stable during all the execution of the program. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-6-haoluo@google.com --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9dde15b2479d..dc63eeed4fd9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -309,6 +309,7 @@ enum bpf_return_type { RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -1832,6 +1833,7 @@ extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); -- cgit v1.2.3 From 8e1b3884eed7d96d4f1210b668611ee6a1803ea5 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 1 Oct 2020 17:12:50 +0000 Subject: net: remove NETDEV_HW_ADDR_T_SLAVE NETDEV_HW_ADDR_T_SLAVE is not used anymore, remove it. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28cfa53daf72..0c79d9e56a5e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -212,9 +212,8 @@ struct netdev_hw_addr { unsigned char type; #define NETDEV_HW_ADDR_T_LAN 1 #define NETDEV_HW_ADDR_T_SAN 2 -#define NETDEV_HW_ADDR_T_SLAVE 3 -#define NETDEV_HW_ADDR_T_UNICAST 4 -#define NETDEV_HW_ADDR_T_MULTICAST 5 +#define NETDEV_HW_ADDR_T_UNICAST 3 +#define NETDEV_HW_ADDR_T_MULTICAST 4 bool global_use; int sync_cnt; int refcount; -- cgit v1.2.3 From 19fbcb36a39eefbe8912a13ccc02e937b1c418d6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 3 Oct 2020 00:44:28 +0200 Subject: net/sched: act_vlan: Add {POP,PUSH}_ETH actions Implement TCA_VLAN_ACT_POP_ETH and TCA_VLAN_ACT_PUSH_ETH, to respectively pop and push a base Ethernet header at the beginning of a frame. POP_ETH is just a matter of pulling ETH_HLEN bytes. VLAN tags, if any, must be stripped before calling POP_ETH. PUSH_ETH is restricted to skbs with no mac_header, and only the MAC addresses can be configured. The Ethertype is automatically set from skb->protocol. These restrictions ensure that all skb's fields remain consistent, so that this action can't confuse other part of the networking stack (like GSO). Since openvswitch already had these actions, consolidate the code in skbuff.c (like for vlan and mpls push/pop). Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3d0cf3722bb4..42131e325e27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3573,6 +3573,9 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); +int skb_eth_pop(struct sk_buff *skb); +int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, + const unsigned char *src); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet); int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, -- cgit v1.2.3 From 24ce66c04a06a678f156cf575128246f3d214b4a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Sep 2020 07:58:18 +0200 Subject: uaccess: provide a generic TASK_SIZE_MAX definition Define TASK_SIZE_MAX as TASK_SIZE if not otherwise defined. Signed-off-by: Christoph Hellwig Signed-off-by: Palmer Dabbelt --- include/linux/uaccess.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 70073c802b48..d0e43761c708 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -31,6 +31,10 @@ typedef struct { /* empty dummy */ } mm_segment_t; +#ifndef TASK_SIZE_MAX +#define TASK_SIZE_MAX TASK_SIZE +#endif + #define uaccess_kernel() (false) #define user_addr_max() (TASK_SIZE_MAX) -- cgit v1.2.3 From 261bfb3328b89c63ca410ae30a0c87cd3955344c Mon Sep 17 00:00:00 2001 From: Vincent Huang Date: Sun, 4 Oct 2020 19:42:47 -0700 Subject: Input: synaptics-rmi4 - rename f30_data to gpio_data f30_data in rmi_device_platform_data could be also referenced by RMI function 3A, so rename it and the structure name to avoid confusion. Signed-off-by: Vincent Huang Reviewed-by: Hans de Goede Tested-by: Hans de Goede Reviewed-by: Andrew Duggan Link: https://lore.kernel.org/r/20200930094147.635556-2-vincent.huang@tw.synaptics.com Signed-off-by: Dmitry Torokhov --- include/linux/rmi.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmi.h b/include/linux/rmi.h index 8ed37f93f3c8..ab7eea01ab42 100644 --- a/include/linux/rmi.h +++ b/include/linux/rmi.h @@ -102,15 +102,16 @@ struct rmi_2d_sensor_platform_data { }; /** - * struct rmi_f30_data - overrides defaults for a single F30 GPIOs/LED chip. + * struct rmi_gpio_data - overrides defaults for a single F30/F3A GPIOs/LED + * chip. * @buttonpad - the touchpad is a buttonpad, so enable only the first actual * button that is found. - * @trackstick_buttons - Set when the function 30 is handling the physical + * @trackstick_buttons - Set when the function 30 or 3a is handling the physical * buttons of the trackstick (as a PS/2 passthrough device). - * @disable - the touchpad incorrectly reports F30 and it should be ignored. + * @disable - the touchpad incorrectly reports F30/F3A and it should be ignored. * This is a special case which is due to misconfigured firmware. 
*/ -struct rmi_f30_data { +struct rmi_gpio_data { bool buttonpad; bool trackstick_buttons; bool disable; @@ -218,7 +219,7 @@ struct rmi_device_platform_data { /* function handler pdata */ struct rmi_2d_sensor_platform_data sensor_pdata; struct rmi_f01_power_management power_management; - struct rmi_f30_data f30_data; + struct rmi_gpio_data gpio_data; }; /** -- cgit v1.2.3 From 07da1223ec939982497db3caccd6215b55acc35c Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Sun, 4 Oct 2020 18:43:37 +0300 Subject: lib/scatterlist: Add support in dynamic allocation of SG table from pages Extend __sg_alloc_table_from_pages to support dynamic allocation of SG table from pages. It should be used by drivers that can't supply all the pages at one time. This function returns the last populated SGE in the table. Users should pass it as an argument to the function from the second call and forward. As before, nents will be equal to the number of populated SGEs (chunks). With this new extension, drivers can benefit the optimization of merging contiguous pages without a need to allocate all pages in advance and hold them in a large buffer. E.g. with the Infiniband driver that allocates a single page for hold the pages. For 1TB memory registration, the temporary buffer would consume only 4KB, instead of 2GB. Link: https://lore.kernel.org/r/20201004154340.1080481-2-leon@kernel.org Signed-off-by: Maor Gottlieb Reviewed-by: Christoph Hellwig Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/scatterlist.h | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 45cf7b69d852..36c47e7e66a2 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -165,6 +165,22 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, #define for_each_sgtable_dma_sg(sgt, sg, i) \ for_each_sg((sgt)->sgl, sg, (sgt)->nents, i) +static inline void __sg_chain(struct scatterlist *chain_sg, + struct scatterlist *sgl) +{ + /* + * offset and length are unused for chain entry. Clear them. + */ + chain_sg->offset = 0; + chain_sg->length = 0; + + /* + * Set lowest bit to indicate a link pointer, and make sure to clear + * the termination bit if it happens to be set. + */ + chain_sg->page_link = ((unsigned long) sgl | SG_CHAIN) & ~SG_END; +} + /** * sg_chain - Chain two sglists together * @prv: First scatterlist @@ -178,18 +194,7 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, struct scatterlist *sgl) { - /* - * offset and length are unused for chain entry. Clear them. - */ - prv[prv_nents - 1].offset = 0; - prv[prv_nents - 1].length = 0; - - /* - * Set lowest bit to indicate a link pointer, and make sure to clear - * the termination bit if it happens to be set. 
- */ - prv[prv_nents - 1].page_link = ((unsigned long) sgl | SG_CHAIN) - & ~SG_END; + __sg_chain(&prv[prv_nents - 1], sgl); } /** @@ -286,10 +291,11 @@ void sg_free_table(struct sg_table *); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); -int __sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, - unsigned int n_pages, unsigned int offset, - unsigned long size, unsigned int max_segment, - gfp_t gfp_mask); +struct scatterlist *__sg_alloc_table_from_pages(struct sg_table *sgt, + struct page **pages, unsigned int n_pages, unsigned int offset, + unsigned long size, unsigned int max_segment, + struct scatterlist *prv, unsigned int left_pages, + gfp_t gfp_mask); int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, gfp_t gfp_mask); -- cgit v1.2.3 From 5de15b610f785f0e188fefb707f0b19de156968a Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 3 Oct 2020 23:23:20 +0300 Subject: mtd: hyperbus: add Renesas RPC-IF driver Add the HyperFLash driver for the Renesas RPC-IF. It's the "front end" driver using the "back end" APIs in the main driver to talk to the real hardware. Signed-off-by: Sergei Shtylyov Signed-off-by: Vignesh Raghavendra Link: https://lore.kernel.org/r/78abb851-2beb-fe7d-87e5-ce58ee877d35@gmail.com --- include/linux/mtd/hyperbus.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/hyperbus.h b/include/linux/mtd/hyperbus.h index d8cb1aec826d..0ce612428aea 100644 --- a/include/linux/mtd/hyperbus.h +++ b/include/linux/mtd/hyperbus.h @@ -8,6 +8,17 @@ #include +/* HyperBus command bits */ +#define HYPERBUS_RW 0x80 /* R/W# */ +#define HYPERBUS_RW_WRITE 0 +#define HYPERBUS_RW_READ 0x80 +#define HYPERBUS_AS 0x40 /* Address Space */ +#define HYPERBUS_AS_MEM 0 +#define HYPERBUS_AS_REG 0x40 +#define HYPERBUS_BT 0x20 /* Burst Type */ +#define HYPERBUS_BT_WRAPPED 0 +#define HYPERBUS_BT_LINEAR 0x20 + enum hyperbus_memtype { HYPERFLASH, HYPERRAM, -- cgit v1.2.3 From c6db31ffe202c3120147e9f3455a4dbc90546d39 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Mon, 5 Oct 2020 18:39:37 +0300 Subject: ethtool: allow netdev driver to define phy tunables Define get/set phy tunable callbacks in ethtool ops. This will allow MAC drivers with integrated PHY still to implement these tunables. Reviewed-by: Andrew Lunn Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- include/linux/ethtool.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 060b20f0b20f..6408b446051f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -505,6 +505,10 @@ struct ethtool_ops { struct ethtool_fecparam *); void (*get_ethtool_phy_stats)(struct net_device *, struct ethtool_stats *, u64 *); + int (*get_phy_tunable)(struct net_device *, + const struct ethtool_tunable *, void *); + int (*set_phy_tunable)(struct net_device *, + const struct ethtool_tunable *, const void *); }; int ethtool_check_ops(const struct ethtool_ops *ops); -- cgit v1.2.3 From 451b05f413d3fd89ee84ae99a5029f619ed3cb78 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 5 Oct 2020 22:34:18 +0200 Subject: net: netdevice.h: sw_netstats_rx_add helper some drivers/network protocols update rx bytes/packets under u64_stats_update_begin/end sequence. 
Add a specific helper like dev_lstats_add() Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d126e36580c9..a0df43b13839 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2543,6 +2543,16 @@ struct pcpu_lstats { void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes); +static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len) +{ + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_bytes += len; + tstats->rx_packets++; + u64_stats_update_end(&tstats->syncp); +} + static inline void dev_lstats_add(struct net_device *dev, unsigned int len) { struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats); -- cgit v1.2.3 From f55a52bb2cdbc5a92ca209d0ada90a490a188f58 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 3 Oct 2020 00:41:46 +0900 Subject: can: dev: fix type of get_can_dlc() and get_canfd_dlc() macros The macros get_can_dlc() and get_canfd_dlc() are not visible in userland. As such, type u8 should be preferred over type __u8. Reference: https://lkml.org/lkml/2020/10/1/708 Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20201002154219.4887-3-mailhol.vincent@wanadoo.fr Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index ed0482b2f4b2..f8975d0b90bb 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -84,13 +84,13 @@ struct can_priv { /* * get_can_dlc(value) - helper macro to cast a given data length code (dlc) - * to __u8 and ensure the dlc value to be max. 8 bytes. + * to u8 and ensure the dlc value to be max. 8 bytes. * * To be used in the CAN netdriver receive path to ensure conformance with * ISO 11898-1 Chapter 8.4.2.3 (DLC field) */ -#define get_can_dlc(i) (min_t(__u8, (i), CAN_MAX_DLC)) -#define get_canfd_dlc(i) (min_t(__u8, (i), CANFD_MAX_DLC)) +#define get_can_dlc(i) (min_t(u8, (i), CAN_MAX_DLC)) +#define get_canfd_dlc(i) (min_t(u8, (i), CANFD_MAX_DLC)) /* Check for outgoing skbs that have not been created by the CAN subsystem */ static inline bool can_skb_headroom_valid(struct net_device *dev, -- cgit v1.2.3 From dcb5cdf60a1fbbdb3b4dd2abc562206481f09ef1 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Sat, 3 Oct 2020 13:19:42 +0530 Subject: powerpc/perf/hv-gpci: Add cpu hotplug support Patch here adds cpu hotplug functions to hv_gpci pmu. A new cpuhp_state "CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE" enum is added. The online callback function updates the cpumask only if its empty. As the primary intention of adding hotplug support is to designate a CPU to make HCALL to collect the counter data. The offline function test and clear corresponding cpu in a cpumask and update cpumask to any other active cpu. 
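A minimal sketch of wiring up the new state (callback bodies and the state name string are illustrative, not copied from the patch):

#include <linux/cpuhotplug.h>

static int ppc_hv_gpci_cpu_online(unsigned int cpu)
{
	/* claim the CPU for making HCALLs only if the cpumask is still empty */
	return 0;
}

static int ppc_hv_gpci_cpu_offline(unsigned int cpu)
{
	/* if this CPU owned the cpumask, migrate it to another online CPU */
	return 0;
}

static int hv_gpci_cpu_hotplug_init(void)
{
	return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE,
				 "perf/powerpc/hv_gpci:online",
				 ppc_hv_gpci_cpu_online,
				 ppc_hv_gpci_cpu_offline);
}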
Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201003074943.338618-4-kjain@linux.ibm.com --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 3215023d4852..5d08ed922510 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -183,6 +183,7 @@ enum cpuhp_state { CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE, + CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE, CPUHP_AP_WATCHDOG_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, -- cgit v1.2.3 From dd841a749d1ded8e2e5facc4242ee0b6779fc0cb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 14 Jun 2020 06:07:10 -0400 Subject: radix tree test suite: Fix compilation Introducing local_lock broke compilation; fix it all up. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/radix-tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index c2a9f7c90727..5c85059a92ba 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From cf1f08cac375630af6b6307907a3fc20fcf847c7 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 17 Apr 2020 11:00:24 -0400 Subject: SUNRPC: Implement a xdr_page_pos() function I'll need this for READ_PLUS to help figure out the offset where page data is stored at, but it might also be useful for other things. Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 5a6a81b7cd9f..25a68dd87ecf 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -240,6 +240,7 @@ extern int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen); extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, unsigned int len); extern unsigned int xdr_stream_pos(const struct xdr_stream *xdr); +extern unsigned int xdr_page_pos(const struct xdr_stream *xdr); extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, struct rpc_rqst *rqst); extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, -- cgit v1.2.3 From c567552612ece787b178e3b147b5854ad422a836 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 28 May 2014 13:41:22 -0400 Subject: NFS: Add READ_PLUS data segment support This patch adds client support for decoding a single NFS4_CONTENT_DATA segment returned by the server. This is the simplest implementation possible, since it does not account for any hole segments in the reply. 
Signed-off-by: Anna Schumaker --- include/linux/nfs4.h | 2 +- include/linux/nfs_fs_sb.h | 1 + include/linux/nfs_xdr.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index b8360be141da..9dc7eeac924f 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -551,13 +551,13 @@ enum { NFSPROC4_CLNT_LOOKUPP, NFSPROC4_CLNT_LAYOUTERROR, - NFSPROC4_CLNT_COPY_NOTIFY, NFSPROC4_CLNT_GETXATTR, NFSPROC4_CLNT_SETXATTR, NFSPROC4_CLNT_LISTXATTRS, NFSPROC4_CLNT_REMOVEXATTR, + NFSPROC4_CLNT_READ_PLUS, }; /* nfs41 types */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 7eae72a8762e..38e60ec742df 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -287,5 +287,6 @@ struct nfs_server { #define NFS_CAP_LAYOUTERROR (1U << 26) #define NFS_CAP_COPY_NOTIFY (1U << 27) #define NFS_CAP_XATTR (1U << 28) +#define NFS_CAP_READ_PLUS (1U << 29) #endif diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 0599efd57eb9..d63cb862d58e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -657,7 +657,7 @@ struct nfs_pgio_args { struct nfs_pgio_res { struct nfs4_sequence_res seq_res; struct nfs_fattr * fattr; - __u32 count; + __u64 count; __u32 op_status; union { struct { -- cgit v1.2.3 From 84ce182ab85b8ad5002fb1125ba572df99dd0d1c Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 28 May 2014 13:38:53 -0400 Subject: SUNRPC: Add the ability to expand holes in data pages This patch adds the ability to "read a hole" into a set of XDR data pages by taking the following steps: 1) Shift all data after the current xdr->p to the right, possibly into the tail, 2) Zero the specified range, and 3) Update xdr->p to point beyond the hole. Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 25a68dd87ecf..f9636d2a6d54 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -250,6 +250,7 @@ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); +extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); /** * xdr_stream_remaining - Return the number of bytes remaining in the stream -- cgit v1.2.3 From e6ac0accb27c6892b7ebc7799e7ce56b3390a678 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 21 Apr 2020 11:27:00 -0400 Subject: SUNRPC: Add an xdr_align_data() function For now, this function simply aligns the data at the beginning of the pages. This can eventually be expanded to shift data to the correct offsets when we're ready. 
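Together with xdr_expand_hole() above, a READ_PLUS segment decoder could drive the helpers roughly as follows; the wrapper names are illustrative and not the eventual fs/nfs implementation:

#include <linux/sunrpc/xdr.h>

/* Hedged sketch: place a data segment and a hole segment into the XDR pages. */
static uint64_t decode_read_plus_data(struct xdr_stream *xdr,
				      uint64_t offset, uint32_t length)
{
	/* shift the data segment to its offset within the page data */
	return xdr_align_data(xdr, offset, length);
}

static uint64_t decode_read_plus_hole(struct xdr_stream *xdr,
				      uint64_t offset, uint64_t length)
{
	/* materialize the hole as zeroes in the page data */
	return xdr_expand_hole(xdr, offset, length);
}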
Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index f9636d2a6d54..fe7ff7f5b584 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -250,6 +250,7 @@ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); +extern uint64_t xdr_align_data(struct xdr_stream *, uint64_t, uint32_t); extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); /** -- cgit v1.2.3 From 1c47fa6b31c2683f03bc2f9174902bb7dcd35d83 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 3 Oct 2020 00:41:49 +0900 Subject: can: dev: add a helper function to calculate the duration of one bit Rename macro CAN_CALC_SYNC_SEG to CAN_SYNC_SEG and make it available through include/linux/can/dev.h Add an helper function can_bit_time() which returns the duration (in time quanta) of one CAN bit. Rationale for this patch: the sync segment and the bit time are two concepts which are defined in the CAN ISO standard. Device drivers for CAN might need those. Please refer to ISO 11898-1:2015, section 11.3.1.1 "Bit time" for additional information. Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20201002154219.4887-6-mailhol.vincent@wanadoo.fr [mkl: Let can_bit_time() return an unsinged int, make argument const] Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index f8975d0b90bb..41ff31795320 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -82,6 +82,21 @@ struct can_priv { #endif }; +#define CAN_SYNC_SEG 1 + +/* + * can_bit_time() - Duration of one bit + * + * Please refer to ISO 11898-1:2015, section 11.3.1.1 "Bit time" for + * additional information. + * + * Return: the number of time quanta in one bit. + */ +static inline unsigned int can_bit_time(const struct can_bittiming *bt) +{ + return CAN_SYNC_SEG + bt->prop_seg + bt->phase_seg1 + bt->phase_seg2; +} + /* * get_can_dlc(value) - helper macro to cast a given data length code (dlc) * to u8 and ensure the dlc value to be max. 8 bytes. -- cgit v1.2.3 From 8446466c9dd645da4c1848f35ffd0fc1df3524ee Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 6 Aug 2020 10:07:24 -0400 Subject: XArray: Fix xas_for_each_conflict documentation At one point, xas_for_each_conflict() was going to work this way, and I forgot to update the documentation when I changed my mind. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/xarray.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index b4d70e7568b2..6b336098fca7 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1714,13 +1714,12 @@ enum { * @xas: XArray operation state. * @entry: Entry retrieved from the array. * - * The loop body will be executed for each entry in the XArray that lies - * within the range specified by @xas. If the loop completes successfully, - * any entries that lie in this range will be replaced by @entry. 
The caller - * may break out of the loop; if they do so, the contents of the XArray will - * be unchanged. The operation may fail due to an out of memory condition. - * The caller may also call xa_set_err() to exit the loop while setting an - * error to record the reason. + * The loop body will be executed for each entry in the XArray that + * lies within the range specified by @xas. If the loop terminates + * normally, @entry will be %NULL. The user may break out of the loop, + * which will leave @entry set to the conflicting entry. The caller + * may also call xa_set_err() to exit the loop while setting an error + * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) -- cgit v1.2.3 From 02dae28f0b542969e44cbc1e14ffc9944cd2975c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 31 Aug 2020 11:11:01 +0800 Subject: ftrace: Simplify the dyn_ftrace->flags macro All the three macro are defined to be used for ftrace_rec_count(). This can be achieved by (flags & FTRACE_REF_MAX) directly. Since no other places would use those macros, remove them for clarity. Also it fixes a typo in the comment. Link: https://lkml.kernel.org/r/20200831031104.23322-4-richard.weiyang@linux.alibaba.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e5c2d5cc6e6a..b1f56e3410dc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -432,7 +432,7 @@ bool is_ftrace_trampoline(unsigned long addr); * DIRECT - there is a direct function to call * * When a new ftrace_ops is registered and wants a function to save - * pt_regs, the rec->flag REGS is set. When the function has been + * pt_regs, the rec->flags REGS is set. When the function has been * set up to save regs, the REG_EN flag is set. Once a function * starts saving regs it will do so until all ftrace_ops are removed * from tracing that function. @@ -450,12 +450,9 @@ enum { }; #define FTRACE_REF_MAX_SHIFT 23 -#define FTRACE_FL_BITS 9 -#define FTRACE_FL_MASKED_BITS ((1UL << FTRACE_FL_BITS) - 1) -#define FTRACE_FL_MASK (FTRACE_FL_MASKED_BITS << FTRACE_REF_MAX_SHIFT) #define FTRACE_REF_MAX ((1UL << FTRACE_REF_MAX_SHIFT) - 1) -#define ftrace_rec_count(rec) ((rec)->flags & ~FTRACE_FL_MASK) +#define ftrace_rec_count(rec) ((rec)->flags & FTRACE_REF_MAX) struct dyn_ftrace { unsigned long ip; /* address of mcount call-site */ -- cgit v1.2.3 From 40dc4a42b97ef5a52ef34a73093a7992faaab15e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 31 Aug 2020 11:11:04 +0800 Subject: ftrace: ftrace_global_list is renamed to ftrace_ops_list Fix the comment to comply with the code. Link: https://lkml.kernel.org/r/20200831031104.23322-7-richard.weiyang@linux.alibaba.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b1f56e3410dc..1bd3a0356ae4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -217,11 +217,11 @@ extern struct ftrace_ops __rcu *ftrace_ops_list; extern struct ftrace_ops ftrace_list_end; /* - * Traverse the ftrace_global_list, invoking all entries. The reason that we + * Traverse the ftrace_ops_list, invoking all entries. 
The reason that we * can use rcu_dereference_raw_check() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period * mechanism. The rcu_dereference_raw_check() calls are needed to handle - * concurrent insertions into the ftrace_global_list. + * concurrent insertions into the ftrace_ops_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! */ -- cgit v1.2.3 From 38b9f903f22b9baa5c4b9bfb07c8bbc49f5efbba Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:49 +0300 Subject: net/mlx5: Handle sync reset request event Once the driver gets sync_reset_request from firmware it prepares for the coming reset and sends acknowledge. After getting this event the driver expects device reset, either it will trigger PCI reset on sync_reset_now event or such PCI reset will be triggered by another PF of the same device. So it moves to reset requested mode and if it gets PCI reset triggered by the other PF it detect the reset and reloads. Signed-off-by: Moshe Shemesh Reviewed-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 239e5348ee09..add85094f9a5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -501,6 +501,7 @@ struct mlx5_mpfs; struct mlx5_eswitch; struct mlx5_lag; struct mlx5_devcom; +struct mlx5_fw_reset; struct mlx5_eq_table; struct mlx5_irq_table; @@ -578,6 +579,7 @@ struct mlx5_priv { struct mlx5_core_sriov sriov; struct mlx5_lag *lag; struct mlx5_devcom *devcom; + struct mlx5_fw_reset *fw_reset; struct mlx5_core_roce roce; struct mlx5_fc_stats fc_stats; struct mlx5_rl_table rl_table; -- cgit v1.2.3 From 2d69356752ff862dbb0c7e6725874740799d7708 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:55 +0300 Subject: net/mlx5: Add support for fw live patch event Firmware live patch event notifies the driver that the firmware was just updated using live patch. In such case the driver should not reload or re-initiate entities, part to updating the firmware version and re-initiate the firmware tracer which can be updated by live patch with new strings database to help debugging an issue. Signed-off-by: Moshe Shemesh Reviewed-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 81ca5989009b..cf824366a7d1 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -366,6 +366,7 @@ enum { enum { MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT = 0x1, MLX5_GENERAL_SUBTYPE_PCI_POWER_CHANGE_EVENT = 0x5, + MLX5_GENERAL_SUBTYPE_FW_LIVE_PATCH_EVENT = 0x7, MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT = 0x8, }; -- cgit v1.2.3 From 44f3625bc61653ea3bde9960298faf2f5518fda5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 8 Oct 2020 12:45:17 +0200 Subject: netlink: export policy in extended ACK Add a new attribute NLMSGERR_ATTR_POLICY to the extended ACK to advertise the policy, e.g. if an attribute was out of range, you'll know the range that's permissible. Add new NL_SET_ERR_MSG_ATTR_POL() and NL_SET_ERR_MSG_ATTR_POL() macros to set this, since realistically it's only useful to do this when the bad attribute (offset) is also returned. Use it in lib/nlattr.c which practically does all the policy validation. 
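For illustration (the attribute, policy array, and limit below are made up), a validation path can now point userspace at both the bad attribute and the policy that constrains it:

#include <linux/netlink.h>
#include <net/netlink.h>

/* Hedged sketch: FOO_ATTR_TIMEOUT, foo_policy and FOO_TIMEOUT_MAX are
 * hypothetical names used only to show the new macro.
 */
static int foo_validate_timeout(const struct nlattr *attr,
				struct netlink_ext_ack *extack)
{
	if (nla_get_u32(attr) > FOO_TIMEOUT_MAX) {
		NL_SET_ERR_MSG_ATTR_POL(extack, attr,
					&foo_policy[FOO_ATTR_TIMEOUT],
					"timeout out of range");
		return -ERANGE;
	}
	return 0;
}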
v2: - add and use netlink_policy_dump_attr_size_estimate() v3: - remove redundant break v4: - really remove redundant break ... sorry Reviewed-by: Jakub Kicinski Signed-off-by: Johannes Berg Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index e3e49f0e5c13..666cd0390699 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -68,12 +68,14 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error + * @policy: policy for a bad attribute * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; + const struct nla_policy *policy; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; }; @@ -95,21 +97,29 @@ struct netlink_ext_ack { #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) -#define NL_SET_BAD_ATTR(extack, attr) do { \ - if ((extack)) \ +#define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ + if ((extack)) { \ (extack)->bad_attr = (attr); \ + (extack)->policy = (pol); \ + } \ } while (0) -#define NL_SET_ERR_MSG_ATTR(extack, attr, msg) do { \ - static const char __msg[] = msg; \ - struct netlink_ext_ack *__extack = (extack); \ - \ - if (__extack) { \ - __extack->_msg = __msg; \ - __extack->bad_attr = (attr); \ - } \ +#define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL) + +#define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \ + static const char __msg[] = msg; \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) { \ + __extack->_msg = __msg; \ + __extack->bad_attr = (attr); \ + __extack->policy = (pol); \ + } \ } while (0) +#define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ + NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { -- cgit v1.2.3 From 9aa1206e8f48222f35a0c809f33b2f4aaa1e2661 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:02 +0200 Subject: bpf: Add redirect_peer helper Add an efficient ingress to ingress netns switch that can be used out of tc BPF programs in order to redirect traffic from host ns ingress into a container veth device ingress without having to go via CPU backlog queue [0]. For local containers this can also be utilized and path via CPU backlog queue only needs to be taken once, not twice. On a high level this borrows from ipvlan which does similar switch in __netif_receive_skb_core() and then iterates via another_round. This helps to reduce latency for mentioned use cases. 
Pod to remote pod with redirect(), TCP_RR [1]: # percpu_netperf 10.217.1.33 RT_LATENCY: 122.450 (per CPU: 122.666 122.401 122.333 122.401 ) MEAN_LATENCY: 121.210 (per CPU: 121.100 121.260 121.320 121.160 ) STDDEV_LATENCY: 120.040 (per CPU: 119.420 119.910 125.460 115.370 ) MIN_LATENCY: 46.500 (per CPU: 47.000 47.000 47.000 45.000 ) P50_LATENCY: 118.500 (per CPU: 118.000 119.000 118.000 119.000 ) P90_LATENCY: 127.500 (per CPU: 127.000 128.000 127.000 128.000 ) P99_LATENCY: 130.750 (per CPU: 131.000 131.000 129.000 132.000 ) TRANSACTION_RATE: 32666.400 (per CPU: 8152.200 8169.842 8174.439 8169.897 ) Pod to remote pod with redirect_peer(), TCP_RR: # percpu_netperf 10.217.1.33 RT_LATENCY: 44.449 (per CPU: 43.767 43.127 45.279 45.622 ) MEAN_LATENCY: 45.065 (per CPU: 44.030 45.530 45.190 45.510 ) STDDEV_LATENCY: 84.823 (per CPU: 66.770 97.290 84.380 90.850 ) MIN_LATENCY: 33.500 (per CPU: 33.000 33.000 34.000 34.000 ) P50_LATENCY: 43.250 (per CPU: 43.000 43.000 43.000 44.000 ) P90_LATENCY: 46.750 (per CPU: 46.000 47.000 47.000 47.000 ) P99_LATENCY: 52.750 (per CPU: 51.000 54.000 53.000 53.000 ) TRANSACTION_RATE: 90039.500 (per CPU: 22848.186 23187.089 22085.077 21919.130 ) [0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf [1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28cfa53daf72..0533f86018dd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1277,6 +1277,9 @@ struct netdev_net_notifier { * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. + * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); + * If a device is paired with a peer device, return the peer instance. + * The caller must be under RCU read context. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1484,6 +1487,7 @@ struct net_device_ops { struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); + struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); }; /** -- cgit v1.2.3 From 4a8f87e60f6db40e640f1db555d063b2c4dea5f1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:03 +0200 Subject: bpf: Allow for map-in-map with dynamic inner array map entries Recent work in f4d05259213f ("bpf: Add map_meta_equal map ops") and 134fede4eecf ("bpf: Relax max_entries check for most of the inner map types") added support for dynamic inner max elements for most map-in-map types. Exceptions were maps like array or prog array where the map_gen_lookup() callback uses the maps' max_entries field as a constant when emitting instructions. We recently implemented Maglev consistent hashing into Cilium's load balancer which uses map-in-map with an outer map being hash and inner being array holding the Maglev backend table for each service. This has been designed this way in order to reduce overall memory consumption given the outer hash map allows to avoid preallocating a large, flat memory area for all services. 
Also, the number of service mappings is not always known a-priori. The use case for dynamic inner array map entries is to further reduce memory overhead, for example, some services might just have a small number of back ends while others could have a large number. Right now the Maglev backend table for small and large number of backends would need to have the same inner array map entries which adds a lot of unneeded overhead. Dynamic inner array map entries can be realized by avoiding the inlined code generation for their lookup. The lookup will still be efficient since it will be calling into array_map_lookup_elem() directly and thus avoiding retpoline. The patch adds a BPF_F_INNER_MAP flag to map creation which therefore skips inline code generation and relaxes array_map_meta_equal() check to ignore both maps' max_entries. This also still allows to have faster lookups for map-in-map when BPF_F_INNER_MAP is not specified and hence dynamic max_entries not needed. Example code generation where inner map is dynamic sized array: # bpftool p d x i 125 int handle__sys_enter(void * ctx): ; int handle__sys_enter(void *ctx) 0: (b4) w1 = 0 ; int key = 0; 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 ; 3: (07) r2 += -4 ; inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); 4: (18) r1 = map[id:468] 6: (07) r1 += 272 7: (61) r0 = *(u32 *)(r2 +0) 8: (35) if r0 >= 0x3 goto pc+5 9: (67) r0 <<= 3 10: (0f) r0 += r1 11: (79) r0 = *(u64 *)(r0 +0) 12: (15) if r0 == 0x0 goto pc+1 13: (05) goto pc+1 14: (b7) r0 = 0 15: (b4) w6 = -1 ; if (!inner_map) 16: (15) if r0 == 0x0 goto pc+6 17: (bf) r2 = r10 ; 18: (07) r2 += -4 ; val = bpf_map_lookup_elem(inner_map, &key); 19: (bf) r1 = r0 | No inlining but instead 20: (85) call array_map_lookup_elem#149280 | call to array_map_lookup_elem() ; return val ? *val : -1; | for inner array lookup. 21: (15) if r0 == 0x0 goto pc+1 ; return val ? *val : -1; 22: (61) r6 = *(u32 *)(r0 +0) ; } 23: (bc) w0 = w6 24: (95) exit Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010234006.7075-4-daniel@iogearbox.net --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc63eeed4fd9..2b16bf48aab6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -82,7 +82,7 @@ struct bpf_map_ops { void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); void (*map_fd_put_ptr)(void *ptr); - u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); + int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); -- cgit v1.2.3 From ef5659280eb13e8ac31c296f58cfdfa1684ac06b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:09:38 -0700 Subject: bpf, sockmap: Allow skipping sk_skb parser program Currently, we often run with a nop parser namely one that just does this, 'return skb->len'. This happens when either our verdict program can handle streaming data or it is only looking at socket data such as IP addresses and other metadata associated with the flow. The second case is common for a L3/L4 proxy for instance. So lets allow loading programs without the parser then we can skip the stream parser logic and avoid having to add a BPF program that is effectively a nop. 
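From userspace this reduces setup to a single attachment; a hedged libbpf sketch (the two descriptors are assumed to come from earlier bpf_object loading):

#include <bpf/bpf.h>

/* Hedged sketch: attach only the stream verdict program to the sockmap and
 * skip the formerly mandatory stream parser attachment.
 */
static int attach_verdict_only(int sock_map_fd, int verdict_prog_fd)
{
	/* no BPF_SK_SKB_STREAM_PARSER program is attached at all */
	return bpf_prog_attach(verdict_prog_fd, sock_map_fd,
			       BPF_SK_SKB_STREAM_VERDICT, 0);
}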
Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239297866.8495.13345662302749219672.stgit@john-Precision-5820-Tower --- include/linux/skmsg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 3119928fc103..fec0c5ac1c4f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -308,6 +308,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node); int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); -- cgit v1.2.3 From f726f3d37163f714034aa5fd1f92a1a73df4297f Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Oct 2020 09:43:54 +0200 Subject: can: remove obsolete version strings As pointed out by Jakub Kicinski here: http://lore.kernel.org/r/20201009175751.5c54097f@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com this patch removes the obsolete version information of the different CAN protocols and the AF_CAN core module. Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20201012074354.25839-2-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/core.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index 7da9f1f82e8e..5fb8d0e3f9c1 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -18,13 +18,6 @@ #include #include -#define CAN_VERSION "20170425" - -/* increment this number each time you change some user-space interface */ -#define CAN_ABI_VERSION "9" - -#define CAN_VERSION_STRING "rev " CAN_VERSION " abi " CAN_ABI_VERSION - #define DNAME(dev) ((dev) ? (dev)->name : "any") /** -- cgit v1.2.3 From 585834a5eeb38a9b290786b6d3ee55f22098c602 Mon Sep 17 00:00:00 2001 From: zhuguangqing Date: Thu, 17 Sep 2020 15:35:53 +0800 Subject: thermal/idle_inject: Fix comment of idle_duration_us and name of latency_ns The comment of idle_duration_us and the name of latency_ns can be misleading, so fix them. Signed-off-by: zhuguangqing Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200917073553.898-1-zhuguangqing83@gmail.com --- include/linux/idle_inject.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/idle_inject.h b/include/linux/idle_inject.h index 91a8612b8bf9..fb88e23a99d3 100644 --- a/include/linux/idle_inject.h +++ b/include/linux/idle_inject.h @@ -28,6 +28,6 @@ void idle_inject_get_duration(struct idle_inject_device *ii_dev, unsigned int *idle_duration_us); void idle_inject_set_latency(struct idle_inject_device *ii_dev, - unsigned int latency_ns); + unsigned int latency_us); #endif /* __IDLE_INJECT_H__ */ -- cgit v1.2.3 From 88052319620a35f827509d645e4c8063ded751c8 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 15 Sep 2020 15:36:49 -0700 Subject: thermal: core: Add new event for sending keep alive notifications This event is sent by the platform firmware to confirm that user space thermal solution is alive. The response to this event from the user space thermal solution is platform specific. 
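How the event is consumed is platform specific; purely as a hedged sketch, a platform driver receiving the firmware notification might forward it to the thermal core like this (the zone handle and the routing through thermal_zone_device_update() are assumptions for illustration):

#include <linux/thermal.h>

/* Hedged sketch only: pass the new event on so a user space handler can
 * respond; "tzd" is an assumed thermal zone handle owned by the driver.
 */
static void foo_fw_keep_alive_notify(struct thermal_zone_device *tzd)
{
	thermal_zone_device_update(tzd, THERMAL_EVENT_KEEP_ALIVE);
}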
Signed-off-by: Srinivas Pandruvada Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200915223650.406046-3-srinivas.pandruvada@linux.intel.com --- include/linux/thermal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 42ef807e5d84..42b69d4072a4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -55,6 +55,7 @@ enum thermal_notify_event { THERMAL_DEVICE_UP, /* Thermal device is up after a down event */ THERMAL_DEVICE_POWER_CAPABILITY_CHANGED, /* power capability changed */ THERMAL_TABLE_CHANGED, /* Thermal table(s) changed */ + THERMAL_EVENT_KEEP_ALIVE, /* Request for user space handler to respond */ }; struct thermal_zone_device_ops { -- cgit v1.2.3 From ecd1d2a3e4f893584578a304d7bd1591e895a8e4 Mon Sep 17 00:00:00 2001 From: zhuguangqing Date: Mon, 14 Sep 2020 15:11:01 +0800 Subject: thermal: cooling: Remove unused variable *tz 1. devfreq_cooling.c: The variable *tz is not used in devfreq_cooling_get_requested_power(), devfreq_cooling_state2power() and devfreq_cooling_power2state(). 2. cpufreq_cooling.c: After 84fe2cab48590, the variable *tz is not used anymore in cpufreq_get_requested_power(), cpufreq_state2power() and cpufreq_power2state(). Remove the variable *tz. Signed-off-by: zhuguangqing Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200914071101.13575-1-zhuguangqing83@gmail.com --- include/linux/thermal.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 42b69d4072a4..d07ea27e72a9 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -85,12 +85,9 @@ struct thermal_cooling_device_ops { int (*get_max_state) (struct thermal_cooling_device *, unsigned long *); int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *); int (*set_cur_state) (struct thermal_cooling_device *, unsigned long); - int (*get_requested_power)(struct thermal_cooling_device *, - struct thermal_zone_device *, u32 *); - int (*state2power)(struct thermal_cooling_device *, - struct thermal_zone_device *, unsigned long, u32 *); - int (*power2state)(struct thermal_cooling_device *, - struct thermal_zone_device *, u32, unsigned long *); + int (*get_requested_power)(struct thermal_cooling_device *, u32 *); + int (*state2power)(struct thermal_cooling_device *, unsigned long, u32 *); + int (*power2state)(struct thermal_cooling_device *, u32, unsigned long *); }; struct thermal_cooling_device { -- cgit v1.2.3 From 3986f9a42e993075af01c17dc8968cfb96a4fe53 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 17 Aug 2020 13:45:04 +0200 Subject: libceph: multiple workspaces for CRUSH computations Replace a global map->crush_workspace (protected by a global mutex) with a list of workspaces, up to the number of CPUs + 1. This is based on a patch from Robin Geuze . Robin and his team have observed a 10-20% increase in IOPS on all queue depths and lower CPU usage as well on a high-end all-NVMe 100GbE cluster. 
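A hedged sketch of how such a bounded pool is typically consumed; the helper names are illustrative, and the real net/ceph code additionally grows the pool up to the number of CPUs + 1 before it resorts to waiting:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

/* Hedged sketch: borrow and return a crush_work from the new manager. */
static struct crush_work *get_workspace(struct workspace_manager *wsm)
{
	struct crush_work *work;

	spin_lock(&wsm->ws_lock);
	while (list_empty(&wsm->idle_ws)) {
		spin_unlock(&wsm->ws_lock);
		/* every workspace is in use: wait for one to be returned */
		wait_event(wsm->ws_wait, wsm->free_ws > 0);
		spin_lock(&wsm->ws_lock);
	}
	work = list_first_entry(&wsm->idle_ws, struct crush_work, item);
	list_del(&work->item);
	wsm->free_ws--;
	spin_unlock(&wsm->ws_lock);

	return work;
}

static void put_workspace(struct workspace_manager *wsm,
			  struct crush_work *work)
{
	spin_lock(&wsm->ws_lock);
	list_add(&work->item, &wsm->idle_ws);
	wsm->free_ws++;
	spin_unlock(&wsm->ws_lock);

	wake_up(&wsm->ws_wait);
}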
Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 14 ++++++++++++-- include/linux/crush/crush.h | 3 +++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 3f4498fef6ad..cad9acfbc320 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -137,6 +137,17 @@ int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, const char *fmt, ...); void ceph_oid_destroy(struct ceph_object_id *oid); +struct workspace_manager { + struct list_head idle_ws; + spinlock_t ws_lock; + /* Number of free workspaces */ + int free_ws; + /* Total number of allocated workspaces */ + atomic_t total_ws; + /* Waiters for a free workspace */ + wait_queue_head_t ws_wait; +}; + struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; @@ -184,8 +195,7 @@ struct ceph_osdmap { * the list of osds that store+replicate them. */ struct crush_map *crush; - struct mutex crush_workspace_mutex; - void *crush_workspace; + struct workspace_manager crush_wsm; }; static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 2f811baf78d2..30dba392b730 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -346,6 +346,9 @@ struct crush_work_bucket { struct crush_work { struct crush_work_bucket **work; /* Per-bucket working store */ +#ifdef __KERNEL__ + struct list_head item; +#endif }; #ifdef __KERNEL__ -- cgit v1.2.3 From 0b98acd6188309333c3a8a6e16feadadd31e4523 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 14 Sep 2020 13:39:19 +0200 Subject: libceph, rbd, ceph: "blacklist" -> "blocklist" Signed-off-by: Ilya Dryomov --- include/linux/ceph/mon_client.h | 2 +- include/linux/ceph/rados.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index ce4ffeb384d7..b658961156a0 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -142,7 +142,7 @@ int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, ceph_monc_callback_t cb, u64 private_data); -int ceph_monc_blacklist_add(struct ceph_mon_client *monc, +int ceph_monc_blocklist_add(struct ceph_mon_client *monc, struct ceph_entity_addr *client_addr); extern int ceph_monc_open_session(struct ceph_mon_client *monc); diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 3a518fd0eaad..43a7a1573b51 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -424,7 +424,7 @@ enum { }; #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ -#define EBLACKLISTED ESHUTDOWN /* blacklisted */ +#define EBLOCKLISTED ESHUTDOWN /* blocklisted */ /* xattr comparison */ enum { -- cgit v1.2.3 From b07720d0bd1e7c2251642010efb6075dbee23bb8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 2 Oct 2020 14:38:08 +0200 Subject: libceph: fix ENTITY_NAME format suggestion Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 76371aaae2d1..60b324efd1c4 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -54,7 +54,7 @@ struct ceph_connection_operations { int 
(*check_message_signature) (struct ceph_msg *msg); }; -/* use format string %s%d */ +/* use format string %s%lld */ #define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) struct ceph_messenger { -- cgit v1.2.3 From ee92e4f1f95eb7b8820299f10fc5fba16d85cece Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 8 Apr 2020 14:47:39 -0500 Subject: net/mlx5: Add NIC TX domain namespace Add new namespace that represents the NIC TX domain. Signed-off-by: Huy Nguyen Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 92d991d93757..846d94ad04bc 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -76,6 +76,7 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_SNIFFER_RX, MLX5_FLOW_NAMESPACE_SNIFFER_TX, MLX5_FLOW_NAMESPACE_EGRESS, + MLX5_FLOW_NAMESPACE_EGRESS_KERNEL, MLX5_FLOW_NAMESPACE_RDMA_RX, MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL, MLX5_FLOW_NAMESPACE_RDMA_TX, -- cgit v1.2.3 From 9b9d454ddbf0c41391ed68ea82bc3d8ff6a65074 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 5 Jun 2020 20:17:51 -0500 Subject: net/mlx5e: IPsec: Add TX steering rule per IPsec state Add new FTE in TX IPsec FT per IPsec state. It has the same matching criteria as the RX steering rule. The IPsec FT is created/destroyed when the first/last rule is added/deleted respectively. Signed-off-by: Huy Nguyen Reviewed-by: Boris Pismenny Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/qp.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 36492a1342cf..d75ef8aa8fac 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -245,6 +245,10 @@ enum { MLX5_ETH_WQE_SWP_OUTER_L4_UDP = 1 << 5, }; +enum { + MLX5_ETH_WQE_FT_META_IPSEC = BIT(0), +}; + struct mlx5_wqe_eth_seg { u8 swp_outer_l4_offset; u8 swp_outer_l3_offset; @@ -253,7 +257,7 @@ struct mlx5_wqe_eth_seg { u8 cs_flags; u8 swp_flags; __be16 mss; - __be32 rsvd2; + __be32 flow_table_metadata; union { struct { __be16 sz; -- cgit v1.2.3 From 3528f8ec95a5b1ee1b98d3e85371843c6428e4be Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 18:28:58 -0700 Subject: bcm963xx_tag.h: fix duplicated word Change doubled word "is" to "it is". 
Signed-off-by: Randy Dunlap Cc: Florian Fainelli Cc: bcm-kernel-feedback-list@broadcom.com Cc: linux-mips@vger.kernel.org Acked-by: Florian Fainelli Signed-off-by: Thomas Bogendoerfer --- include/linux/bcm963xx_tag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bcm963xx_tag.h b/include/linux/bcm963xx_tag.h index b87945cb6946..7edb809a2586 100644 --- a/include/linux/bcm963xx_tag.h +++ b/include/linux/bcm963xx_tag.h @@ -84,7 +84,7 @@ struct bcm_tag { char flash_layout_ver[FLASHLAYOUTVER_LEN]; /* 196-199: kernel+rootfs CRC32 */ __u32 fskernel_crc; - /* 200-215: Unused except on Alice Gate where is is information */ + /* 200-215: Unused except on Alice Gate where it is information */ char information2[TAGINFO2_LEN]; /* 216-219: CRC32 of image less imagetag (kernel for Alice Gate) */ __u32 image_crc; -- cgit v1.2.3 From f82cd2f0b5eb715b1a296e20b34da7d296b6e9a4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 18 Aug 2020 09:05:56 -0400 Subject: XArray: Add private interface for workingset node deletion Move the tricky bits of dealing with the XArray from the workingset code to the XArray. Make it clear in the documentation that this is a private interface, and only export it for the benefit of the test suite. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/xarray.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 6b336098fca7..29db4e16eb89 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1286,6 +1286,8 @@ static inline bool xa_is_advanced(const void *entry) */ typedef void (*xa_update_node_t)(struct xa_node *node); +void xa_delete_node(struct xa_node *, xa_update_node_t); + /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be -- cgit v1.2.3 From ca7b639e8611b3260a30b18aaa0d6db9c80a75ef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 2 Aug 2020 14:17:21 -0400 Subject: XArray: Fix xas_reload for multi-index entries xas_reload() was only checking that the head entry was still at the head index. If the entry has been split, that's not enough as there may be a different entry at the specified index now. Solve this by checking the slot for the requested index instead of the head index. 
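For context, a hedged sketch of the lock-free lookup pattern that depends on xas_reload(); get_ref()/put_ref() stand in for whatever reference counting the caller uses and are not XArray API:

#include <linux/xarray.h>

static void *lookup_stable(struct xarray *xa, unsigned long index)
{
	XA_STATE(xas, xa, index);
	void *entry;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	entry = xas_load(&xas);
	if (entry && !get_ref(entry))			/* assumed helper */
		goto repeat;
	/* the slot may have been replaced or split while the ref was taken */
	if (entry && unlikely(entry != xas_reload(&xas))) {
		put_ref(entry);				/* assumed helper */
		goto repeat;
	}
	rcu_read_unlock();
	return entry;
}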
Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/xarray.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 29db4e16eb89..4be9c57132fe 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1524,10 +1524,21 @@ void xas_create_range(struct xa_state *); static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; - - if (node) - return xa_entry(xas->xa, node, xas->xa_offset); - return xa_head(xas->xa); + void *entry; + char offset; + + if (!node) + return xa_head(xas->xa); + if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { + offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; + entry = xa_entry(xas->xa, node, offset); + if (!xa_is_sibling(entry)) + return entry; + offset = xa_to_sibling(entry); + } else { + offset = xas->xa_offset; + } + return xa_entry(xas->xa, node, offset); } /** -- cgit v1.2.3 From f78b8250a076ac63ddd021c7ea9739bcc2f6f737 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Mon, 28 Sep 2020 01:15:53 +0800 Subject: radix-tree: fix the comment of radix_tree_next_slot() fix the comment of radix_tree_next_slot(): interator --> iterator. Signed-off-by: Hui Su Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/radix-tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 5c85059a92ba..64ad900ac742 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -377,7 +377,7 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) * radix_tree_next_slot - find next slot in chunk * * @slot: pointer to current slot - * @iter: pointer to interator state + * @iter: pointer to iterator state * @flags: RADIX_TREE_ITER_*, should be constant * Returns: pointer to next slot, or NULL if there no more left * -- cgit v1.2.3 From 3b481d91356e5693d8358d4ef9c383bdb92c8da0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 24 Sep 2020 13:53:28 -0700 Subject: block: add zone specific block statuses A zoned device with limited resources to open or activate zones may return an error when the host exceeds those limits. The same command may be successful if retried later, but the host needs to wait for specific zone states before it should expect a retry to succeed. Have the block layer provide an appropriate status for these conditions so applications can distinuguish this error for special handling. Cc: linux-api@vger.kernel.org Cc: Niklas Cassel Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 7d7c13238fdb..d9b69bbde5cc 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -104,6 +104,24 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14) +/* + * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion + * path if the device returns a status indicating that too many zone resources + * are currently open. The same command should be successful if resubmitted + * after the number of open zones decreases below the device's limits, which is + * reported in the request_queue's max_open_zones. 
+ */ +#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)15) + +/* + * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion + * path if the device returns a status indicating that too many zone resources + * are currently active. The same command should be successful if resubmitted + * after the number of active zones decreases below the device's limits, which + * is reported in the request_queue's max_active_zones. + */ +#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with -- cgit v1.2.3 From bf41a0910cb2dd06abd4d6a920edcee798460ad7 Mon Sep 17 00:00:00 2001 From: Rishabh Bhatnagar Date: Fri, 2 Oct 2020 11:09:02 -0700 Subject: remoteproc: Change default dump configuration to "disabled" Currently "default" configuration option means coredumps are enabled. To avoid confusion rename the "default" configuration option to "enabled" and disable collection of dumps by default as doing so makes sense for production devices. Signed-off-by: Rishabh Bhatnagar Link: https://lore.kernel.org/r/1601662144-5964-2-git-send-email-rishabhb@codeaurora.org Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 2fa68bf5aa4f..3fa3ba6498e8 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -442,16 +442,16 @@ enum rproc_crash_type { /** * enum rproc_dump_mechanism - Coredump options for core - * @RPROC_COREDUMP_DEFAULT: Copy dump to separate buffer and carry on with + * @RPROC_COREDUMP_DISABLED: Don't perform any dump + * @RPROC_COREDUMP_ENABLED: Copy dump to separate buffer and carry on with recovery * @RPROC_COREDUMP_INLINE: Read segments directly from device memory. Stall recovery until all segments are read - * @RPROC_COREDUMP_DISABLED: Don't perform any dump */ enum rproc_dump_mechanism { - RPROC_COREDUMP_DEFAULT, - RPROC_COREDUMP_INLINE, RPROC_COREDUMP_DISABLED, + RPROC_COREDUMP_ENABLED, + RPROC_COREDUMP_INLINE, }; /** -- cgit v1.2.3 From 44fa32f008ab7092842095a7a474c48303a2f186 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 12 Oct 2020 10:01:27 +0200 Subject: net: add function dev_fetch_sw_netstats for fetching pcpu_sw_netstats In several places the same code is used to populate rtnl_link_stats64 fields with data from pcpu_sw_netstats. Therefore factor out this code to a new function dev_fetch_sw_netstats(). 
v2: - constify argument netstats - don't ignore netstats being NULL or an ERRPTR - switch to EXPORT_SYMBOL_GPL Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/6d16a338-52f5-df69-0020-6bc771a7d498@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 948d7105e91a..964b494b0e8d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4499,6 +4499,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage); void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats); +void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, + const struct pcpu_sw_netstats __percpu *netstats); extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; -- cgit v1.2.3 From ed3e453798d4f81c99056aa09fcd79d0874a60fd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 13 Oct 2020 14:14:43 +0200 Subject: locking/seqlocks: Fix kernel-doc warnings Right now, seqlock.h produces kernel-doc warnings: ./include/linux/seqlock.h:181: error: Cannot parse typedef! Convert it to a plain comment to avoid confusing kernel-doc. Fixes: a8772dccb2ec ("seqlock: Fold seqcount_LOCKNAME_t definition") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/a59144cdaadf7fdf1fe5d55d0e1575abbf1c0cb3.1602590106.git.mchehab+huawei@kernel.org --- include/linux/seqlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index ac5b07f558b0..cbfc78b92b65 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -154,7 +154,7 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) #define __SEQ_LOCK(expr) #endif -/** +/* * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated * @seqcount: The real sequence counter * @lock: Pointer to the associated lock -- cgit v1.2.3 From 02e83f46ebfaf9405881e290794c913d457541f0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 14 Oct 2020 16:47:08 -0700 Subject: vfs: move generic_remap_checks out of mm I would like to move all the generic helpers for the vfs remap range functionality (aka clonerange and dedupe) into a separate file so that they won't be scattered across the vfs and the mm subsystems. The eventual goal is to be able to deselect remap_range.c if none of the filesystems need that code, but the tricky part here is picking a stable(ish) part of the merge window to rearrange code. Signed-off-by: Darrick J. 
Wong --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7519ae003a08..eea754a8dd67 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3012,6 +3012,8 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); +extern int generic_write_check_limits(struct file *file, loff_t pos, + loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, -- cgit v1.2.3 From c2a9a645591f11761fdbe3c7f45c0ad32af97eb4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 28 Sep 2020 15:25:17 +0200 Subject: math64.h: kernel-docs: Convert some markups into normal comments There are several functions at math64.h that are also defined at div64.c. As both are included at kernel-api.rst, Sphinx 3.x complains about symbol duplication: ./lib/math/div64.c:73: WARNING: Duplicate C declaration, also defined in 'core-api/kernel-api'. Declaration is 'div_s64_rem'. ./lib/math/div64.c:104: WARNING: Duplicate C declaration, also defined in 'core-api/kernel-api'. Declaration is 'div64_u64_rem'. ./lib/math/div64.c:144: WARNING: Duplicate C declaration, also defined in 'core-api/kernel-api'. Declaration is 'div64_u64'. ./lib/math/div64.c:172: WARNING: Duplicate C declaration, also defined in 'core-api/kernel-api'. Declaration is 'div64_s64'. In order to avoid Sphinx warnings about duplication, change the kernel-doc markups to just comments at math64.h. Reviewed-by: Vincenzo Frascino Signed-off-by: Mauro Carvalho Chehab --- include/linux/math64.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/math64.h b/include/linux/math64.h index 3381d9e33c4e..66deb1fdc2ef 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -28,7 +28,7 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) return dividend / divisor; } -/** +/* * div_s64_rem - signed 64bit divide with 32bit divisor with remainder * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor @@ -42,7 +42,7 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) return dividend / divisor; } -/** +/* * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor @@ -56,7 +56,7 @@ static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) return dividend / divisor; } -/** +/* * div64_u64 - unsigned 64bit divide with 64bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor @@ -68,7 +68,7 @@ static inline u64 div64_u64(u64 dividend, u64 divisor) return dividend / divisor; } -/** +/* * div64_s64 - signed 64bit divide with 64bit divisor * @dividend: signed 64bit dividend * @divisor: signed 64bit divisor -- cgit v1.2.3 From 1b7743912bcf2b4a9d9c96de668542156450554f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 10 Sep 2020 09:25:14 +0200 Subject: usb: docs: document altmode register/unregister functions The typec_bus.rst asks for documentation of those two functions, but they don't exist: ./drivers/usb/typec/bus.c:1: warning: 'typec_altmode_unregister_driver' not found 
./drivers/usb/typec/bus.c:1: warning: 'typec_altmode_register_driver' not found Also, they're not declared on bus.c but, instead, at a header file (typec_altmode.h). So, add documentation for both functions at the header and change the kernel-doc markup under typec_bus.rst to point to the right place. While here, also place the documentation for both structs declared on typec_altmode.h at the rst file. Signed-off-by: Mauro Carvalho Chehab --- include/linux/usb/typec_altmode.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h index a4b65eaa0f62..5e0a7b7647c3 100644 --- a/include/linux/usb/typec_altmode.h +++ b/include/linux/usb/typec_altmode.h @@ -152,10 +152,26 @@ struct typec_altmode_driver { #define to_altmode_driver(d) container_of(d, struct typec_altmode_driver, \ driver) +/** + * typec_altmode_register_driver - registers a USB Type-C alternate mode + * device driver + * @drv: pointer to struct typec_altmode_driver + * + * These drivers will be bind to the partner alternate mode devices. They will + * handle all SVID specific communication. + */ #define typec_altmode_register_driver(drv) \ __typec_altmode_register_driver(drv, THIS_MODULE) int __typec_altmode_register_driver(struct typec_altmode_driver *drv, struct module *module); +/** + * typec_altmode_unregister_driver - unregisters a USB Type-C alternate mode + * device driver + * @drv: pointer to struct typec_altmode_driver + * + * These drivers will be bind to the partner alternate mode devices. They will + * handle all SVID specific communication. + */ void typec_altmode_unregister_driver(struct typec_altmode_driver *drv); #define module_typec_altmode_driver(__typec_altmode_driver) \ -- cgit v1.2.3 From f2c6855159228861f4552d563c13f5b194b9889f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 10 Sep 2020 10:27:17 +0200 Subject: Input: sparse-keymap: add a description for @sw Add a description for it, in order to avoids this warning: ./include/linux/input/sparse-keymap.h:43: warning: Function parameter or member 'sw' not described in 'key_entry' Signed-off-by: Mauro Carvalho Chehab --- include/linux/input/sparse-keymap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/input/sparse-keymap.h b/include/linux/input/sparse-keymap.h index d25d1452dc6e..d0dddc14ebc8 100644 --- a/include/linux/input/sparse-keymap.h +++ b/include/linux/input/sparse-keymap.h @@ -20,6 +20,7 @@ * private definitions. * @code: Device-specific data identifying the button/switch * @keycode: KEY_* code assigned to a key/button + * @sw: struct with code/value used by KE_SW and KE_VSW * @sw.code: SW_* code assigned to a switch * @sw.value: Value that should be sent in an input even when KE_SW * switch is toggled. KE_VSW switches ignore this field and -- cgit v1.2.3 From 770c03e6dabacd5b9f57bba93c4311d32b618640 Mon Sep 17 00:00:00 2001 From: Fei Shao Date: Thu, 8 Oct 2020 17:34:14 +0800 Subject: rtc: mt6397: Remove unused member dev Removing the struct member "dev" in mt6397 RTC driver because it's not initialized and the only usage is for one debugging message. Also fixed a typo in the error message. 
Signed-off-by: Fei Shao Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201008093414.1911699-1-fshao@chromium.org --- include/linux/mfd/mt6397/rtc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/mt6397/rtc.h b/include/linux/mfd/mt6397/rtc.h index 66989a16221a..c3748b53bf7d 100644 --- a/include/linux/mfd/mt6397/rtc.h +++ b/include/linux/mfd/mt6397/rtc.h @@ -72,7 +72,6 @@ struct mtk_rtc_data { }; struct mt6397_rtc { - struct device *dev; struct rtc_device *rtc_dev; /* Protect register access from multiple tasks */ -- cgit v1.2.3 From 1b2c54d63cde7e8cf15aa6319aba168d81c7e364 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 14 Oct 2020 16:38:47 -0700 Subject: vfs: move the remap range helpers to remap_range.c Complete the migration by moving the file remapping helper functions out of read_write.c and into remap_range.c. This reduces the clutter in the first file and (eventually) will make it so that we can compile out the second file if it isn't needed. Signed-off-by: Darrick J. Wong --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index eea754a8dd67..073da53b59b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3009,9 +3009,6 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); -extern int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *count, unsigned int remap_flags); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); -- cgit v1.2.3 From 407e9c63ee571f44a2dfb0828fc30daa02abb6dc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 15 Oct 2020 09:21:17 -0700 Subject: vfs: move the generic write and copy checks out of mm The generic write check helpers also don't have much to do with the page cache, so move them to the vfs. Signed-off-by: Darrick J. Wong --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 073da53b59b0..8fb063ab7d50 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3012,9 +3012,6 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); -extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t *count, unsigned int flags); extern ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); -- cgit v1.2.3 From 3e2ac9798e13ad1f52d735ea2ea1d252cb140ae5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 5 Oct 2020 10:58:18 +0200 Subject: PM / devfreq: remove a duplicated kernel-doc markup The update_devfreq() is also documented at devfreq.c, which has a more complete note. 
So, drop the duplicated markup, in order to avoid this warning: .../Documentation/driver-api/device_link.rst: WARNING: Duplicate C declaration, also defined in 'driver-api/infrastructure'. Declaration is 'device_link_state'. (and to cause a problem with cross-references to it) Signed-off-by: Mauro Carvalho Chehab --- include/linux/devfreq.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 2f4a74efa6be..121a2430d7f7 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -228,12 +228,7 @@ int devfreq_resume_device(struct devfreq *devfreq); void devfreq_suspend(void); void devfreq_resume(void); -/** - * update_devfreq() - Reevaluate the device and configure frequency - * @devfreq: the devfreq device - * - * Note: devfreq->lock must be held - */ +/* update_devfreq() - Reevaluate the device and configure frequency */ int update_devfreq(struct devfreq *devfreq); /* Helper functions for devfreq user device driver with OPP. */ -- cgit v1.2.3 From bae314dd5d8dfdd90ee584003a0f8c06e1bf3ea2 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 15 Oct 2020 16:44:27 +0200 Subject: cpuidle: Remove pointless stub The cpuidle.h header is declaring a function with an empty stub for the cpuidle disabled case, but that function is only called by cpuidle governors which depend on cpuidle anyway. In other words, the function is only called when cpuidle is enabled, so there is no need for the stub. Remove the pointless stub. Signed-off-by: Daniel Lezcano [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/cpuidle.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index ed0da0e58e8b..bd605b5585cf 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -271,13 +271,8 @@ struct cpuidle_governor { void (*reflect) (struct cpuidle_device *dev, int index); }; -#ifdef CONFIG_CPU_IDLE extern int cpuidle_register_governor(struct cpuidle_governor *gov); extern s64 cpuidle_governor_latency_req(unsigned int cpu); -#else -static inline int cpuidle_register_governor(struct cpuidle_governor *gov) -{return 0;} -#endif #define __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, \ idx, \ -- cgit v1.2.3 From f1e8d7560d3051b38f73a0cf6acc1b0bf5305ad9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 13 Oct 2020 15:42:41 +0800 Subject: powercap/intel_rapl: enumerate Psys RAPL domain together with package RAPL domain On multi-package systems, the Psys MSR is only valid for CPUs on specific package (master package). The current code makes the assumption that package 0 is the master package, but this is not true on new platforms like SPR. Fix the problem by emuerating the Psys RAPL domain for every package, so CPUs in slave packages will read 0 for the Psys energy counter and only CPUs in master packages can get a valid reading and register the Psys RAPL domain. The sysfs I/F for the Psys RAPL domain is not changed. Signed-off-by: Zhang Rui [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. 
Wysocki --- include/linux/intel_rapl.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index 3582176a1eca..50b8398ffd21 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -79,8 +79,10 @@ struct rapl_power_limit { struct rapl_package; +#define RAPL_DOMAIN_NAME_LENGTH 16 + struct rapl_domain { - const char *name; + char name[RAPL_DOMAIN_NAME_LENGTH]; enum rapl_domain_type id; u64 regs[RAPL_DOMAIN_REG_MAX]; struct powercap_zone power_zone; @@ -152,7 +154,4 @@ struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv); void rapl_remove_package(struct rapl_package *rp); -int rapl_add_platform_domain(struct rapl_if_priv *priv); -void rapl_remove_platform_domain(struct rapl_if_priv *priv); - #endif /* __INTEL_RAPL_H__ */ -- cgit v1.2.3 From d4f8138354b9ec290de0c7ba527a945c5549e32b Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 13 Oct 2020 14:23:39 +0200 Subject: PM: domains: Add support for PM domain on/off notifiers for genpd A device may have specific HW constraints that must be obeyed to, before its corresponding PM domain (genpd) can be powered off - and vice verse at power on. These constraints can't be managed through the regular runtime PM based deployment for a device, because the access pattern for it, isn't always request based. In other words, using the runtime PM callbacks to deal with the constraints doesn't work for these cases. For these reasons, let's instead add a PM domain power on/off notification mechanism to genpd. To add/remove a notifier for a device, the device must already have been attached to the genpd, which also means that it needs to be a part of the PM domain topology. To add/remove a notifier, let's introduce two genpd specific functions: - dev_pm_genpd_add|remove_notifier() Note that, to further clarify when genpd power on/off notifiers may be used, one can compare with the existing CPU_CLUSTER_PM_ENTER|EXIT notifiers. In the long run, the genpd power on/off notifiers should be able to replace them, but that requires additional genpd based platform support for the current users. Signed-off-by: Ulf Hansson Tested-by: Lina Iyer Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm_domain.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 66f3c5d64d81..db039da0aba2 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -68,6 +68,13 @@ enum gpd_status { GENPD_STATE_OFF, /* PM domain is off */ }; +enum genpd_notication { + GENPD_NOTIFY_PRE_OFF = 0, + GENPD_NOTIFY_OFF, + GENPD_NOTIFY_PRE_ON, + GENPD_NOTIFY_ON, +}; + struct dev_power_governor { bool (*power_down_ok)(struct dev_pm_domain *domain); bool (*suspend_ok)(struct device *dev); @@ -112,6 +119,7 @@ struct generic_pm_domain { cpumask_var_t cpus; /* A cpumask of the attached CPUs */ int (*power_off)(struct generic_pm_domain *domain); int (*power_on)(struct generic_pm_domain *domain); + struct raw_notifier_head power_notifiers; /* Power on/off notifiers */ struct opp_table *opp_table; /* OPP table of the genpd */ unsigned int (*opp_to_performance_state)(struct generic_pm_domain *genpd, struct dev_pm_opp *opp); @@ -178,6 +186,7 @@ struct generic_pm_domain_data { struct pm_domain_data base; struct gpd_timing_data td; struct notifier_block nb; + struct notifier_block *power_nb; int cpu; unsigned int performance_state; void *data; @@ -204,6 +213,8 @@ int pm_genpd_init(struct generic_pm_domain *genpd, struct dev_power_governor *gov, bool is_off); int pm_genpd_remove(struct generic_pm_domain *genpd); int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state); +int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb); +int dev_pm_genpd_remove_notifier(struct device *dev); extern struct dev_power_governor simple_qos_governor; extern struct dev_power_governor pm_domain_always_on_gov; @@ -251,6 +262,17 @@ static inline int dev_pm_genpd_set_performance_state(struct device *dev, return -ENOTSUPP; } +static inline int dev_pm_genpd_add_notifier(struct device *dev, + struct notifier_block *nb) +{ + return -ENOTSUPP; +} + +static inline int dev_pm_genpd_remove_notifier(struct device *dev) +{ + return -ENOTSUPP; +} + #define simple_qos_governor (*(struct dev_power_governor *)(NULL)) #define pm_domain_always_on_gov (*(struct dev_power_governor *)(NULL)) #endif -- cgit v1.2.3 From c6a113b52302adcfadda63af81dc05f7a669fbc8 Mon Sep 17 00:00:00 2001 From: Lina Iyer Date: Thu, 15 Oct 2020 14:47:22 -0600 Subject: PM: domains: enable domain idle state accounting To enable better debug of PM domains, keep a track of successful and failing attempts to enter each domain idle state. This statistics are exported in debugfs when reading the idle_states node associated with each PM domain. Signed-off-by: Lina Iyer [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/pm_domain.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index db039da0aba2..1ad0ec481416 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -89,6 +89,8 @@ struct genpd_power_state { s64 power_off_latency_ns; s64 power_on_latency_ns; s64 residency_ns; + u64 usage; + u64 rejected; struct fwnode_handle *fwnode; ktime_t idle_time; void *data; -- cgit v1.2.3 From ecdf57b4f6748f3cb89eaf2ffdc9cfae4829f493 Mon Sep 17 00:00:00 2001 From: "Saheed O. Bolarinwa" Date: Thu, 15 Oct 2020 14:30:34 -0500 Subject: PCI/ASPM: Remove struct aspm_register_info.l1ss_cap_ptr Save the L1 Substates Capability pointer in struct pci_dev. 
Then we don't have to keep track of it in the struct aspm_register_info and struct pcie_link_state, which makes the code easier to read. No functional change intended. [bhelgaas: split to a separate patch] Link: https://lore.kernel.org/r/20201015193039.12585-8-helgaas@kernel.org Signed-off-by: Saheed O. Bolarinwa Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 835530605c0d..c5288cd71a2e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -380,6 +380,7 @@ struct pci_dev { struct pcie_link_state *link_state; /* ASPM link state */ unsigned int ltr_path:1; /* Latency Tolerance Reporting supported from root to here */ + int l1ss; /* L1SS Capability pointer */ #endif unsigned int eetlp_prefix_path:1; /* End-to-End TLP Prefix */ -- cgit v1.2.3 From 57417cebc96b57122a2207fc84a6077d20c84b4b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:13 -0700 Subject: XArray: add xa_get_order Patch series "Fix read-only THP for non-tmpfs filesystems". As described more verbosely in the [3/3] changelog, we can inadvertently put an order-0 page in the page cache which occupies 512 consecutive entries. Users are running into this if they enable the READ_ONLY_THP_FOR_FS config option; see https://bugzilla.kernel.org/show_bug.cgi?id=206569 and Qian Cai has also reported it here: https://lore.kernel.org/lkml/20200616013309.GB815@lca.pw/ This is a rather intrusive way of fixing the problem, but has the advantage that I've actually been testing it with the THP patches, which means that it sees far more use than it does upstream -- indeed, Song has been entirely unable to reproduce it. It also has the advantage that it removes a few patches from my gargantuan backlog of THP patches. This patch (of 3): This function returns the order of the entry at the index. We need this because there isn't space in the shadow entry to encode its order. [akpm@linux-foundation.org: export xa_get_order to modules] Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: "Kirill A . Shutemov" Cc: Qian Cai Cc: Song Liu Link: https://lkml.kernel.org/r/20200903183029.14930-1-willy@infradead.org Link: https://lkml.kernel.org/r/20200903183029.14930-2-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/xarray.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index b4d70e7568b2..5b7f4ebcf4ff 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1505,6 +1505,15 @@ void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); +#ifdef CONFIG_XARRAY_MULTI +int xa_get_order(struct xarray *, unsigned long index); +#else +static inline int xa_get_order(struct xarray *xa, unsigned long index) +{ + return 0; +} +#endif + /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. -- cgit v1.2.3 From 8fc75643c5e14574c8be59b69182452ece28315a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:16 -0700 Subject: XArray: add xas_split In order to use multi-index entries for huge pages in the page cache, we need to be able to split a multi-index entry (eg if a file is truncated in the middle of a huge page entry). This version does not support splitting more than one level of the tree at a time. 
This is an acceptable limitation for the page cache as we do not expect to support order-12 pages in the near future. [akpm@linux-foundation.org: export xas_split_alloc() to modules] [willy@infradead.org: fix xarray split] Link: https://lkml.kernel.org/r/20200910175450.GV6583@casper.infradead.org [willy@infradead.org: fix xarray] Link: https://lkml.kernel.org/r/20201001233943.GW20115@casper.infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: "Kirill A . Shutemov" Cc: Qian Cai Cc: Song Liu Link: https://lkml.kernel.org/r/20200903183029.14930-3-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/xarray.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 5b7f4ebcf4ff..5cdf441f6377 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1507,11 +1507,24 @@ void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); +void xas_split(struct xa_state *, void *entry, unsigned int order); +void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } + +static inline void xas_split(struct xa_state *xas, void *entry, + unsigned int order) +{ + xas_store(xas, entry); +} + +static inline void xas_split_alloc(struct xa_state *xas, void *entry, + unsigned int order, gfp_t gfp) +{ +} #endif /** -- cgit v1.2.3 From 8fb156c9ee2db94f7127c930c89917634a1a9f56 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:05:29 -0700 Subject: mm/page_owner: change split_page_owner to take a count The implementation of split_page_owner() prefers a count rather than the old order of the page. When we support a variable size THP, we won't have the order at this point, but we will have the number of pages. So change the interface to what the caller and callee would prefer. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: SeongJae Park Acked-by: Kirill A. 
Shutemov Cc: Huang Ying Link: https://lkml.kernel.org/r/20200908195539.25896-4-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/page_owner.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 8679ccd722e8..3468794f83d2 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -11,7 +11,7 @@ extern struct page_ext_operations page_owner_ops; extern void __reset_page_owner(struct page *page, unsigned int order); extern void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask); -extern void __split_page_owner(struct page *page, unsigned int order); +extern void __split_page_owner(struct page *page, unsigned int nr); extern void __copy_page_owner(struct page *oldpage, struct page *newpage); extern void __set_page_owner_migrate_reason(struct page *page, int reason); extern void __dump_page_owner(struct page *page); @@ -31,10 +31,10 @@ static inline void set_page_owner(struct page *page, __set_page_owner(page, order, gfp_mask); } -static inline void split_page_owner(struct page *page, unsigned int order) +static inline void split_page_owner(struct page *page, unsigned int nr) { if (static_branch_unlikely(&page_owner_inited)) - __split_page_owner(page, order); + __split_page_owner(page, nr); } static inline void copy_page_owner(struct page *oldpage, struct page *newpage) { -- cgit v1.2.3 From 01c70267053d6718820ac0902d8823d5dd2a6adb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:00 -0700 Subject: fs: add a filesystem flag for THPs The page cache needs to know whether the filesystem supports THPs so that it doesn't send THPs to filesystems which can't handle them. Dave Chinner points out that getting from the page mapping to the filesystem type is too many steps (mapping->host->i_sb->s_type->fs_flags) so cache that information in the address space flags. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Alexander Viro Cc: "Matthew Wilcox (Oracle)" Cc: Hugh Dickins Cc: Song Liu Cc: Rik van Riel Cc: "Kirill A . Shutemov" Cc: Johannes Weiner Cc: Dave Chinner Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20200916032717.22917-1-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + include/linux/pagemap.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ae97d87a00d2..72369be23f91 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2209,6 +2209,7 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ +#define FS_THP_SUPPORT 8192 /* Remove once all fs converted */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. 
*/ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c3afd3242b54..820c970fd24a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -29,6 +29,7 @@ enum mapping_flags { AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, + AS_THP_SUPPORT = 6, /* THPs supported */ }; /** @@ -120,6 +121,11 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } +static inline bool mapping_thp_support(struct address_space *mapping) +{ + return test_bit(AS_THP_SUPPORT, &mapping->flags); +} + void release_pages(struct page **pages, int nr); /* -- cgit v1.2.3 From 6f4d2f9770cf154f9867f466d7b1b463a39f05a7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:03 -0700 Subject: fs: do not update nr_thps for mappings which support THPs The nr_thps counter is to support THPs in the page cache when the filesystem doesn't understand THPs. Eventually it will be removed, but we should still support filesystems which do not understand THPs yet. Move the nr_thp manipulation functions to filemap.h since they're page-cache specific. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Alexander Viro Cc: "Matthew Wilcox (Oracle)" Cc: Hugh Dickins Cc: Song Liu Cc: Rik van Riel Cc: "Kirill A . Shutemov" Cc: Johannes Weiner Cc: Dave Chinner Cc: Christoph Hellwig Link: https://lkml.kernel.org/r/20200916032717.22917-2-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/fs.h | 27 --------------------------- include/linux/pagemap.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 72369be23f91..d1d166b46131 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2697,33 +2697,6 @@ static inline errseq_t file_sample_sb_err(struct file *file) return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); } -static inline int filemap_nr_thps(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - return atomic_read(&mapping->nr_thps); -#else - return 0; -#endif -} - -static inline void filemap_nr_thps_inc(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - atomic_inc(&mapping->nr_thps); -#else - WARN_ON_ONCE(1); -#endif -} - -static inline void filemap_nr_thps_dec(struct address_space *mapping) -{ -#ifdef CONFIG_READ_ONLY_THP_FOR_FS - atomic_dec(&mapping->nr_thps); -#else - WARN_ON_ONCE(1); -#endif -} - extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 820c970fd24a..a0024528a9ee 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -126,6 +126,35 @@ static inline bool mapping_thp_support(struct address_space *mapping) return test_bit(AS_THP_SUPPORT, &mapping->flags); } +static inline int filemap_nr_thps(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + return atomic_read(&mapping->nr_thps); +#else + return 0; +#endif +} + +static inline void filemap_nr_thps_inc(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + if (!mapping_thp_support(mapping)) + atomic_inc(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + +static inline void filemap_nr_thps_dec(struct 
address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + if (!mapping_thp_support(mapping)) + atomic_dec(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + void release_pages(struct page **pages, int nr); /* -- cgit v1.2.3 From 1aa83cfa5a20a6bbd39d2355a89c95152e4b37b4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:10 -0700 Subject: mm/readahead: add DEFINE_READAHEAD Patch series "Readahead patches for 5.9/5.10". These are infrastructure for both the THP patchset and for the fscache rewrite, For both pieces of infrastructure being build on top of this patchset, we want the ractl to be available higher in the call-stack. For David's work, he wants to add the 'critical page' to the ractl so that he knows which page NEEDS to be brought in from storage, and which ones are nice-to-have. We might want something similar in block storage too. It used to be simple -- the first page was the critical one, but then mmap added fault-around and so for that usecase, the middle page is the critical one. Anyway, I don't have any code to show that yet, we just know that the lowest point in the callchain where we have that information is do_sync_mmap_readahead() and so the ractl needs to start its life there. For THP, we havew the code that needs it. It's actually the apex patch to the series; the one which finally starts to allocate THPs and present them to consenting filesystems: http://git.infradead.org/users/willy/pagecache.git/commitdiff/798bcf30ab2eff278caad03a9edca74d2f8ae760 This patch (of 8): Allow for a more concise definition of a struct readahead_control. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Eric Biggers Cc: David Howells Link: https://lkml.kernel.org/r/20200903140844.14194-1-willy@infradead.org Link: https://lkml.kernel.org/r/20200903140844.14194-3-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a0024528a9ee..63c81b512e80 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -812,6 +812,13 @@ struct readahead_control { unsigned int _batch_count; }; +#define DEFINE_READAHEAD(rac, f, m, i) \ + struct readahead_control rac = { \ + .file = f, \ + .mapping = m, \ + ._index = i, \ + } + /** * readahead_page - Get the next page to read. * @rac: The current readahead request. -- cgit v1.2.3 From 73bb49da50cd460bb3ba31250ed2e7fbf2115edf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:14 -0700 Subject: mm/readahead: make page_cache_ra_unbounded take a readahead_control Define it in the callers instead of in page_cache_ra_unbounded(). 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: David Howells Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-4-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 63c81b512e80..37f209ccef0f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -768,9 +768,8 @@ void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, void page_cache_async_readahead(struct address_space *, struct file_ra_state *, struct file *, struct page *, pgoff_t index, unsigned long req_count); -void page_cache_readahead_unbounded(struct address_space *, struct file *, - pgoff_t index, unsigned long nr_to_read, - unsigned long lookahead_count); +void page_cache_ra_unbounded(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_count); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: -- cgit v1.2.3 From fefa7c478fdafe71c64b5ddf817ac0271aed1146 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Oct 2020 20:06:28 -0700 Subject: mm/readahead: add page_cache_sync_ra and page_cache_async_ra Reimplement page_cache_sync_readahead() and page_cache_async_readahead() as wrappers around versions of the function which take a readahead_control in preparation for making do_sync_mmap_readahead() pass down an RAC struct. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: David Howells Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-8-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 64 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 37f209ccef0f..c77b7c31b2e4 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -761,16 +761,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) - -void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, - struct file *, pgoff_t index, unsigned long req_count); -void page_cache_async_readahead(struct address_space *, struct file_ra_state *, - struct file *, struct page *, pgoff_t index, - unsigned long req_count); -void page_cache_ra_unbounded(struct readahead_control *, - unsigned long nr_to_read, unsigned long lookahead_count); - /* * Like add_to_page_cache_locked, but used to add newly allocated pages: * the page is new, so we can just run __SetPageLocked() against it. 
@@ -818,6 +808,60 @@ struct readahead_control { ._index = i, \ } +#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) + +void page_cache_ra_unbounded(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_count); +void page_cache_sync_ra(struct readahead_control *, struct file_ra_state *, + unsigned long req_count); +void page_cache_async_ra(struct readahead_control *, struct file_ra_state *, + struct page *, unsigned long req_count); + +/** + * page_cache_sync_readahead - generic file readahead + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @file: Used by the filesystem for authentication. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. + * + * page_cache_sync_readahead() should be called when a cache miss happened: + * it will submit the read. The readahead logic may decide to piggyback more + * pages onto the read request if access patterns suggest it will improve + * performance. + */ +static inline +void page_cache_sync_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, pgoff_t index, + unsigned long req_count) +{ + DEFINE_READAHEAD(ractl, file, mapping, index); + page_cache_sync_ra(&ractl, ra, req_count); +} + +/** + * page_cache_async_readahead - file readahead for marked pages + * @mapping: address_space which holds the pagecache and I/O vectors + * @ra: file_ra_state which holds the readahead state + * @file: Used by the filesystem for authentication. + * @page: The page at @index which triggered the readahead call. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. + * + * page_cache_async_readahead() should be called when a page is used which + * is marked as PageReadahead; this is a marker to suggest that the application + * has used up enough of the readahead window that we should start pulling in + * more pages. + */ +static inline +void page_cache_async_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + struct page *page, pgoff_t index, unsigned long req_count) +{ + DEFINE_READAHEAD(ractl, file, mapping, index); + page_cache_async_ra(&ractl, ra, page, req_count); +} + /** * readahead_page - Get the next page to read. * @rac: The current readahead request. -- cgit v1.2.3 From 7e27f22c9e40b66186e0675376f0495725ff1b0a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:06:50 -0700 Subject: mm,hwpoison: unexport get_hwpoison_page and make it static Since get_hwpoison_page is only used in memory-failure code now, let us un-export it and make it private to that code. 
Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-5-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 620961e4f32b..1977c09afe7a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3025,7 +3025,6 @@ extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern int get_hwpoison_page(struct page *page); #define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; -- cgit v1.2.3 From dd6e2402fad966290f35dc687294fb6049714aac Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:06:57 -0700 Subject: mm,hwpoison: kill put_hwpoison_page After commit 4e41a30c6d50 ("mm: hwpoison: adjust for new thp refcounting"), put_hwpoison_page got reduced to a put_page. Let us just use put_page instead. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-7-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1977c09afe7a..ab038a3521b4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3025,7 +3025,6 @@ extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -#define put_hwpoison_page(page) put_page(page) extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); -- cgit v1.2.3 From 06be6ff3d2ec8be806b859fc054a1909b16d2473 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:07:05 -0700 Subject: mm,hwpoison: rework soft offline for free pages When trying to soft-offline a free page, we need to first take it off the buddy allocator. Once we know is out of reach, we can safely flag it as poisoned. take_page_off_buddy will be used to take a page meant to be poisoned off the buddy allocator. take_page_off_buddy calls break_down_buddy_pages, which splits a higher-order page in case our page belongs to one. Once the page is under our control, we call page_handle_poison to set it as poisoned and grab a refcount on it. 
Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-9-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 38ded408bd4c..a02b6d0221db 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -432,6 +432,7 @@ PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) extern bool set_hwpoison_free_buddy_page(struct page *page); +extern bool take_page_off_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison) static inline bool set_hwpoison_free_buddy_page(struct page *page) -- cgit v1.2.3 From 79f5f8fab482dfff62948214468ac4ebbf0a016f Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 15 Oct 2020 20:07:09 -0700 Subject: mm,hwpoison: rework soft offline for in-use pages This patch changes the way we set and handle in-use poisoned pages. Until now, poisoned pages were released to the buddy allocator, trusting that the checks that take place at allocation time would act as a safe net and would skip that page. This has proved to be wrong, as we got some pfn walkers out there, like compaction, that all they care is the page to be in a buddy freelist. Although this might not be the only user, having poisoned pages in the buddy allocator seems a bad idea as we should only have free pages that are ready and meant to be used as such. Before explaining the taken approach, let us break down the kind of pages we can soft offline. - Anonymous THP (after the split, they end up being 4K pages) - Hugetlb - Order-0 pages (that can be either migrated or invalited) * Normal pages (order-0 and anon-THP) - If they are clean and unmapped page cache pages, we invalidate then by means of invalidate_inode_page(). - If they are mapped/dirty, we do the isolate-and-migrate dance. Either way, do not call put_page directly from those paths. Instead, we keep the page and send it to page_handle_poison to perform the right handling. page_handle_poison sets the HWPoison flag and does the last put_page. Down the chain, we placed a check for HWPoison page in free_pages_prepare, that just skips any poisoned page, so those pages do not end up in any pcplist/freelist. After that, we set the refcount on the page to 1 and we increment the poisoned pages counter. If we see that the check in free_pages_prepare creates trouble, we can always do what we do for free pages: - wait until the page hits buddy's freelists - take it off, and flag it The downside of the above approach is that we could race with an allocation, so by the time we want to take the page off the buddy, the page has been already allocated so we cannot soft offline it. But the user could always retry it. * Hugetlb pages - We isolate-and-migrate them After the migration has been successful, we call dissolve_free_huge_page, and we set HWPoison on the page if we succeed. Hugetlb has a slightly different handling though. While for non-hugetlb pages we cared about closing the race with an allocation, doing so for hugetlb pages requires quite some additional and intrusive code (we would need to hook in free_huge_page and some other places). 
So I decided to not make the code overly complicated and just fail normally if the page we allocated in the meantime. We can always build on top of this. As a bonus, because of the way we handle now in-use pages, we no longer need the put-as-isolation-migratetype dance, that was guarding for poisoned pages to end up in pcplists. Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-10-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a02b6d0221db..4f6ba9379112 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -431,14 +431,9 @@ PAGEFLAG_FALSE(Uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) -extern bool set_hwpoison_free_buddy_page(struct page *page); extern bool take_page_off_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison) -static inline bool set_hwpoison_free_buddy_page(struct page *page) -{ - return 0; -} #define __PG_HWPOISON 0 #endif -- cgit v1.2.3 From 5d1fd5dc877bc1c670e7b1c174aa659b76c07de1 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 15 Oct 2020 20:07:21 -0700 Subject: mm,hwpoison: introduce MF_MSG_UNSPLIT_THP memory_failure() is supposed to call action_result() when it handles a memory error event, but there's one missing case. So let's add it. I find that include/ras/ras_event.h has some other MF_MSG_* undefined, so this patch also adds them. Signed-off-by: Naoya Horiguchi Signed-off-by: Oscar Salvador Signed-off-by: Andrew Morton Cc: "Aneesh Kumar K.V" Cc: Aneesh Kumar K.V Cc: Aristeu Rozanski Cc: Dave Hansen Cc: David Hildenbrand Cc: Dmitry Yakunin Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Qian Cai Cc: Tony Luck Link: https://lkml.kernel.org/r/20200922135650.1634-13-osalvador@suse.de Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ab038a3521b4..a9df46309e07 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3064,6 +3064,7 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_BUDDY_2ND, MF_MSG_DAX, + MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, }; -- cgit v1.2.3 From 257bea71582d895894201b604990a900df489103 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:07:59 -0700 Subject: mm/page_alloc: simplify __offline_isolated_pages() offline_pages() is the only user. __offline_isolated_pages() never gets called with ranges that contain memory holes and we no longer care about the return value. Drop the return value handling and all pfn_valid() checks. Update the documentation. 
Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Charan Teja Reddy Cc: Dan Williams Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Mel Gorman Cc: Mel Gorman Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Tony Luck Link: https://lkml.kernel.org/r/20200819175957.28465-5-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c0faa7a30c46..76b314031f09 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -103,8 +103,8 @@ extern int online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); -extern unsigned long __offline_isolated_pages(unsigned long start_pfn, - unsigned long end_pfn); +extern void __offline_isolated_pages(unsigned long start_pfn, + unsigned long end_pfn); typedef void (*online_page_callback_t)(struct page *page, unsigned int order); -- cgit v1.2.3 From d882c0067d99d0f2add9a41628703cc99511a639 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:19 -0700 Subject: mm: pass migratetype into memmap_init_zone() and move_pfn_range_to_zone() On the memory onlining path, we want to start with MIGRATE_ISOLATE, to un-isolate the pages after memory onlining is complete. Let's allow passing in the migratetype. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Wei Yang Cc: Baoquan He Cc: Pankaj Gupta Cc: Tony Luck Cc: Fenghua Yu Cc: Logan Gunthorpe Cc: Dan Williams Cc: Mike Rapoport Cc: "Matthew Wilcox (Oracle)" Cc: Michel Lespinasse Cc: Charan Teja Reddy Cc: Mel Gorman Link: https://lkml.kernel.org/r/20200819175957.28465-10-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 ++- include/linux/mm.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 76b314031f09..51a877fec8da 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -351,7 +351,8 @@ extern int add_memory_resource(int nid, struct resource *resource); extern int add_memory_driver_managed(int nid, u64 start, u64 size, const char *resource_name); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); + unsigned long nr_pages, + struct vmem_altmap *altmap, int migratetype); extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index a9df46309e07..61a2633fcc7f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2440,7 +2440,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum meminit_context, struct vmem_altmap *); + enum meminit_context, struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); -- cgit v1.2.3 From ec62d04e3fdc4ba3a7912cd7f6da1a4e787a0d75 
Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:28 -0700 Subject: kernel/resource: make release_mem_region_adjustable() never fail Patch series "selective merging of system ram resources", v4. Some add_memory*() users add memory in small, contiguous memory blocks. Examples include virtio-mem, hyper-v balloon, and the XEN balloon. This can quickly result in a lot of memory resources, whereby the actual resource boundaries are not of interest (e.g., it might be relevant for DIMMs, exposed via /proc/iomem to user space). We really want to merge added resources in this scenario where possible. Resources are effectively stored in a list-based tree. Having a lot of resources not only wastes memory, it also makes traversing that tree more expensive, and makes /proc/iomem explode in size (e.g., requiring kexec-tools to manually merge resources when creating a kdump header. The current kexec-tools resource count limit does not allow for more than ~100GB of memory with a memory block size of 128MB on x86-64). Let's allow to selectively merge system ram resources by specifying a new flag for add_memory*(). Patch #5 contains a /proc/iomem example. Only tested with virtio-mem. This patch (of 8): Let's make sure splitting a resource on memory hotunplug will never fail. This will become more relevant once we merge selected System RAM resources - then, we'll trigger that case more often on memory hotunplug. In general, this function is already unlikely to fail. When we remove memory, we free up quite a lot of metadata (memmap, page tables, memory block device, etc.). The only reason it could really fail would be when injecting allocation errors. All other error cases inside release_mem_region_adjustable() seem to be sanity checks if the function would be abused in different context - let's add WARN_ON_ONCE() in these cases so we can catch them. [natechancellor@gmail.com: fix use of ternary condition in release_mem_region_adjustable] Link: https://lkml.kernel.org/r/20200922060748.2452056-1-natechancellor@gmail.com Link: https://github.com/ClangBuiltLinux/linux/issues/1159 Signed-off-by: David Hildenbrand Signed-off-by: Nathan Chancellor Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. 
Wysocki" Cc: Roger Pau Monn Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-2-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 6c2b06fe8beb..52a91f5fa1a3 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -248,8 +248,8 @@ extern struct resource * __request_region(struct resource *, extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE -extern int release_mem_region_adjustable(struct resource *, resource_size_t, - resource_size_t); +extern void release_mem_region_adjustable(struct resource *, resource_size_t, + resource_size_t); #endif /* Wrappers for managed devices */ -- cgit v1.2.3 From 7cf603d17d9bddbda90c424b6f30c7bc2e6f48f2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:33 -0700 Subject: kernel/resource: move and rename IORESOURCE_MEM_DRIVER_MANAGED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IORESOURCE_MEM_DRIVER_MANAGED currently uses an unused PnP bit, which is always set to 0 by hardware. This is far from beautiful (and confusing), and the bit only applies to SYSRAM. So let's move it out of the bus-specific (PnP) defined bits. We'll add another SYSRAM specific bit soon. If we ever need more bits for other purposes, we can steal some from "desc", or reshuffle/regroup what we have. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Eric Biederman Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Roger Pau Monné Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-3-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 52a91f5fa1a3..d7620d7c941a 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -58,6 +58,9 @@ struct resource { #define IORESOURCE_EXT_TYPE_BITS 0x01000000 /* Resource extended types */ #define IORESOURCE_SYSRAM 0x01000000 /* System RAM (modifier) */ +/* IORESOURCE_SYSRAM specific bits. */ +#define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. 
*/ + #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ #define IORESOURCE_DISABLED 0x10000000 @@ -103,7 +106,6 @@ struct resource { #define IORESOURCE_MEM_32BIT (3<<3) #define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */ #define IORESOURCE_MEM_EXPANSIONROM (1<<6) -#define IORESOURCE_MEM_DRIVER_MANAGED (1<<7) /* PnP I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IO_16BIT_ADDR (1<<0) -- cgit v1.2.3 From 3a0aaefe4134951b4e89feb873c457428154530c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:39 -0700 Subject: mm/memory_hotplug: guard more declarations by CONFIG_MEMORY_HOTPLUG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We soon want to pass flags via a new type to add_memory() and friends. That revealed that we currently don't guard some declarations by CONFIG_MEMORY_HOTPLUG. While some definitions could be moved to different places, let's keep it minimal for now and use CONFIG_MEMORY_HOTPLUG for all functions only compiled with CONFIG_MEMORY_HOTPLUG. Wrap sparse_decode_mem_map() into CONFIG_MEMORY_HOTPLUG, it's only called from CONFIG_MEMORY_HOTPLUG code. While at it, remove allow_online_pfn_range(), which is no longer around, and mhp_notimplemented(), which is unused. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Pankaj Gupta Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Ard Biesheuvel Cc: Benjamin Herrenschmidt Cc: Boris Ostrovsky Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Heiko Carstens Cc: Jason Gunthorpe Cc: Jason Wang Cc: Juergen Gross Cc: Julien Grall Cc: Kees Cook Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. 
Wysocki" Cc: Roger Pau Monné Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vishal Verma Cc: Wei Liu Link: https://lkml.kernel.org/r/20200911103459.10306-4-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 51a877fec8da..1504b4d5ae6c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -247,13 +247,6 @@ static inline void zone_span_writelock(struct zone *zone) {} static inline void zone_span_writeunlock(struct zone *zone) {} static inline void zone_seqlock_init(struct zone *zone) {} -static inline int mhp_notimplemented(const char *func) -{ - printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); - dump_stack(); - return -ENOSYS; -} - static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } @@ -344,6 +337,7 @@ static inline void __remove_memory(int nid, u64 start, u64 size) {} extern void set_zone_contiguous(struct zone *zone); extern void clear_zone_contiguous(struct zone *zone); +#ifdef CONFIG_MEMORY_HOTPLUG extern void __ref free_area_init_core_hotplug(int nid); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); @@ -364,8 +358,8 @@ extern void sparse_remove_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, - int online_type); extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages); +#endif /* CONFIG_MEMORY_HOTPLUG */ + #endif /* __LINUX_MEMORY_HOTPLUG_H */ -- cgit v1.2.3 From b6117199787c60539105d2de0d010146e8396fc3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:44 -0700 Subject: mm/memory_hotplug: prepare passing flags to add_memory() and friends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We soon want to pass flags, e.g., to mark added System RAM resources. mergeable. Prepare for that. This patch is based on a similar patch by Oscar Salvador: https://lkml.kernel.org/r/20190625075227.15193-3-osalvador@suse.de Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Juergen Gross # Xen related part Reviewed-by: Pankaj Gupta Acked-by: Wei Liu Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Baoquan He Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Greg Kroah-Hartman Cc: Vishal Verma Cc: Dave Jiang Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: David Hildenbrand Cc: "Michael S. 
Tsirkin" Cc: Jason Wang Cc: Boris Ostrovsky Cc: Stefano Stabellini Cc: "Oliver O'Halloran" Cc: Pingfan Liu Cc: Nathan Lynch Cc: Libor Pechacek Cc: Anton Blanchard Cc: Leonardo Bras Cc: Ard Biesheuvel Cc: Eric Biederman Cc: Julien Grall Cc: Kees Cook Cc: Roger Pau Monné Cc: Thomas Gleixner Cc: Wei Yang Link: https://lkml.kernel.org/r/20200911103459.10306-5-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 1504b4d5ae6c..33eb80fdba22 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -57,6 +57,12 @@ enum { MMOP_ONLINE_MOVABLE, }; +/* Flags for add_memory() and friends to specify memory hotplug details. */ +typedef int __bitwise mhp_t; + +/* No special request */ +#define MHP_NONE ((__force mhp_t)0) + /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) @@ -339,11 +345,13 @@ extern void clear_zone_contiguous(struct zone *zone); #ifdef CONFIG_MEMORY_HOTPLUG extern void __ref free_area_init_core_hotplug(int nid); -extern int __add_memory(int nid, u64 start, u64 size); -extern int add_memory(int nid, u64 start, u64 size); -extern int add_memory_resource(int nid, struct resource *resource); +extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); +extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); +extern int add_memory_resource(int nid, struct resource *resource, + mhp_t mhp_flags); extern int add_memory_driver_managed(int nid, u64 start, u64 size, - const char *resource_name); + const char *resource_name, + mhp_t mhp_flags); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap, int migratetype); -- cgit v1.2.3 From 9ca6551ee24368a4d2b09566ea4d10fe87860379 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:08:49 -0700 Subject: mm/memory_hotplug: MEMHP_MERGE_RESOURCE to specify merging of System RAM resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some add_memory*() users add memory in small, contiguous memory blocks. Examples include virtio-mem, hyper-v balloon, and the XEN balloon. This can quickly result in a lot of memory resources, whereby the actual resource boundaries are not of interest (e.g., it might be relevant for DIMMs, exposed via /proc/iomem to user space). We really want to merge added resources in this scenario where possible. Let's provide a flag (MEMHP_MERGE_RESOURCE) to specify that a resource either created within add_memory*() or passed via add_memory_resource() shall be marked mergeable and merged with applicable siblings. To implement that, we need a kernel/resource interface to mark selected System RAM resources mergeable (IORESOURCE_SYSRAM_MERGEABLE) and trigger merging. Note: We really want to merge after the whole operation succeeded, not directly when adding a resource to the resource tree (it would break add_memory_resource() and require splitting resources again when the operation failed - e.g., due to -ENOMEM). Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Pankaj Gupta Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Thomas Gleixner Cc: "K. Y. 
Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Cc: Roger Pau Monné Cc: Julien Grall Cc: Baoquan He Cc: Wei Yang Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: Dave Jiang Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Jason Wang Cc: Len Brown Cc: Leonardo Bras Cc: Libor Pechacek Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Nathan Lynch Cc: "Oliver O'Halloran" Cc: Paul Mackerras Cc: Pingfan Liu Cc: "Rafael J. Wysocki" Cc: Vasily Gorbik Cc: Vishal Verma Link: https://lkml.kernel.org/r/20200911103459.10306-6-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 ++++ include/linux/memory_hotplug.h | 7 +++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index d7620d7c941a..7e61389dcb01 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -60,6 +60,7 @@ struct resource { /* IORESOURCE_SYSRAM specific bits. */ #define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. */ +#define IORESOURCE_SYSRAM_MERGEABLE 0x04000000 /* Resource can be merged. */ #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ @@ -253,6 +254,9 @@ extern void __release_region(struct resource *, resource_size_t, extern void release_mem_region_adjustable(struct resource *, resource_size_t, resource_size_t); #endif +#ifdef CONFIG_MEMORY_HOTPLUG +extern void merge_system_ram_resource(struct resource *res); +#endif /* Wrappers for managed devices */ struct device; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 33eb80fdba22..d65c6fdc5cfc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -62,6 +62,13 @@ typedef int __bitwise mhp_t; /* No special request */ #define MHP_NONE ((__force mhp_t)0) +/* + * Allow merging of the added System RAM resource with adjacent, + * mergeable resources. After a successful call to add_memory_resource() + * with this flag set, the resource pointer must no longer be used as it + * might be stale, or the resource might have changed. + */ +#define MEMHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) /* * Extended parameters for memory hotplug: -- cgit v1.2.3 From cb8e3c8b4f45e4ed8987a581956dc9c3827a5bcf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 15 Oct 2020 20:09:12 -0700 Subject: kernel/resource: make iomem_resource implicit in release_mem_region_adjustable() "mem" in the name already indicates the root, similar to release_mem_region() and devm_request_mem_region(). Make it implicit. The only single caller always passes iomem_resource, other parents are not applicable. 
Suggested-by: Wei Yang Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Cc: Michal Hocko Cc: Dan Williams Cc: Jason Gunthorpe Cc: Kees Cook Cc: Ard Biesheuvel Cc: Pankaj Gupta Cc: Baoquan He Link: https://lkml.kernel.org/r/20200916073041.10355-1-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 7e61389dcb01..5135d4b86cd6 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -251,8 +251,7 @@ extern struct resource * __request_region(struct resource *, extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE -extern void release_mem_region_adjustable(struct resource *, resource_size_t, - resource_size_t); +extern void release_mem_region_adjustable(resource_size_t, resource_size_t); #endif #ifdef CONFIG_MEMORY_HOTPLUG extern void merge_system_ram_resource(struct resource *res); -- cgit v1.2.3 From 90c7eaeb14a325a760d732184ff1fbed47e5fa98 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 15 Oct 2020 20:09:15 -0700 Subject: mm: don't panic when links can't be created in sysfs At boot time, or when doing memory hot-add operations, if the links in sysfs can't be created, the system is still able to run, so just report the error in the kernel log rather than BUG_ON and potentially make system unusable because the callpath can be called with locks held. Since the number of memory blocks managed could be high, the messages are rate limited. As a consequence, link_mem_sections() has no status to report anymore. Signed-off-by: Laurent Dufour Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Fenghua Yu Cc: Nathan Lynch Cc: "Rafael J . 
Wysocki" Cc: Scott Cheloha Cc: Tony Luck Link: https://lkml.kernel.org/r/20200915094143.79181-4-ldufour@linux.ibm.com Signed-off-by: Linus Torvalds --- include/linux/node.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/node.h b/include/linux/node.h index 014ba3ab2efd..8e5a29897936 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -99,15 +99,14 @@ extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) -int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn, - enum meminit_context context); +void link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context); #else -static inline int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn, - enum meminit_context context) +static inline void link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context) { - return 0; } #endif @@ -130,8 +129,7 @@ static inline int register_one_node(int nid) if (error) return error; /* link memory sections under this node */ - error = link_mem_sections(nid, start_pfn, end_pfn, - MEMINIT_EARLY); + link_mem_sections(nid, start_pfn, end_pfn, MEMINIT_EARLY); } return error; -- cgit v1.2.3 From ed0173733dd468883198c3136284394320b8fad6 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 15 Oct 2020 20:09:55 -0700 Subject: mm: use self-explanatory macros rather than "2" Signed-off-by: Yu Zhao Signed-off-by: Andrew Morton Cc: Alex Shi Link: http://lkml.kernel.org/r/20200831175042.3527153-2-yuzhao@google.com Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ++++++++---- include/linux/vmstat.h | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c27fb1faffe5..7e0ea3fe95ca 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -266,6 +266,8 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } +#define ANON_AND_FILE 2 + enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI @@ -283,8 +285,8 @@ struct lruvec { unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; - /* Refaults at the time of last reclaim cycle, anon=0, file=1 */ - unsigned long refaults[2]; + /* Refaults at the time of last reclaim cycle */ + unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_MEMCG @@ -441,6 +443,8 @@ enum zone_type { #ifndef __GENERATING_BOUNDS_H +#define ASYNC_AND_SYNC 2 + struct zone { /* Read-mostly fields */ @@ -560,8 +564,8 @@ struct zone { #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; - /* pfn where async and sync compaction migration scanner should start */ - unsigned long compact_cached_migrate_pfn[2]; + /* pfn where compaction migration scanner should start */ + unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 7557c1070fd7..322dcbfcc933 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ 
-28,7 +28,7 @@ struct reclaim_stat { unsigned nr_writeback; unsigned nr_immediate; unsigned nr_pageout; - unsigned nr_activate[2]; + unsigned nr_activate[ANON_AND_FILE]; unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; -- cgit v1.2.3 From 1f0f8c0de09066d23760c1f5fac2cd53b32f1127 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 15 Oct 2020 20:10:11 -0700 Subject: include/linux/mmzone.h: remove unused early_pfn_valid() The early_pfn_valid() macro is defined but it is never used. Remove it. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Acked-by: David Hildenbrand Link: https://lkml.kernel.org/r/20200923162915.26935-1-rppt@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7e0ea3fe95ca..fb3bf696c05e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1420,7 +1420,6 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr) #define pfn_to_nid(pfn) (0) #endif -#define early_pfn_valid(pfn) pfn_valid(pfn) void sparse_init(void); #else #define sparse_init() do {} while (0) @@ -1440,10 +1439,6 @@ struct mminit_pfnnid_cache { int last_nid; }; -#ifndef early_pfn_valid -#define early_pfn_valid(pfn) (1) -#endif - /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we * need to check pfn validity within that MAX_ORDER_NR_PAGES block. -- cgit v1.2.3 From b296a6d53339a79082c1d2c1761e948e8b3def69 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Oct 2020 20:10:21 -0700 Subject: kernel.h: split out min()/max() et al. helpers kernel.h is being used as a dump for all kinds of stuff for a long time. Here is the attempt to start cleaning it up by splitting out min()/max() et al. helpers. At the same time convert users in header and lib folder to use new header. Though for time being include new header back to kernel.h to avoid twisted indirected includes for other existing users. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt Cc: Rasmus Villemoes Cc: Joe Perches Cc: Linus Torvalds Link: https://lkml.kernel.org/r/20200910164152.GA1891694@smile.fi.intel.com Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 1 + include/linux/bvec.h | 6 +- include/linux/jiffies.h | 3 +- include/linux/kernel.h | 150 +--------------------------------------------- include/linux/minmax.h | 153 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nodemask.h | 2 +- include/linux/uaccess.h | 1 + 7 files changed, 164 insertions(+), 152 deletions(-) create mode 100644 include/linux/minmax.h (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c09375e0a0eb..639cae2c158b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/bvec.h b/include/linux/bvec.h index dd74503f7e5e..2efec10bf792 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -7,10 +7,14 @@ #ifndef __LINUX_BVEC_ITER_H #define __LINUX_BVEC_ITER_H -#include #include #include +#include +#include #include +#include + +struct page; /** * struct bio_vec - a contiguous range of physical memory addresses diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fed6ba96c527..5e13f801c902 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -3,8 +3,9 @@ #define _LINUX_JIFFIES_H #include +#include #include -#include +#include #include #include #include diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e4aa29b1ad62..c629215fdad9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -833,155 +834,6 @@ ftrace_vprintk(const char *fmt, va_list ap) static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #endif /* CONFIG_TRACING */ -/* - * min()/max()/clamp() macros must accomplish three things: - * - * - avoid multiple evaluations of the arguments (so side-effects like - * "x++" happen only once) when non-constant. - * - perform strict type-checking (to generate warnings instead of - * nasty runtime surprises). See the "unnecessary" pointer comparison - * in __typecheck(). - * - retain result as a constant expressions when called with only - * constant expressions (to avoid tripping VLA warnings in stack - * allocation usage). - */ -#define __typecheck(x, y) \ - (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) - -/* - * This returns a constant expression while determining if an argument is - * a constant expression, most importantly without evaluating the argument. - * Glory to Martin Uecker - */ -#define __is_constexpr(x) \ - (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) - -#define __no_side_effects(x, y) \ - (__is_constexpr(x) && __is_constexpr(y)) - -#define __safe_cmp(x, y) \ - (__typecheck(x, y) && __no_side_effects(x, y)) - -#define __cmp(x, y, op) ((x) op (y) ? 
(x) : (y)) - -#define __cmp_once(x, y, unique_x, unique_y, op) ({ \ - typeof(x) unique_x = (x); \ - typeof(y) unique_y = (y); \ - __cmp(unique_x, unique_y, op); }) - -#define __careful_cmp(x, y, op) \ - __builtin_choose_expr(__safe_cmp(x, y), \ - __cmp(x, y, op), \ - __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) - -/** - * min - return minimum of two values of the same or compatible types - * @x: first value - * @y: second value - */ -#define min(x, y) __careful_cmp(x, y, <) - -/** - * max - return maximum of two values of the same or compatible types - * @x: first value - * @y: second value - */ -#define max(x, y) __careful_cmp(x, y, >) - -/** - * min3 - return minimum of three values - * @x: first value - * @y: second value - * @z: third value - */ -#define min3(x, y, z) min((typeof(x))min(x, y), z) - -/** - * max3 - return maximum of three values - * @x: first value - * @y: second value - * @z: third value - */ -#define max3(x, y, z) max((typeof(x))max(x, y), z) - -/** - * min_not_zero - return the minimum that is _not_ zero, unless both are zero - * @x: value1 - * @y: value2 - */ -#define min_not_zero(x, y) ({ \ - typeof(x) __x = (x); \ - typeof(y) __y = (y); \ - __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) - -/** - * clamp - return a value clamped to a given range with strict typechecking - * @val: current value - * @lo: lowest allowable value - * @hi: highest allowable value - * - * This macro does strict typechecking of @lo/@hi to make sure they are of the - * same type as @val. See the unnecessary pointer comparisons. - */ -#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) - -/* - * ..and if you can't take the strict - * types, you can specify one yourself. - * - * Or not use min/max/clamp at all, of course. - */ - -/** - * min_t - return minimum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value - */ -#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) - -/** - * max_t - return maximum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value - */ -#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) - -/** - * clamp_t - return a value clamped to a given range using a given type - * @type: the type of variable to use - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of type - * @type to make all the comparisons. - */ -#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi) - -/** - * clamp_val - return a value clamped to a given range using val's type - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of whatever - * type the input argument @val is. This is useful when @val is an unsigned - * type and @lo and @hi are literals that will otherwise be assigned a signed - * integer type. - */ -#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) - - -/** - * swap - swap values of @a and @b - * @a: first value - * @b: second value - */ -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - /* This counts to 12. Any more, it will return 13th argument. */ #define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n #define COUNT_ARGS(X...) 
__COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) diff --git a/include/linux/minmax.h b/include/linux/minmax.h new file mode 100644 index 000000000000..c0f57b0c64d9 --- /dev/null +++ b/include/linux/minmax.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MINMAX_H +#define _LINUX_MINMAX_H + +/* + * min()/max()/clamp() macros must accomplish three things: + * + * - avoid multiple evaluations of the arguments (so side-effects like + * "x++" happen only once) when non-constant. + * - perform strict type-checking (to generate warnings instead of + * nasty runtime surprises). See the "unnecessary" pointer comparison + * in __typecheck(). + * - retain result as a constant expressions when called with only + * constant expressions (to avoid tripping VLA warnings in stack + * allocation usage). + */ +#define __typecheck(x, y) \ + (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) + +/* + * This returns a constant expression while determining if an argument is + * a constant expression, most importantly without evaluating the argument. + * Glory to Martin Uecker + */ +#define __is_constexpr(x) \ + (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8))) + +#define __no_side_effects(x, y) \ + (__is_constexpr(x) && __is_constexpr(y)) + +#define __safe_cmp(x, y) \ + (__typecheck(x, y) && __no_side_effects(x, y)) + +#define __cmp(x, y, op) ((x) op (y) ? (x) : (y)) + +#define __cmp_once(x, y, unique_x, unique_y, op) ({ \ + typeof(x) unique_x = (x); \ + typeof(y) unique_y = (y); \ + __cmp(unique_x, unique_y, op); }) + +#define __careful_cmp(x, y, op) \ + __builtin_choose_expr(__safe_cmp(x, y), \ + __cmp(x, y, op), \ + __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) + +/** + * min - return minimum of two values of the same or compatible types + * @x: first value + * @y: second value + */ +#define min(x, y) __careful_cmp(x, y, <) + +/** + * max - return maximum of two values of the same or compatible types + * @x: first value + * @y: second value + */ +#define max(x, y) __careful_cmp(x, y, >) + +/** + * min3 - return minimum of three values + * @x: first value + * @y: second value + * @z: third value + */ +#define min3(x, y, z) min((typeof(x))min(x, y), z) + +/** + * max3 - return maximum of three values + * @x: first value + * @y: second value + * @z: third value + */ +#define max3(x, y, z) max((typeof(x))max(x, y), z) + +/** + * min_not_zero - return the minimum that is _not_ zero, unless both are zero + * @x: value1 + * @y: value2 + */ +#define min_not_zero(x, y) ({ \ + typeof(x) __x = (x); \ + typeof(y) __y = (y); \ + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) + +/** + * clamp - return a value clamped to a given range with strict typechecking + * @val: current value + * @lo: lowest allowable value + * @hi: highest allowable value + * + * This macro does strict typechecking of @lo/@hi to make sure they are of the + * same type as @val. See the unnecessary pointer comparisons. + */ +#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) + +/* + * ..and if you can't take the strict + * types, you can specify one yourself. + * + * Or not use min/max/clamp at all, of course. 
+ */ + +/** + * min_t - return minimum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) + +/** + * max_t - return maximum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) + +/** + * clamp_t - return a value clamped to a given range using a given type + * @type: the type of variable to use + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of type + * @type to make all the comparisons. + */ +#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi) + +/** + * clamp_val - return a value clamped to a given range using val's type + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of whatever + * type the input argument @val is. This is useful when @val is an unsigned + * type and @lo and @hi are literals that will otherwise be assigned a signed + * integer type. + */ +#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) + +/** + * swap - swap values of @a and @b + * @a: first value + * @b: second value + */ +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +#endif /* _LINUX_MINMAX_H */ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 3334ce056335..ac398e143c9a 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -90,9 +90,9 @@ * for such situations. See below and CPUMASK_ALLOC also. */ -#include #include #include +#include #include typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1ae36bc8db35..ef084eacaa7c 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -3,6 +3,7 @@ #define __LINUX_UACCESS_H__ #include +#include #include #include -- cgit v1.2.3 From 3b6742618ed9216dd6caad968fe8c83b32dff485 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 15 Oct 2020 20:11:17 -0700 Subject: lib/idr.c: document calling context for IDA APIs mustn't use locks The documentation for these functions indicates that callers don't need to hold a lock while calling them, but that documentation is only in one place under "IDA Usage". Let's state the same information on each IDA function so that it's clear what the calling context requires. Furthermore, let's document ida_simple_get() with the same information so that callers know how this API works. Signed-off-by: Stephen Boyd Signed-off-by: Andrew Morton Reviewed-by: Greg Kroah-Hartman Cc: Tri Vo Cc: Jonathan Corbet Cc: Matthew Wilcox Link: https://lkml.kernel.org/r/20200910055246.2297797-1-swboyd@chromium.org Signed-off-by: Linus Torvalds --- include/linux/idr.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index 3ade03e5c7af..b235ed987021 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -263,7 +263,8 @@ void ida_destroy(struct ida *ida); * * Allocate an ID between 0 and %INT_MAX, inclusive. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. 
* Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ @@ -280,7 +281,8 @@ static inline int ida_alloc(struct ida *ida, gfp_t gfp) * * Allocate an ID between @min and %INT_MAX, inclusive. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ @@ -297,7 +299,8 @@ static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) * * Allocate an ID between 0 and @max, inclusive. * - * Context: Any context. + * Context: Any context. It is safe to call this function without + * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ -- cgit v1.2.3 From 3264ceec8f17a99a3895de7de06b4d7e9c8f3f30 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 15 Oct 2020 20:11:21 -0700 Subject: lib/idr.c: document that ida_simple_{get,remove}() are deprecated These two functions are deprecated. Users should call ida_alloc() or ida_free() respectively instead. Add documentation to this effect until the macro can be removed. Signed-off-by: Stephen Boyd Signed-off-by: Andrew Morton Reviewed-by: Tri Vo Cc: Greg KH Cc: Jonathan Corbet Cc: Matthew Wilcox Link: https://lkml.kernel.org/r/20200910055246.2297797-2-swboyd@chromium.org Signed-off-by: Linus Torvalds --- include/linux/idr.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index b235ed987021..a0dce14090a9 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -314,6 +314,10 @@ static inline void ida_init(struct ida *ida) xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } +/* + * ida_simple_get() and ida_simple_remove() are deprecated. Use + * ida_alloc() and ida_free() instead respectively. + */ #define ida_simple_get(ida, start, end, gfp) \ ida_alloc_range(ida, start, (end) - 1, gfp) #define ida_simple_remove(ida, id) ida_free(ida, id) -- cgit v1.2.3 From e130816164e244b692921de49771eeb28205152d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 15 Oct 2020 20:11:31 -0700 Subject: include/linux/list.h: add a macro to test if entry is pointing to the head Add a macro to test if entry is pointing to the head of the list which is useful in cases like: list_for_each_entry(pos, &head, member) { if (cond) break; } if (list_entry_is_head(pos, &head, member)) return -ERRNO; that allows to avoid additional variable to be added to track if loop has not been stopped in the middle. While here, convert list_for_each_entry*() family of macros to use a new one. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Reviewed-by: Cezary Rojewski Link: https://lkml.kernel.org/r/20200929134342.51489-1-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Torvalds --- include/linux/list.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 0d0d17a10d25..a18c87b63376 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -609,6 +609,15 @@ static inline void list_splice_tail_init(struct list_head *list, pos != (head); \ pos = n, n = pos->prev) +/** + * list_entry_is_head - test if the entry points to the head of the list + * @pos: the type * to cursor + * @head: the head for your list. 
+ * @member: the name of the list_head within the struct. + */ +#define list_entry_is_head(pos, head, member) \ + (&pos->member == (head)) + /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. @@ -617,7 +626,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -628,7 +637,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -653,7 +662,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -667,7 +676,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -679,7 +688,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** @@ -692,7 +701,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate backwards over list of given type, continuing from current position. 
*/ #define list_for_each_entry_from_reverse(pos, head, member) \ - for (; &pos->member != (head); \ + for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** @@ -705,7 +714,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -721,7 +730,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -736,7 +745,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** @@ -752,7 +761,7 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ - &pos->member != (head); \ + !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** -- cgit v1.2.3 From a9eb63705e379f10a3c9d13fc6aee8b50805e862 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 15 Oct 2020 20:11:41 -0700 Subject: bitops: simplify get_count_order_long() These two cases could be unified into one. Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Cc: Christian Brauner Cc: Andy Shevchenko Link: https://lkml.kernel.org/r/20200807085837.11697-2-richard.weiyang@linux.alibaba.com Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 99f2ac30b1d9..030a98f0c452 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -206,10 +206,7 @@ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; - else if (l & (l - 1UL)) - return (int)fls_long(l); - else - return (int)fls_long(l) - 1; + return (int)fls_long(--l); } /** -- cgit v1.2.3 From 004fba1ae6ddd66ba0faa4f60c603b3ca77b3554 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 15 Oct 2020 20:11:46 -0700 Subject: bitops: use the same mechanism for get_count_order[_long] These two functions share the same logic. 
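To illustrate why the unified form is equivalent, here is a small stand-alone check; it is user-space C that uses a GCC builtin in place of the kernel's fls(), purely to exercise the logic, and is not part of the patch:

#include <assert.h>
#include <stdio.h>

/*
 * User-space stand-in: fls(x) is the 1-based position of the most
 * significant set bit, with fls(0) == 0, mirroring the kernel helper.
 */
static int fls_demo(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

/* The simplified logic: smallest order with (1U << order) >= count. */
static int get_count_order_demo(unsigned int count)
{
	if (count == 0)
		return -1;
	return fls_demo(--count);
}

int main(void)
{
	assert(get_count_order_demo(1) == 0);	/* 2^0 = 1  >= 1  */
	assert(get_count_order_demo(16) == 4);	/* 2^4 = 16 >= 16 */
	assert(get_count_order_demo(17) == 5);	/* 2^5 = 32 >= 17 */
	printf("orders: %d %d %d\n", get_count_order_demo(1),
	       get_count_order_demo(16), get_count_order_demo(17));
	return 0;
}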
Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Cc: Christian Brauner Cc: Andy Shevchenko Link: https://lkml.kernel.org/r/20200807085837.11697-3-richard.weiyang@linux.alibaba.com Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 030a98f0c452..5b74bdf159d6 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -188,12 +188,10 @@ static inline unsigned fls_long(unsigned long l) static inline int get_count_order(unsigned int count) { - int order; + if (count == 0) + return -1; - order = fls(count) - 1; - if (count & (count - 1)) - order++; - return order; + return fls(--count); } /** -- cgit v1.2.3 From afc63a97b764bc5a715762d0d9cc9785c2ef4e75 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:46 -0700 Subject: coredump: refactor page range dumping into common helper Both fs/binfmt_elf.c and fs/binfmt_elf_fdpic.c need to dump ranges of pages into the coredump file. Extract that logic into a common helper. Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-4-jannh@google.com Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 7a899e83835d..f0b71a74d0bc 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -16,6 +16,8 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); extern void dump_truncate(struct coredump_params *cprm); +int dump_user_range(struct coredump_params *cprm, unsigned long start, + unsigned long len); #ifdef CONFIG_COREDUMP extern void do_coredump(const kernel_siginfo_t *siginfo); #else -- cgit v1.2.3 From 429a22e776a2b9f85a2b9c53d8e647598b553dd1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:50 -0700 Subject: coredump: rework elf/elf_fdpic vma_dump_size() into common helper At the moment, the binfmt_elf and binfmt_elf_fdpic code have slightly different code to figure out which VMAs should be dumped, and if so, whether the dump should contain the entire VMA or just its first page. Eliminate duplicate code by reworking the binfmt_elf version into a generic core dumping helper in coredump.c. As part of that, change the heuristic for detecting executable/library header pages to check whether the inode is executable instead of looking at the file mode. This is less problematic in terms of locking because it lets us avoid get_user() under the mmap_sem. (And arguably it looks nicer and makes more sense in generic code.) Adjust a little bit based on the binfmt_elf_fdpic version: ->anon_vma is only meaningful under CONFIG_MMU, otherwise we have to assume that the VMA has been written to. Suggested-by: Linus Torvalds Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . 
Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-5-jannh@google.com Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index f0b71a74d0bc..bfecb8d79a7f 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -16,6 +16,7 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); extern void dump_truncate(struct coredump_params *cprm); +unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); #ifdef CONFIG_COREDUMP -- cgit v1.2.3 From a07279c9a8cd7dbd321640ff7210591599ee00a4 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:12:54 -0700 Subject: binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot In both binfmt_elf and binfmt_elf_fdpic, use a new helper dump_vma_snapshot() to take a snapshot of the VMA list (including the gate VMA, if we have one) while protected by the mmap_lock, and then use that snapshot instead of walking the VMA list without locking. An alternative approach would be to keep the mmap_lock held across the entire core dumping operation; however, keeping the mmap_lock locked while we may be blocked for an unbounded amount of time (e.g. because we're dumping to a FUSE filesystem or so) isn't really optimal; the mmap_lock blocks things like the ->release handler of userfaultfd, and we don't really want critical system daemons to grind to a halt just because someone "gifted" them SCM_RIGHTS to an eternally-locked userfaultfd, or something like that. Since both the normal ELF code and the FDPIC ELF code need this functionality (and if any other binfmt wants to add coredump support in the future, they'd probably need it, too), implement this with a common helper in fs/coredump.c. A downside of this approach is that we now need a bigger amount of kernel memory per userspace VMA in the normal ELF case, and that we need O(n) kernel memory in the FDPIC ELF case at all; but 40 bytes per VMA shouldn't be terribly bad. There currently is a data race between stack expansion and anything that reads ->vm_start or ->vm_end under the mmap_lock held in read mode; to mitigate that for core dumping, take the mmap_lock in write mode when taking a snapshot of the VMA hierarchy. (If we only took the mmap_lock in read mode, we could end up with a corrupted core dump if someone does get_user_pages_remote() concurrently. Not really a major problem, but taking the mmap_lock either way works here, so we might as well avoid the issue.) (This doesn't do anything about the existing data races with stack expansion in other mm code.) Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . 
Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-6-jannh@google.com Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index bfecb8d79a7f..e58e8c207782 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -7,6 +7,12 @@ #include #include +struct core_vma_metadata { + unsigned long start, end; + unsigned long flags; + unsigned long dump_size; +}; + /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. @@ -16,9 +22,11 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); extern void dump_truncate(struct coredump_params *cprm); -unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); +int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, + struct core_vma_metadata **vma_meta, + size_t *vma_data_size_ptr); #ifdef CONFIG_COREDUMP extern void do_coredump(const kernel_siginfo_t *siginfo); #else -- cgit v1.2.3 From 4d45e75a9955ade5c2f49bd96fc4173b2cec9a72 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 15 Oct 2020 20:13:00 -0700 Subject: mm: remove the now-unnecessary mmget_still_valid() hack The preceding patches have ensured that core dumping properly takes the mmap_lock. Thanks to that, we can now remove mmget_still_valid() and all its users. Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Oleg Nesterov Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200827114932.3572699-8-jannh@google.com Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 15bfb06f2884..981e34cb1409 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,31 +49,6 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } -/* - * This has to be called after a get_task_mm()/mmget_not_zero() - * followed by taking the mmap_lock for writing before modifying the - * vmas or anything the coredump pretends not to change from under it. - * - * It also has to be called when mmgrab() is used in the context of - * the process, but then the mm_count refcount is transferred outside - * the context of the process to run down_write() on that pinned mm. - * - * NOTE: find_extend_vma() called from GUP context is the only place - * that can modify the "mm" (notably the vm_start/end) under mmap_lock - * for reading and outside the context of the process, so it is also - * the only case that holds the mmap_lock for reading that must call - * this function. Generally if the mmap_lock is hold for reading - * there's no need of this check after get_task_mm()/mmget_not_zero(). - * - * This function can be obsoleted and the check can be removed, after - * the coredump code will hold the mmap_lock for writing before - * invoking the ->core_dump methods. 
- */ -static inline bool mmget_still_valid(struct mm_struct *mm) -{ - return likely(!mm->core_state); -} - /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. -- cgit v1.2.3 From 5cf53f3ce3b9ff5321b56f9ed9d90d59307be7d0 Mon Sep 17 00:00:00 2001 From: Elena Petrova Date: Thu, 15 Oct 2020 20:13:35 -0700 Subject: sched.h: drop in_ubsan field when UBSAN is in trap mode in_ubsan field of task_struct is only used in lib/ubsan.c, which in its turn is used only `ifneq ($(CONFIG_UBSAN_TRAP),y)`. Removing unnecessary field from a task_struct will help preserve the ABI between vanilla and CONFIG_UBSAN_TRAP'ed kernels. In particular, this will help enabling bounds sanitizer transparently for Android's GKI. Signed-off-by: Elena Petrova Signed-off-by: Andrew Morton Acked-by: Kees Cook Cc: Jann Horn Link: https://lkml.kernel.org/r/20200910134802.3160311-1-lenaptr@google.com Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9030f3abd969..063cd120b459 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1013,7 +1013,7 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif -#ifdef CONFIG_UBSAN +#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif -- cgit v1.2.3 From 2c739ced5886cd8c8361faa79a9522ec05174ed0 Mon Sep 17 00:00:00 2001 From: Albert van der Linde Date: Thu, 15 Oct 2020 20:13:46 -0700 Subject: lib, include/linux: add usercopy failure capability Patch series "add fault injection to user memory access", v3. The goal of this series is to improve testing of fault-tolerance in usages of user memory access functions, by adding support for fault injection. syzkaller/syzbot are using the existing fault injection modes and will use this particular feature also. The first patch adds failure injection capability for usercopy functions. The second changes usercopy functions to use this new failure capability (copy_from_user, ...). The third patch adds get/put/clear_user failures to x86. This patch (of 3): Add a failure injection capability to improve testing of fault-tolerance in usages of user memory access functions. Add CONFIG_FAULT_INJECTION_USERCOPY to enable faults in usercopy functions. The should_fail_usercopy function is to be called by these functions (copy_from_user, get_user, ...) in order to fail or not. Signed-off-by: Albert van der Linde Signed-off-by: Andrew Morton Reviewed-by: Akinobu Mita Reviewed-by: Alexander Potapenko Cc: Borislav Petkov Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Peter Zijlstra (Intel) Cc: "H. 
Peter Anvin" Cc: Al Viro Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Marco Elver Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20200831171733.955393-1-alinde@google.com Link: http://lkml.kernel.org/r/20200831171733.955393-2-alinde@google.com Signed-off-by: Linus Torvalds --- include/linux/fault-inject-usercopy.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 include/linux/fault-inject-usercopy.h (limited to 'include/linux') diff --git a/include/linux/fault-inject-usercopy.h b/include/linux/fault-inject-usercopy.h new file mode 100644 index 000000000000..56c3a693fdd9 --- /dev/null +++ b/include/linux/fault-inject-usercopy.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_FAULT_INJECT_USERCOPY_H__ +#define __LINUX_FAULT_INJECT_USERCOPY_H__ + +/* + * This header provides a wrapper for injecting failures to user space memory + * access functions. + */ + +#include + +#ifdef CONFIG_FAULT_INJECTION_USERCOPY + +bool should_fail_usercopy(void); + +#else + +static inline bool should_fail_usercopy(void) { return false; } + +#endif /* CONFIG_FAULT_INJECTION_USERCOPY */ + +#endif /* __LINUX_FAULT_INJECT_USERCOPY_H__ */ -- cgit v1.2.3 From 4d0e9df5e43dba52d38b251e3b909df8fa1110be Mon Sep 17 00:00:00 2001 From: Albert van der Linde Date: Thu, 15 Oct 2020 20:13:50 -0700 Subject: lib, uaccess: add failure injection to usercopy functions To test fault-tolerance of user memory access functions, introduce fault injection to usercopy functions. If a failure is expected return either -EFAULT or the total amount of bytes that were not copied. Signed-off-by: Albert van der Linde Signed-off-by: Andrew Morton Reviewed-by: Akinobu Mita Reviewed-by: Alexander Potapenko Cc: Al Viro Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Marco Elver Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20200831171733.955393-3-alinde@google.com Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ef084eacaa7c..1b8c9d6162bc 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,6 +2,7 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ +#include #include #include #include @@ -84,6 +85,8 @@ static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; instrument_copy_from_user(to, from, n); check_object_size(to, n, false); return raw_copy_from_user(to, from, n); @@ -105,6 +108,8 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + if (should_fail_usercopy()) + return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); @@ -114,6 +119,8 @@ static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); @@ -125,7 +132,7 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); - if (likely(access_ok(from, n))) { + if (!should_fail_usercopy() && likely(access_ok(from, n))) { instrument_copy_from_user(to, from, n); res = raw_copy_from_user(to, from, n); } @@ -143,6 +150,8 @@ static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); + if (should_fail_usercopy()) + return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); -- cgit v1.2.3 From 98447d65b4a7a59f8ea37dc6e5d743247d9a7b01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 14 Oct 2020 10:48:51 -0600 Subject: io_uring: move io identity items into separate struct io-wq contains a pointer to the identity, which we just hold in io_kiocb for now. This is in preparation for putting this outside io_kiocb. The only exception is struct files_struct, which we'll need different rules for to avoid a circular dependency. No functional changes in this patch. 
Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 96315cfaf6d1..352aa6bbd36b 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -4,7 +4,18 @@ #include #include -#include + +struct io_identity { + struct files_struct *files; + struct mm_struct *mm; +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *blkcg_css; +#endif + const struct cred *creds; + struct nsproxy *nsproxy; + struct fs_struct *fs; + unsigned long fsize; +}; struct io_uring_task { /* submission side */ -- cgit v1.2.3 From 1e6fa5216a0e59ef02e8b6b40d553238a3b81d49 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 08:46:24 -0600 Subject: io_uring: COW io_identity on mismatch If the io_identity doesn't completely match the task, then create a copy of it and use that. The existing copy remains valid until the last user of it has gone away. This also changes the personality lookup to be indexed by io_identity, instead of creds directly. Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 352aa6bbd36b..342cc574d5c0 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,6 +15,7 @@ struct io_identity { struct nsproxy *nsproxy; struct fs_struct *fs; unsigned long fsize; + refcount_t count; }; struct io_uring_task { -- cgit v1.2.3 From 5c3462cfd123b341c9d3c947c1a2bab373f1697f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 09:02:33 -0600 Subject: io_uring: store io_identity in io_uring_task This is, by definition, a per-task structure. So store it in the task context, instead of doing carrying it in each io_kiocb. We're being a bit inefficient if members have changed, as that requires an alloc and copy of a new io_identity struct. The next patch will fix that up. Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 342cc574d5c0..bd3346194bca 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -24,6 +24,7 @@ struct io_uring_task { struct wait_queue_head wait; struct file *last; atomic_long_t req_issue; + struct io_identity identity; /* completion side */ bool in_idle ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 500a373d731ac506612db12631ec21295c1ff360 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 17:38:03 -0600 Subject: io_uring: assign new io_identity for task if members have changed This avoids doing a copy for each new async IO, if some parts of the io_identity has changed. We avoid reference counting for the normal fast path of nothing ever changing. 
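As an editorial illustration (not part of the patch), the copy-on-mismatch pattern described above boils down to roughly the following sketch; the function name is hypothetical and the handling of member references is simplified:

static struct io_identity *identity_cow_sketch(struct io_identity *old)
{
	struct io_identity *new;

	new = kmalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	*new = *old;			/* start from the current identity */
	refcount_set(&new->count, 1);	/* the caller becomes the first user */
	/* grabbing references on the copied members (creds, mm, ...) and
	 * dropping the reference on the old identity are omitted here */
	return new;
}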
Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index bd3346194bca..607d14f61132 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -24,7 +24,8 @@ struct io_uring_task { struct wait_queue_head wait; struct file *last; atomic_long_t req_issue; - struct io_identity identity; + struct io_identity __identity; + struct io_identity *identity; /* completion side */ bool in_idle ____cacheline_aligned_in_smp; -- cgit v1.2.3 From d8a6df10aac9f2e4d5f30aff3129d552d2984ce7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 16:24:45 -0600 Subject: io_uring: use percpu counters to track inflight requests Even though we place the req_issued and req_complete in separate cachelines, there's considerable overhead in doing the atomics particularly on the completion side. Get rid of having the two counters, and just use a percpu_counter for this. That's what it was made for, after all. This considerably reduces the overhead in __io_free_req(). Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 607d14f61132..28939820b6b0 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -23,13 +23,10 @@ struct io_uring_task { struct xarray xa; struct wait_queue_head wait; struct file *last; - atomic_long_t req_issue; + struct percpu_counter inflight; struct io_identity __identity; struct io_identity *identity; - - /* completion side */ - bool in_idle ____cacheline_aligned_in_smp; - atomic_long_t req_complete; + bool in_idle; }; #if defined(CONFIG_IO_URING) -- cgit v1.2.3 From 4ea33a976bfe79293965d0815e1914e4b6e58967 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Oct 2020 13:46:44 -0600 Subject: io-wq: inherit audit loginuid and sessionid Make sure the async io-wq workers inherit the loginuid and sessionid from the original task, and restore them to unset once we're done with the async work item. While at it, disable the ability for kernel threads to write to their own loginuid. Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 28939820b6b0..868364cea3b7 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,6 +15,10 @@ struct io_identity { struct nsproxy *nsproxy; struct fs_struct *fs; unsigned long fsize; +#ifdef CONFIG_AUDIT + kuid_t loginuid; + unsigned int sessionid; +#endif refcount_t count; }; -- cgit v1.2.3 From 3c532798ec96b6c2d77706f04ed1d8b566a805df Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 3 Oct 2020 10:49:22 -0600 Subject: tracehook: clear TIF_NOTIFY_RESUME in tracehook_notify_resume() All the callers currently do this, clean it up and move the clearing into tracehook_notify_resume() instead. 
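For illustration (not taken from the patch), the change to the calling convention looks roughly like this in an arch exit-to-usermode path; the surrounding loop is assumed, only the flag handling is shown:

/* Before: every caller cleared the flag itself. */
if (test_thread_flag(TIF_NOTIFY_RESUME)) {
	clear_thread_flag(TIF_NOTIFY_RESUME);
	tracehook_notify_resume(regs);
}

/* After: the helper clears it, callers only test and call. */
if (test_thread_flag(TIF_NOTIFY_RESUME))
	tracehook_notify_resume(regs);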
Reviewed-by: Oleg Nesterov Reviewed-by: Thomas Gleixner Signed-off-by: Jens Axboe --- include/linux/tracehook.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 36fb3bbed6b2..b480e1a07ed8 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -178,9 +178,9 @@ static inline void set_notify_resume(struct task_struct *task) */ static inline void tracehook_notify_resume(struct pt_regs *regs) { + clear_thread_flag(TIF_NOTIFY_RESUME); /* - * The caller just cleared TIF_NOTIFY_RESUME. This barrier - * pairs with task_work_add()->set_notify_resume() after + * This barrier pairs with task_work_add()->set_notify_resume() after * hlist_add_head(task->task_works); */ smp_mb__after_atomic(); -- cgit v1.2.3 From 91989c707884ecc7cd537281ab1a4b8fb7219da3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Oct 2020 09:02:26 -0600 Subject: task_work: cleanup notification modes A previous commit changed the notification mode from true/false to an int, allowing notify-no, notify-yes, or signal-notify. This was backwards compatible in the sense that any existing true/false user would translate to either 0 (on notification sent) or 1, the latter which mapped to TWA_RESUME. TWA_SIGNAL was assigned a value of 2. Clean this up properly, and define a proper enum for the notification mode. Now we have: - TWA_NONE. This is 0, same as before the original change, meaning no notification requested. - TWA_RESUME. This is 1, same as before the original change, meaning that we use TIF_NOTIFY_RESUME. - TWA_SIGNAL. This uses TIF_SIGPENDING/JOBCTL_TASK_WORK for the notification. Clean up all the callers, switching their 0/1/false/true to using the appropriate TWA_* mode for notifications. Fixes: e91b48162332 ("task_work: teach task_work_add() to do signal_wake_up()") Reviewed-by: Thomas Gleixner Signed-off-by: Jens Axboe --- include/linux/task_work.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 0fb93aafa478..0d848a1e9e62 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -13,9 +13,14 @@ init_task_work(struct callback_head *twork, task_work_func_t func) twork->func = func; } -#define TWA_RESUME 1 -#define TWA_SIGNAL 2 -int task_work_add(struct task_struct *task, struct callback_head *twork, int); +enum task_work_notify_mode { + TWA_NONE, + TWA_RESUME, + TWA_SIGNAL, +}; + +int task_work_add(struct task_struct *task, struct callback_head *twork, + enum task_work_notify_mode mode); struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t); void task_work_run(void); -- cgit v1.2.3 From 15a119e09344a346384ec05c781c126a29b18235 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Wed, 23 Sep 2020 01:12:31 +0800 Subject: jbd2: fix the comment of struct jbd2_journal_handle the struct name was modified long ago, but the comment still use struct handle_s. 
Signed-off-by: Hui Su Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200922171231.GA53120@rlk Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 08f904943ab2..a1ef05412acf 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -452,8 +452,8 @@ struct jbd2_inode { struct jbd2_revoke_table_s; /** - * struct handle_s - The handle_s type is the concrete type associated with - * handle_t. + * struct jbd2_journal_handle - The jbd2_journal_handle type is the concrete + * type associated with handle_t. * @h_transaction: Which compound transaction is this update a part of? * @h_journal: Which journal handle belongs to - used iff h_reserved set. * @h_rsv_handle: Handle reserved for finishing the logical operation. -- cgit v1.2.3 From aa3c0c61f62d682259e3e66cdc01846290f9cd6c Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Mon, 5 Oct 2020 21:48:38 -0300 Subject: jbd2: introduce/export functions jbd2_journal_submit|finish_inode_data_buffers() Export functions that implement the current behavior done for an inode in journal_submit|finish_inode_data_buffers(). No functional change. Signed-off-by: Mauricio Faria de Oliveira Suggested-by: Jan Kara Reviewed-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201006004841.600488-2-mfo@canonical.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a1ef05412acf..8b7b06066bc2 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1421,6 +1421,10 @@ extern int jbd2_journal_inode_ranged_write(handle_t *handle, extern int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); +extern int jbd2_journal_submit_inode_data_buffers( + struct jbd2_inode *jinode); +extern int jbd2_journal_finish_inode_data_buffers( + struct jbd2_inode *jinode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); -- cgit v1.2.3 From 342af94ec6c02aa478fe2adcd41b950e154b03ba Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Mon, 5 Oct 2020 21:48:39 -0300 Subject: jbd2, ext4, ocfs2: introduce/use journal callbacks j_submit|finish_inode_data_buffers() Introduce journal callbacks to allow different behaviors for an inode in journal_submit|finish_inode_data_buffers(). The existing users of the current behavior (ext4, ocfs2) are adapted to use the previously exported functions that implement the current behavior. Users are callers of jbd2_journal_inode_ranged_write|wait(), which adds the inode to the transaction's inode list with the JI_WRITE|WAIT_DATA flags. Only ext4 and ocfs2 in-tree. Both CONFIG_EXT4_FS and CONFIG_OCSFS2_FS select CONFIG_JBD2, which builds fs/jbd2/commit.c and journal.c that define and export the functions, so we can call directly in ext4/ocfs2. 
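To sketch the intended wiring (illustrative, not taken from the patch; myfs_submit_inode_data_buffers() is a hypothetical filesystem callback), a filesystem either keeps the exported defaults or overrides them on its journal_t:

/* default ordered-data behaviour, using the helpers exported earlier */
journal->j_submit_inode_data_buffers = jbd2_journal_submit_inode_data_buffers;
journal->j_finish_inode_data_buffers = jbd2_journal_finish_inode_data_buffers;

/* a filesystem with special needs could plug in its own handler instead */
journal->j_submit_inode_data_buffers = myfs_submit_inode_data_buffers;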
Signed-off-by: Mauricio Faria de Oliveira Suggested-by: Jan Kara Reviewed-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201006004841.600488-3-mfo@canonical.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 8b7b06066bc2..04afa6dcd60d 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -629,7 +629,9 @@ struct transaction_s struct journal_head *t_shadow_list; /* - * List of inodes whose data we've modified in data=ordered mode. + * List of inodes associated with the transaction; e.g., ext4 uses + * this to track inodes in data=ordered and data=journal mode that + * need special handling on transaction commit; also used by ocfs2. * [j_list_lock] */ struct list_head t_inode_list; @@ -1111,6 +1113,27 @@ struct journal_s void (*j_commit_callback)(journal_t *, transaction_t *); + /** + * @j_submit_inode_data_buffers: + * + * This function is called for all inodes associated with the + * committing transaction marked with JI_WRITE_DATA flag + * before we start to write out the transaction to the journal. + */ + int (*j_submit_inode_data_buffers) + (struct jbd2_inode *); + + /** + * @j_finish_inode_data_buffers: + * + * This function is called for all inodes associated with the + * committing transaction marked with JI_WAIT_DATA flag + * after we have written the transaction to the journal + * but before we write out the commit block. + */ + int (*j_finish_inode_data_buffers) + (struct jbd2_inode *); + /* * Journal statistics */ -- cgit v1.2.3 From b87d8cefe43c7f22e8aa13919c1dfa2b4b4b4e01 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 17 Oct 2020 16:13:40 -0700 Subject: mm, memcg: rework remote charging API to support nesting Currently the remote memcg charging API consists of two functions: memalloc_use_memcg() and memalloc_unuse_memcg(), which set and clear the memcg value, which overwrites the memcg of the current task. memalloc_use_memcg(target_memcg); <...> memalloc_unuse_memcg(); It works perfectly for allocations performed from a normal context, however an attempt to call it from an interrupt context or just nest two remote charging blocks will lead to an incorrect accounting. On exit from the inner block the active memcg will be cleared instead of being restored. memalloc_use_memcg(target_memcg); memalloc_use_memcg(target_memcg_2); <...> memalloc_unuse_memcg(); Error: allocation here are charged to the memcg of the current process instead of target_memcg. memalloc_unuse_memcg(); This patch extends the remote charging API by switching to a single function: struct mem_cgroup *set_active_memcg(struct mem_cgroup *memcg), which sets the new value and returns the old one. So a remote charging block will look like: old_memcg = set_active_memcg(target_memcg); <...> set_active_memcg(old_memcg); This patch is heavily based on the patch by Johannes Weiner, which can be found here: https://lkml.org/lkml/2020/5/28/806 . 
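For readers, a minimal nesting-safe usage sketch of the new API (the memcg and allocation size are placeholders):

/* sketch: charge a __GFP_ACCOUNT allocation to target_memcg */
static void *alloc_charged(struct mem_cgroup *target_memcg, size_t size)
{
	struct mem_cgroup *old_memcg;
	void *ptr;

	old_memcg = set_active_memcg(target_memcg);
	ptr = kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);
	set_active_memcg(old_memcg);	/* restore whatever scope was active */
	return ptr;
}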
Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Dan Schatzberg Link: https://lkml.kernel.org/r/20200821212056.3769116-1-guro@fb.com Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 981e34cb1409..1a80fb128e74 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -280,38 +280,28 @@ static inline void memalloc_nocma_restore(unsigned int flags) #ifdef CONFIG_MEMCG /** - * memalloc_use_memcg - Starts the remote memcg charging scope. + * set_active_memcg - Starts the remote memcg charging scope. * @memcg: memcg to charge. * * This function marks the beginning of the remote memcg charging scope. All the * __GFP_ACCOUNT allocations till the end of the scope will be charged to the * given memcg. * - * NOTE: This function is not nesting safe. + * NOTE: This function can nest. Users must save the return value and + * reset the previous value after their own charging scope is over. */ -static inline void memalloc_use_memcg(struct mem_cgroup *memcg) +static inline struct mem_cgroup * +set_active_memcg(struct mem_cgroup *memcg) { - WARN_ON_ONCE(current->active_memcg); + struct mem_cgroup *old = current->active_memcg; current->active_memcg = memcg; -} - -/** - * memalloc_unuse_memcg - Ends the remote memcg charging scope. - * - * This function marks the end of the remote memcg charging scope started by - * memalloc_use_memcg(). - */ -static inline void memalloc_unuse_memcg(void) -{ - current->active_memcg = NULL; + return old; } #else -static inline void memalloc_use_memcg(struct mem_cgroup *memcg) -{ -} - -static inline void memalloc_unuse_memcg(void) +static inline struct mem_cgroup * +set_active_memcg(struct mem_cgroup *memcg) { + return NULL; } #endif -- cgit v1.2.3 From 37d5985c003daab138a72dd4af9853b396d91c26 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 17 Oct 2020 16:13:50 -0700 Subject: mm: kmem: prepare remote memcg charging infra for interrupt contexts Remote memcg charging API uses current->active_memcg to store the currently active memory cgroup, which overwrites the memory cgroup of the current process. It works well for normal contexts, but doesn't work for interrupt contexts: indeed, if an interrupt occurs during the execution of a section with an active memcg set, all allocations inside the interrupt will be charged to the active memcg set (given that we'll enable accounting for allocations from an interrupt context). But because the interrupt might have no relation to the active memcg set outside, it's obviously wrong from the accounting prospective. To resolve this problem, let's add a global percpu int_active_memcg variable, which will be used to store an active memory cgroup which will be used from interrupt contexts. set_active_memcg() will transparently use current->active_memcg or int_active_memcg depending on the context. To make the read part simple and transparent for the caller, let's introduce two new functions: - struct mem_cgroup *active_memcg(void), - struct mem_cgroup *get_active_memcg(void). They are returning the active memcg if it's set, hiding all implementation details: where to get it depending on the current context. 
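A rough sketch of the read side described above (the real helpers live in mm/memcontrol.c and may differ in detail):

static inline struct mem_cgroup *active_memcg(void)
{
	if (in_interrupt())
		return this_cpu_read(int_active_memcg);
	return current->active_memcg;
}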
Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200827225843.1270629-4-guro@fb.com Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 1a80fb128e74..d5ece7a9a403 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -279,6 +279,7 @@ static inline void memalloc_nocma_restore(unsigned int flags) #endif #ifdef CONFIG_MEMCG +DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); /** * set_active_memcg - Starts the remote memcg charging scope. * @memcg: memcg to charge. @@ -293,8 +294,16 @@ static inline void memalloc_nocma_restore(unsigned int flags) static inline struct mem_cgroup * set_active_memcg(struct mem_cgroup *memcg) { - struct mem_cgroup *old = current->active_memcg; - current->active_memcg = memcg; + struct mem_cgroup *old; + + if (in_interrupt()) { + old = this_cpu_read(int_active_memcg); + this_cpu_write(int_active_memcg, memcg); + } else { + old = current->active_memcg; + current->active_memcg = memcg; + } + return old; } #else -- cgit v1.2.3 From 4127c6504f25c4fcff52dc996efda2ef859dd661 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 17 Oct 2020 16:13:53 -0700 Subject: mm: kmem: enable kernel memcg accounting from interrupt contexts If a memcg to charge can be determined (using remote charging API), there are no reasons to exclude allocations made from an interrupt context from the accounting. Such allocations will pass even if the resulting memcg size will exceed the hard limit, but it will affect the application of the memory pressure and an inability to put the workload under the limit will eventually trigger the OOM. To use active_memcg() helper, memcg_kmem_bypass() is moved back to memcontrol.c. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200827225843.1270629-5-guro@fb.com Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6ef4a552e09d..e391e3c56de5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1531,18 +1531,6 @@ static inline bool memcg_kmem_enabled(void) return static_branch_likely(&memcg_kmem_enabled_key); } -static inline bool memcg_kmem_bypass(void) -{ - if (in_interrupt()) - return true; - - /* Allow remote memcg charging in kthread contexts. */ - if ((!current->mm || (current->flags & PF_KTHREAD)) && - !current->active_memcg) - return true; - return false; -} - static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { -- cgit v1.2.3 From 0726b01e70455f9900ab524117c7b520d197dc8c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 17 Oct 2020 16:14:50 -0700 Subject: mm/madvise: pass mm to do_madvise Patch series "introduce memory hinting API for external process", v9. Now, we have MADV_PAGEOUT and MADV_COLD as madvise hinting API. With that, application could give hints to kernel what memory range are preferred to be reclaimed. However, in some platform(e.g., Android), the information required to make the hinting decision is not known to the app. 
Instead, it is known to a centralized userspace daemon(e.g., ActivityManagerService), and that daemon must be able to initiate reclaim on its own without any app involvement. To solve the concern, this patch introduces a new syscall - process_madvise(2). Basically, it's the same as the madvise(2) syscall but it has some differences. 1. It needs pidfd of target process to provide the hint 2. It supports only MADV_{COLD|PAGEOUT|MERGEABLE|UNMERGEABLE} at this moment. Other hints in madvise will be opened when there are explicit requests from community to prevent unexpected bugs we couldn't support. 3. Only privileged processes can do something for another process's address space. For more detail of the new API, please see "mm: introduce external memory hinting API" description in this patchset. This patch (of 3): In upcoming patches, do_madvise will be called from external process context so we shouldn't assume "current" is always the hinted process's task_struct. Furthermore, we must not access mm_struct via task->mm, but obtain it via access_mm() once (in the following patch) and only use that pointer [1], so pass it to do_madvise() as well. Note the vma->vm_mm pointers are safe, so we can use them further down the call stack. And let's pass current->mm as the argument of do_madvise so it shouldn't change existing behavior but prepares the next patch to make review easy. [vbabka@suse.cz: changelog tweak] [minchan@kernel.org: use current->mm for io_uring] Link: http://lkml.kernel.org/r/20200423145215.72666-1-minchan@kernel.org [akpm@linux-foundation.org: fix it for upstream changes] [akpm@linux-foundation.org: whoops] [rdunlap@infradead.org: add missing includes] Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Cc: Jens Axboe Cc: Jann Horn Cc: Tim Murray Cc: Daniel Colascione Cc: Sandeep Patil Cc: Sonny Rao Cc: Brian Geffon Cc: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Cc: John Dias Cc: Joel Fernandes Cc: Alexander Duyck Cc: SeongJae Park Cc: Christian Brauner Cc: Kirill Tkhai Cc: Oleksandr Natalenko Cc: SeongJae Park Cc: Christian Brauner Cc: Florian Weimer Cc: Link: https://lkml.kernel.org/r/20200901000633.1920247-1-minchan@kernel.org Link: http://lkml.kernel.org/r/20200622192900.22757-1-minchan@kernel.org Link: http://lkml.kernel.org/r/20200302193630.68771-2-minchan@kernel.org Link: http://lkml.kernel.org/r/20200622192900.22757-2-minchan@kernel.org Link: https://lkml.kernel.org/r/20200901000633.1920247-2-minchan@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 61a2633fcc7f..ef360fe70aaf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2579,7 +2579,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); -extern int do_madvise(unsigned long start, size_t len_in, int behavior); +extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, -- cgit v1.2.3 From 1aa92cd31c1c032ddfed27e79d646bbb429e9b52 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 17 Oct 2020 16:14:54 -0700 Subject: pid: move pidfd_get_pid() to pid.c process_madvise syscall needs pidfd_get_pid function to translate pidfd to
pid so this patch moves the function to kernel/pid.c. Suggested-by: Alexander Duyck Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Reviewed-by: Suren Baghdasaryan Reviewed-by: Alexander Duyck Reviewed-by: Vlastimil Babka Acked-by: Christian Brauner Acked-by: David Rientjes Cc: Jens Axboe Cc: Jann Horn Cc: Brian Geffon Cc: Daniel Colascione Cc: Joel Fernandes Cc: Johannes Weiner Cc: John Dias Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleksandr Natalenko Cc: Sandeep Patil Cc: SeongJae Park Cc: SeongJae Park Cc: Shakeel Butt Cc: Sonny Rao Cc: Tim Murray Cc: Christian Brauner Cc: Florian Weimer Cc: Link: http://lkml.kernel.org/r/20200302193630.68771-5-minchan@kernel.org Link: http://lkml.kernel.org/r/20200622192900.22757-3-minchan@kernel.org Link: https://lkml.kernel.org/r/20200901000633.1920247-3-minchan@kernel.org Signed-off-by: Linus Torvalds --- include/linux/pid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index 176d6cf80e7c..fa10acb8d6a4 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -77,6 +77,7 @@ extern const struct file_operations pidfd_fops; struct file; extern struct pid *pidfd_pid(const struct file *file); +struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); static inline struct pid *get_pid(struct pid *pid) { -- cgit v1.2.3 From ecb8ac8b1f146915aa6b96449b66dd48984caacc Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 17 Oct 2020 16:14:59 -0700 Subject: mm/madvise: introduce process_madvise() syscall: an external memory hinting API There is a usecase where System Management Software (SMS) wants to give a memory hint like MADV_[COLD|PAGEOUT] to other processes and in the case of Android, it is the ActivityManagerService. The information required to make the reclaim decision is not known to the app. Instead, it is known to the centralized userspace daemon(ActivityManagerService), and that daemon must be able to initiate reclaim on its own without any app involvement. To solve the issue, this patch introduces a new syscall process_madvise(2). It uses pidfd of an external process to give the hint. It also supports vector address range because Android app has thousands of vmas due to zygote so it's a total waste of CPU and power if we should call the syscall one by one for each vma.(With testing 2000-vma syscall vs 1-vector syscall, it showed 15% performance improvement. I think it would be bigger in real practice because the testing ran in a very cache friendly environment). Another potential use case for the vector range is to amortize the cost of TLB shootdowns for multiple ranges when using MADV_DONTNEED; this could benefit users like TCP receive zerocopy and malloc implementations. In future, we could find more usecases for other advises so let's make it happen as an API since we introduce a new syscall at this moment. With that, existing madvise(2) user could replace it with process_madvise(2) with their own pid if they want to have batch address ranges support feature. Since it could affect another process's address range, only privileged process(PTRACE_MODE_ATTACH_FSCREDS) or something else (e.g., being the same UID) that gives it the right to ptrace the process could use it successfully. The flag argument is reserved for future use if we need to extend the API. I think supporting all hints madvise has/will support in process_madvise is rather risky.
Because we are not sure all hints make sense from an external process, and the implementation of a hint may rely on the caller being in the current context, it could be error-prone. Thus, I just limited hints as MADV_[COLD|PAGEOUT] in this patch. If someone wants to add other hints, we could hear the usecase and review it for each hint. It's safer for maintenance than introducing a buggy syscall that is hard to fix later. So finally, the API is as follows, ssize_t process_madvise(int pidfd, const struct iovec *iovec, unsigned long vlen, int advice, unsigned int flags); DESCRIPTION The process_madvise() system call is used to give advice or directions to the kernel about the address ranges from an external process as well as the local process. It provides the advice to the address ranges of the process described by iovec and vlen. The goal of such advice is to improve system or application performance. The pidfd selects the process referred to by the PID file descriptor specified in pidfd. (See pidfd_open(2) for further information) The pointer iovec points to an array of iovec structures, defined in <sys/uio.h> as: struct iovec { void *iov_base; /* starting address */ size_t iov_len; /* number of bytes to be advised */ }; The iovec describes address ranges beginning at address(iov_base) and with size length of bytes(iov_len). The vlen represents the number of elements in iovec. The advice is indicated in the advice argument, which is one of the following at this moment if the target process specified by pidfd is external. MADV_COLD MADV_PAGEOUT Permission to provide a hint to an external process is governed by a ptrace access mode PTRACE_MODE_ATTACH_FSCREDS check; see ptrace(2). process_madvise() supports every advice madvise(2) has if the target process is in the same thread group as the calling process, so a user could use process_madvise(2) to extend existing madvise(2) to support vector address ranges. RETURN VALUE On success, process_madvise() returns the number of bytes advised. This return value may be less than the total number of requested bytes, if an error occurred. The caller should check the return value to determine whether a partial advice occurred. (A minimal userspace usage sketch follows the FAQ below.) FAQ: Q.1 - Why does any external entity have better knowledge? Quote from Sandeep "For Android, every application (including the special SystemServer) are forked from Zygote. The reason of course is to share as many libraries and classes between the two as possible to benefit from the preloading during boot. After applications start, (almost) all of the APIs end up calling into this SystemServer process over IPC (binder) and back to the application. In a fully running system, the SystemServer monitors every single process periodically to calculate their PSS / RSS and also decides which process is "important" to the user for interactivity. So, because of how these processes start _and_ the fact that the SystemServer is looping to monitor each process, it does tend to *know* which address range of the application is not used / useful. Besides, we can never rely on applications to clean things up themselves. We've had the "hey app1, the system is low on memory, please trim your memory usage down" notifications for a long time[1]. They rely on applications honoring the broadcasts and very few do. So, if we want to avoid the inevitable killing of the application and restarting it, some way to be able to tell the OS about unimportant memory in these applications will be useful.
- ssp Q.2 - How is the race (i.e., object validation) handled between an external process giving a hint and the target process acting on it? process_madvise operates on the target process's address space as it exists at the instant that process_madvise is called. If the target process can run between the time the process calling process_madvise inspects the target process address space and the time that process_madvise is actually called, process_madvise may operate on memory regions that the calling process does not expect. It's the responsibility of the process calling process_madvise to close this race condition. For example, the calling process can suspend the target process with ptrace, SIGSTOP, or the freezer cgroup so that it doesn't have an opportunity to change its own address space before process_madvise is called. Another option is to operate on memory regions that the caller knows a priori will be unchanged in the target process. Yet another option is to accept the race for certain process_madvise calls after reasoning that mistargeting will do no harm. The suggested API itself does not provide synchronization. The same also applies to other APIs like move_pages and process_vm_write. The race isn't really a problem though. Why is it so wrong to require that callers do their own synchronization in some manner? Nobody objects to write(2) merely because it's possible for two processes to open the same file and clobber each other's writes --- instead, we tell people to use flock or something. Think about mmap. It never guarantees newly allocated address space is still valid when the user tries to access it because other threads could unmap the memory right before. That's where we need synchronization by using another API or a design on the userspace side. It shouldn't be part of the API itself. If someone needs more fine-grained synchronization rather than process level, there were two ideas suggested - cookie[2] and anon-fd[3]. Both are applicable via the last reserved argument of the API, but I don't think it's necessary right now since we already have ways to prevent the race, so I don't want to add additional complexity with a more fine-grained optimization model. To make the API extensible, it reserves an unsigned long as the last argument so we could support it in the future if someone really needs it. Q.3 - Why doesn't ptrace work? Injecting a madvise into the target process using ptrace would not work for us because such injected madvise would have to be executed by the target process, which means that process would have to be runnable and that creates the risk of the abovementioned race and hinting a wrong VMA. Furthermore, we want to act on the hint in the caller's context, not the callee's, because the callee is usually limited in cpuset/cgroups or even a frozen state, so it can't act by itself quickly enough, which causes more thrashing/kill. It also doesn't work if the target process is ptraced (e.g., strace, debugger, minidump) because a process can have at most one ptracer.
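To make the calling convention concrete, here is a minimal userspace sketch (no libc wrapper is assumed, so the raw syscall is used; the pidfd is expected to come from pidfd_open(2), the address range is a placeholder, and the headers must be new enough to define __NR_process_madvise and MADV_PAGEOUT):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

/* Ask the kernel to reclaim one range of the process behind @pidfd. */
static long hint_pageout(int pidfd, void *addr, size_t len)
{
	struct iovec iov = { .iov_base = addr, .iov_len = len };

	return syscall(__NR_process_madvise, pidfd, &iov, 1UL, MADV_PAGEOUT, 0U);
}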
[1] https://developer.android.com/topic/performance/memory" [2] process_getinfo for getting the cookie which is updated whenever vma of process address layout are changed - Daniel Colascione - https://lore.kernel.org/lkml/20190520035254.57579-1-minchan@kernel.org/T/#m7694416fd179b2066a2c62b5b139b14e3894e224 [3] anonymous fd which is used for the object(i.e., address range) validation - Michal Hocko - https://lore.kernel.org/lkml/20200120112722.GY18451@dhcp22.suse.cz/ [minchan@kernel.org: fix process_madvise build break for arm64] Link: http://lkml.kernel.org/r/20200303145756.GA219683@google.com [minchan@kernel.org: fix build error for mips of process_madvise] Link: http://lkml.kernel.org/r/20200508052517.GA197378@google.com [akpm@linux-foundation.org: fix patch ordering issue] [akpm@linux-foundation.org: fix arm64 whoops] [minchan@kernel.org: make process_madvise() vlen arg have type size_t, per Florian] [akpm@linux-foundation.org: fix i386 build] [sfr@canb.auug.org.au: fix syscall numbering] Link: https://lkml.kernel.org/r/20200905142639.49fc3f1a@canb.auug.org.au [sfr@canb.auug.org.au: madvise.c needs compat.h] Link: https://lkml.kernel.org/r/20200908204547.285646b4@canb.auug.org.au [minchan@kernel.org: fix mips build] Link: https://lkml.kernel.org/r/20200909173655.GC2435453@google.com [yuehaibing@huawei.com: remove duplicate header which is included twice] Link: https://lkml.kernel.org/r/20200915121550.30584-1-yuehaibing@huawei.com [minchan@kernel.org: do not use helper functions for process_madvise] Link: https://lkml.kernel.org/r/20200921175539.GB387368@google.com [akpm@linux-foundation.org: pidfd_get_pid() gained an argument] [sfr@canb.auug.org.au: fix up for "iov_iter: transparently handle compat iovecs in import_iovec"] Link: https://lkml.kernel.org/r/20200928212542.468e1fef@canb.auug.org.au Signed-off-by: Minchan Kim Signed-off-by: YueHaibing Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Cc: Alexander Duyck Cc: Brian Geffon Cc: Christian Brauner Cc: Daniel Colascione Cc: Jann Horn Cc: Jens Axboe Cc: Joel Fernandes Cc: Johannes Weiner Cc: John Dias Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleksandr Natalenko Cc: Sandeep Patil Cc: SeongJae Park Cc: SeongJae Park Cc: Shakeel Butt Cc: Sonny Rao Cc: Tim Murray Cc: Christian Brauner Cc: Florian Weimer Cc: Link: http://lkml.kernel.org/r/20200302193630.68771-3-minchan@kernel.org Link: http://lkml.kernel.org/r/20200508183320.GA125527@google.com Link: http://lkml.kernel.org/r/20200622192900.22757-4-minchan@kernel.org Link: https://lkml.kernel.org/r/20200901000633.1920247-4-minchan@kernel.org Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 06db09875aa4..2eda7678fe1d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -879,6 +879,8 @@ asmlinkage long sys_munlockall(void); asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char __user * vec); asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); +asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); -- cgit v1.2.3 From b944afc9d64ddf1b6a152c23ff86bf26e1fd430c Mon Sep 17 00:00:00 
2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:06 -0700 Subject: mm: add a VM_MAP_PUT_PAGES flag for vmap Add a flag so that vmap takes ownership of the passed in page array. When vfree is called on such an allocation it will put one reference on each page, and free the page array itself. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Tvrtko Ursulin Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-3-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0221f852a7e1..b899681e3ff9 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -24,6 +24,7 @@ struct notifier_block; /* in notifier.h */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ +#define VM_MAP_PUT_PAGES 0x00000100 /* put pages and free array in vfree */ /* * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. -- cgit v1.2.3 From 3e9a9e256b1e1e6e8f19faf76fa9c37578ae35ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:10 -0700 Subject: mm: add a vmap_pfn function Add a proper helper to remap PFNs into kernel virtual space so that drivers don't have to abuse alloc_vm_area and open coded PTE manipulation for it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Tvrtko Ursulin Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-4-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b899681e3ff9..c77efeac2425 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -122,6 +122,7 @@ extern void vfree_atomic(const void *addr); extern void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot); +void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot); extern void vunmap(const void *addr); extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, -- cgit v1.2.3 From 301fa9f2ddf7fb248c188af292c9cc04f8283dff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 17 Oct 2020 16:15:39 -0700 Subject: mm: remove alloc_vm_area All users are gone now. 
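(Editorial aside: a driver that previously built such a mapping by hand can typically use the vmap_pfn() helper added above. A rough sketch, where the PFN array is assumed to come from the device or backend in question:)

static int map_backend_pfns_sketch(unsigned long *pfns, unsigned int count)
{
	void *vaddr = vmap_pfn(pfns, count, PAGE_KERNEL);

	if (!vaddr)
		return -ENOMEM;
	/* ... use the mapping ... */
	vunmap(vaddr);
	return 0;
}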
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Boris Ostrovsky Cc: Chris Wilson Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Matthew Auld Cc: "Matthew Wilcox (Oracle)" Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra Cc: Rodrigo Vivi Cc: Stefano Stabellini Cc: Tvrtko Ursulin Cc: Uladzislau Rezki (Sony) Link: https://lkml.kernel.org/r/20201002122204.1534411-12-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c77efeac2425..938eaf9517e2 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -169,6 +169,7 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller); +void free_vm_area(struct vm_struct *area); extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); @@ -204,10 +205,6 @@ static inline void set_vm_flush_reset_perms(void *addr) } #endif -/* Allocate/destroy a 'vmalloc' VM area. */ -extern struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes); -extern void free_vm_area(struct vm_struct *area); - /* for /dev/kmem */ extern long vread(char *buf, char *addr, unsigned long count); extern long vwrite(char *buf, char *addr, unsigned long count); -- cgit v1.2.3 From 695cebe58dcf3d9802cdfa9c327b5c7641a5914b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Oct 2020 10:41:07 +0200 Subject: dma-mapping: move more functions to dma-map-ops.h Due to a mismerge a bunch of prototypes that should have moved to dma-map-ops.h are still in dma-mapping.h, fix that up. Signed-off-by: Christoph Hellwig --- include/linux/dma-map-ops.h | 23 +++++++++++++++++++++++ include/linux/dma-mapping.h | 24 ------------------------ 2 files changed, 23 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 8029f7e04145..a5f89fc4d6df 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -203,6 +203,29 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, } #endif /* CONFIG_DMA_DECLARE_COHERENT */ +int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +struct page *dma_common_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); +void dma_common_free_pages(struct device *dev, size_t size, struct page *vaddr, + dma_addr_t dma_handle, enum dma_data_direction dir); + +struct page **dma_common_find_pages(void *cpu_addr); +void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, + const void *caller); +void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, + const void *caller); +void dma_common_free_remap(void *cpu_addr, size_t size); + +struct page *dma_alloc_from_pool(struct device *dev, size_t size, + void **cpu_addr, gfp_t flags, + bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)); +bool dma_free_from_pool(struct device *dev, void *start, size_t size); + #ifdef CONFIG_ARCH_HAS_DMA_COHERENCE_H #include #elif defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ diff 
--git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 3f029afdc9dc..956151052d45 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -389,30 +389,6 @@ static inline void dma_sync_sgtable_for_device(struct device *dev, #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0) #define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, 0) -extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs); -struct page *dma_common_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); -void dma_common_free_pages(struct device *dev, size_t size, struct page *vaddr, - dma_addr_t dma_handle, enum dma_data_direction dir); -struct page **dma_common_find_pages(void *cpu_addr); -void *dma_common_contiguous_remap(struct page *page, size_t size, - pgprot_t prot, const void *caller); - -void *dma_common_pages_remap(struct page **pages, size_t size, - pgprot_t prot, const void *caller); -void dma_common_free_remap(void *cpu_addr, size_t size); - -struct page *dma_alloc_from_pool(struct device *dev, size_t size, - void **cpu_addr, gfp_t flags, - bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)); -bool dma_free_from_pool(struct device *dev, void *start, size_t size); - -int -dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, - dma_addr_t dma_addr, size_t size, unsigned long attrs); - static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { -- cgit v1.2.3 From cb3a92da231bcf55c243d00fa619ee36281b0001 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Tue, 20 Oct 2020 05:22:56 -0400 Subject: block: remove unused members for io_context After removing blk-sq code, there is no user of nr_batch_requests and last_waited in kernel. Signed-off-by: Yufen Yu Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 1dcd9198beb7..0a9dc40b7be8 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -106,12 +106,6 @@ struct io_context { unsigned short ioprio; - /* - * For request batching - */ - int nr_batch_requests; /* Number of requests left in the batch */ - unsigned long last_waited; /* Time last woken after wait for request */ - struct radix_tree_root icq_tree; struct io_cq __rcu *icq_hint; struct hlist_head icq_list; -- cgit v1.2.3 From aa9c9b3f3f08cb0fda8a8139e6fb302c9a2e21ed Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Tue, 20 Oct 2020 17:00:27 +0200 Subject: PM: runtime: Fix typo in pm_runtime_set_active() helper comment This patch is to fix typo in the comment of helper pm_runtime_set_active(). Signed-off-by: Bean Huo [ rjw: Subject edit ] Signed-off-by: Rafael J. Wysocki --- include/linux/pm_runtime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 6245caa18034..18b02dcc168e 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -479,7 +479,7 @@ static inline int pm_runtime_set_active(struct device *dev) } /** - * pm_runtime_set_suspended - Set runtime PM status to "active". + * pm_runtime_set_suspended - Set runtime PM status to "suspended". * @dev: Target device. 
* * Set the runtime PM status of @dev to %RPM_SUSPENDED and ensure that -- cgit v1.2.3 From 0cfcd405e758ba1d277e58436fb32f06888c3e41 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Sun, 18 Oct 2020 23:42:49 -0400 Subject: NFSv4.2: Fix NFS4ERR_STALE error when doing inter server copy NFS_FS=y as dependency of CONFIG_NFSD_V4_2_INTER_SSC still have build errors and some configs with NFSD=m to get NFS4ERR_STALE error when doing inter server copy. Added ops table in nfs_common for knfsd to access NFS client modules. Fixes: 3ac3711adb88 ("NFSD: Fix NFS server build errors") Signed-off-by: Dai Ngo Signed-off-by: J. Bruce Fields --- include/linux/nfs_ssc.h | 67 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 include/linux/nfs_ssc.h (limited to 'include/linux') diff --git a/include/linux/nfs_ssc.h b/include/linux/nfs_ssc.h new file mode 100644 index 000000000000..f5ba0fbff72f --- /dev/null +++ b/include/linux/nfs_ssc.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * include/linux/nfs_ssc.h + * + * Author: Dai Ngo + * + * Copyright (c) 2020, Oracle and/or its affiliates. + */ + +#include + +extern struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl; + +/* + * NFS_V4 + */ +struct nfs4_ssc_client_ops { + struct file *(*sco_open)(struct vfsmount *ss_mnt, + struct nfs_fh *src_fh, nfs4_stateid *stateid); + void (*sco_close)(struct file *filep); +}; + +/* + * NFS_FS + */ +struct nfs_ssc_client_ops { + void (*sco_sb_deactive)(struct super_block *sb); +}; + +struct nfs_ssc_client_ops_tbl { + const struct nfs4_ssc_client_ops *ssc_nfs4_ops; + const struct nfs_ssc_client_ops *ssc_nfs_ops; +}; + +extern void nfs42_ssc_register_ops(void); +extern void nfs42_ssc_unregister_ops(void); + +extern void nfs42_ssc_register(const struct nfs4_ssc_client_ops *ops); +extern void nfs42_ssc_unregister(const struct nfs4_ssc_client_ops *ops); + +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +static inline struct file *nfs42_ssc_open(struct vfsmount *ss_mnt, + struct nfs_fh *src_fh, nfs4_stateid *stateid) +{ + if (nfs_ssc_client_tbl.ssc_nfs4_ops) + return (*nfs_ssc_client_tbl.ssc_nfs4_ops->sco_open)(ss_mnt, src_fh, stateid); + return ERR_PTR(-EIO); +} + +static inline void nfs42_ssc_close(struct file *filep) +{ + if (nfs_ssc_client_tbl.ssc_nfs4_ops) + (*nfs_ssc_client_tbl.ssc_nfs4_ops->sco_close)(filep); +} +#endif + +/* + * NFS_FS + */ +extern void nfs_ssc_register(const struct nfs_ssc_client_ops *ops); +extern void nfs_ssc_unregister(const struct nfs_ssc_client_ops *ops); + +static inline void nfs_do_sb_deactive(struct super_block *sb) +{ + if (nfs_ssc_client_tbl.ssc_nfs_ops) + (*nfs_ssc_client_tbl.ssc_nfs_ops->sco_sb_deactive)(sb); +} -- cgit v1.2.3 From 0afa15e1a5294754066343cad24af5ec8edae96d Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Thu, 10 Sep 2020 10:53:49 +0200 Subject: virtio: let arch advertise guest's memory access restrictions An architecture may restrict host access to guest memory, e.g. IBM s390 Secure Execution or AMD SEV. Provide a new Kconfig entry the architecture can select, CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS, when it provides the arch_has_restricted_virtio_memory_access callback to advertise to VIRTIO common code when the architecture restricts memory access from the host. The common code can then fail the probe for any device where VIRTIO_F_ACCESS_PLATFORM is required, but not set. 
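As a sketch of the check described above (the exact code lives in the virtio core and may differ in detail):

static int features_ok_sketch(struct virtio_device *dev)
{
	if (arch_has_restricted_virtio_memory_access() &&
	    !virtio_has_feature(dev, VIRTIO_F_ACCESS_PLATFORM)) {
		dev_warn(&dev->dev, "device must offer VIRTIO_F_ACCESS_PLATFORM\n");
		return -ENODEV;
	}
	return 0;
}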
Signed-off-by: Pierre Morel Reviewed-by: Cornelia Huck Reviewed-by: Halil Pasic Link: https://lore.kernel.org/r/1599728030-17085-2-git-send-email-pmorel@linux.ibm.com Signed-off-by: Michael S. Tsirkin Acked-by: Christian Borntraeger --- include/linux/virtio_config.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8fe857e27ef3..3f697c8c8205 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -540,4 +540,14 @@ static inline void virtio_cwrite64(struct virtio_device *vdev, virtio_cread_le((vdev), structname, member, ptr); \ _r; \ }) + +#ifdef CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS +int arch_has_restricted_virtio_memory_access(void); +#else +static inline int arch_has_restricted_virtio_memory_access(void) +{ + return 0; +} +#endif /* CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS */ + #endif /* _LINUX_VIRTIO_CONFIG_H */ -- cgit v1.2.3 From 9e9eb226b91225fc199bbafc06f3cd70bfce0100 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 14 Oct 2020 11:26:46 -0700 Subject: KVM: Cache as_id in kvm_memory_slot Cache the address space ID just like the slot ID. It will be used in order to fill in the dirty ring entries. Suggested-by: Paolo Bonzini Suggested-by: Sean Christopherson Reviewed-by: Sean Christopherson Signed-off-by: Peter Xu Message-Id: <20201014182700.2888246-7-bgardon@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 05e3c2fb3ef7..c6f45687ba89 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -346,6 +346,7 @@ struct kvm_memory_slot { unsigned long userspace_addr; u32 flags; short id; + u16 as_id; }; static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) -- cgit v1.2.3 From ba452c9e996d8a4c347b32805f91abb70de5de7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 20 Oct 2020 23:25:56 +0200 Subject: bpf: Fix bpf_redirect_neigh helper api to support supplying nexthop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on the discussion in [0], update the bpf_redirect_neigh() helper to accept an optional parameter specifying the nexthop information. This makes it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without incurring a duplicate FIB lookup - since the FIB lookup helper will return the nexthop information even if no neighbour is present, this can simply be passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns BPF_FIB_LKUP_RET_NO_NEIGH. Thus fix & extend it before helper API is frozen. 
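For illustration, a rough sketch of how a tc BPF program might combine the two helpers after this change (IPv4 only, packet parsing and most error handling omitted; it assumes the uapi mirror struct bpf_redir_neigh of the kernel-side bpf_nh_params added below, plus the usual libbpf helper declarations):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#ifndef AF_INET
#define AF_INET 2
#endif

SEC("tc")
int example_redirect(struct __sk_buff *skb)
{
	struct bpf_fib_lookup fib = {};
	struct bpf_redir_neigh nh = {};
	long ret;

	fib.family  = AF_INET;
	fib.ifindex = skb->ingress_ifindex;
	/* fill fib.ipv4_src/ipv4_dst etc. from the parsed packet here */

	ret = bpf_fib_lookup(skb, &fib, sizeof(fib), 0);
	if (ret == BPF_FIB_LKUP_RET_SUCCESS) {
		/* normally rewrite the L2 header with fib.dmac/smac first */
		return bpf_redirect(fib.ifindex, 0);
	}
	if (ret == BPF_FIB_LKUP_RET_NO_NEIGH) {
		/* reuse the nexthop from the FIB result, no second lookup */
		nh.nh_family = AF_INET;
		nh.ipv4_nh   = fib.ipv4_dst;
		return bpf_redirect_neigh(fib.ifindex, &nh, sizeof(nh), 0);
	}
	return TC_ACT_OK;
}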
[0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/ Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: David Ahern Link: https://lore.kernel.org/bpf/160322915615.32199.1187570224032024535.stgit@toke.dk --- include/linux/filter.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 20fc24c9779a..72d62cbc1578 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -607,12 +607,21 @@ struct bpf_skb_data_end { void *data_end; }; +struct bpf_nh_params { + u32 nh_family; + union { + u32 ipv4_nh; + struct in6_addr ipv6_nh; + }; +}; + struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; struct bpf_map *map; u32 kern_flags; + struct bpf_nh_params nh; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); -- cgit v1.2.3 From ebfe3c5183733f784264450a41646a482f964e5e Mon Sep 17 00:00:00 2001 From: Di Zhu Date: Wed, 21 Oct 2020 10:00:53 +0800 Subject: rtnetlink: fix data overflow in rtnl_calcit() The "ip addr show" command fails with an error when we have a physical network card with a large number of VFs. The return value of if_nlmsg_size() in rtnl_calcit() will exceed the range of the u16 data type when a network card has a large number of VFs, because rtnl_vfinfo_size() significantly increases the needed dump size as num_vfs grows. Eventually we get a wrong value of min_ifinfo_dump_size because of the overflow; this value decides the memory size needed by the netlink dump, so netlink_dump() will return -EMSGSIZE because not enough memory was allocated. So fix it by promoting the min_dump_alloc data type to u32 to avoid overflowing the whole netlink message size; this also aligns with the data type of struct netlink_callback{}.min_dump_alloc, which is assigned from the return value of rtnl_calcit(). Signed-off-by: Di Zhu Link: https://lore.kernel.org/r/20201021020053.1401-1-zhudi21@huawei.com Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 666cd0390699..9f118771e248 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -240,7 +240,7 @@ struct netlink_dump_control { int (*done)(struct netlink_callback *); void *data; struct module *module; - u16 min_dump_alloc; + u32 min_dump_alloc; }; int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, -- cgit v1.2.3 From 995a3ed67fc8c0e3301a770016fb66f1bbf15ec8 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 15 Oct 2020 13:37:54 -0700 Subject: ext4: add fast_commit feature and handling for extended mount options We are running out of mount option bits. Add handling for using s_mount_opt2. Add the ext4 and jbd2 fast commit feature flags and also add the ability to turn off the fast commit feature in Ext4.
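For orientation, the JBD2 incompat bit introduced below is normally consumed through the predicates generated by JBD2_FEATURE_INCOMPAT_FUNCS(); a hedged sketch of how a client filesystem might probe and enable it (illustrative only, not taken from this series):

#include <linux/errno.h>
#include <linux/jbd2.h>

/* Illustrative helper: turn the fast-commit incompat feature on if requested. */
static int example_enable_fast_commit(journal_t *journal, bool want_fc)
{
	if (!want_fc || jbd2_has_feature_fast_commit(journal))
		return 0;

	/* jbd2_journal_set_features() returns 1 on success, 0 on failure */
	if (!jbd2_journal_set_features(journal, 0, 0,
				       JBD2_FEATURE_INCOMPAT_FAST_COMMIT))
		return -EINVAL;

	return 0;
}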
Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201015203802.3597742-3-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 04afa6dcd60d..0685cc95e501 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -289,6 +289,7 @@ typedef struct journal_superblock_s #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 #define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 #define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x00000010 +#define JBD2_FEATURE_INCOMPAT_FAST_COMMIT 0x00000020 /* See "journal feature predicate functions" below */ @@ -299,7 +300,8 @@ typedef struct journal_superblock_s JBD2_FEATURE_INCOMPAT_64BIT | \ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ JBD2_FEATURE_INCOMPAT_CSUM_V2 | \ - JBD2_FEATURE_INCOMPAT_CSUM_V3) + JBD2_FEATURE_INCOMPAT_CSUM_V3 | \ + JBD2_FEATURE_INCOMPAT_FAST_COMMIT) #ifdef __KERNEL__ @@ -1263,6 +1265,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) JBD2_FEATURE_INCOMPAT_FUNCS(async_commit, ASYNC_COMMIT) JBD2_FEATURE_INCOMPAT_FUNCS(csum2, CSUM_V2) JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) +JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) /* * Journal flag definitions -- cgit v1.2.3 From 6866d7b3f2bb4f011041ba54c98b1584497fe2fd Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 15 Oct 2020 13:37:55 -0700 Subject: ext4 / jbd2: add fast commit initialization This patch adds fast commit area trackers in the journal_t structure. These are initialized via the jbd2_fc_init() routine that this patch adds. This patch also adds ext4/fast_commit.c and ext4/fast_commit.h files for fast commit code that will be added in subsequent patches in this series. Reported-by: kernel test robot Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201015203802.3597742-4-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 0685cc95e501..008629b4d615 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -918,6 +918,30 @@ struct journal_s */ unsigned long j_last; + /** + * @j_fc_first: + * + * The block number of the first fast commit block in the journal + * [j_state_lock]. + */ + unsigned long j_fc_first; + + /** + * @j_fc_off: + * + * Number of fast commit blocks currently allocated. + * [j_state_lock]. + */ + unsigned long j_fc_off; + + /** + * @j_fc_last: + * + * The block number one beyond the last fast commit block in the journal + * [j_state_lock]. + */ + unsigned long j_fc_last; + /** * @j_dev: Device where we store the journal. */ @@ -1068,6 +1092,12 @@ struct journal_s */ struct buffer_head **j_wbuf; + /** + * @j_fc_wbuf: Array of fast commit bhs for + * jbd2_journal_commit_transaction. + */ + struct buffer_head **j_fc_wbuf; + /** * @j_wbufsize: * @@ -1075,6 +1105,13 @@ struct journal_s */ int j_wbufsize; + /** + * @j_fc_wbufsize: + * + * Size of @j_fc_wbuf array. 
+ */ + int j_fc_wbufsize; + /** * @j_last_sync_writer: * @@ -1535,6 +1572,8 @@ void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); +/* Fast commit related APIs */ +int jbd2_fc_init(journal_t *journal, int num_fc_blks); /* * is_journal_abort * -- cgit v1.2.3 From ff780b91efe901b8eecd8114785abae5341820ad Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 15 Oct 2020 13:37:56 -0700 Subject: jbd2: add fast commit machinery This function adds the necessary APIs needed in the JBD2 layer for fast commits. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201015203802.3597742-5-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 008629b4d615..a009d9b9c620 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -861,6 +861,13 @@ struct journal_s */ wait_queue_head_t j_wait_reserved; + /** + * @j_fc_wait: + * + * Wait queue to wait for completion of async fast commits. + */ + wait_queue_head_t j_fc_wait; + /** + * @j_checkpoint_mutex: * @@ -1232,6 +1239,15 @@ struct journal_s */ struct lockdep_map j_trans_commit_map; #endif + + /** + * @j_fc_cleanup_callback: + * + * Clean-up after fast commit or full commit. JBD2 calls this function + * after every commit operation. + */ + void (*j_fc_cleanup_callback)(struct journal_s *journal, int); + }; #define jbd2_might_wait_for_commit(j) \ @@ -1316,6 +1332,8 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) #define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file * data write error in ordered * mode */ +#define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ +#define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ /* * Function declarations for the journaling transaction and buffer @@ -1574,6 +1592,15 @@ extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ int jbd2_fc_init(journal_t *journal, int num_fc_blks); +int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); +int jbd2_fc_end_commit(journal_t *journal); +int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid); +int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); +int jbd2_submit_inode_data(struct jbd2_inode *jinode); +int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); +int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); +int jbd2_fc_release_bufs(journal_t *journal); + /* * is_journal_abort * -- cgit v1.2.3 From 5b849b5f96b47d82b5a432d8b91a8ad260e1de46 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 15 Oct 2020 13:37:58 -0700 Subject: jbd2: fast commit recovery path This patch adds fast commit recovery support in JBD2.
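To make the commit-side API above concrete, here is a rough sketch of how a client filesystem might drive a fast commit with these hooks (illustrative only; real ext4 usage, block submission and most error handling are omitted, and the payload written into each block is entirely filesystem-defined):

#include <linux/jbd2.h>

static int example_do_fast_commit(journal_t *journal, tid_t tid)
{
	struct buffer_head *bh;
	int ret;

	ret = jbd2_fc_begin_commit(journal, tid);	/* start a fast commit for this tid */
	if (ret)
		return ret;

	ret = jbd2_fc_get_buf(journal, &bh);		/* grab one fast-commit block */
	if (ret)
		goto fallback;

	/*
	 * Fill bh->b_data with the filesystem-defined fast-commit payload and
	 * submit it for write-out (ext4 does this with submit_bh()); omitted here.
	 */

	ret = jbd2_fc_wait_bufs(journal, 1);		/* wait for the submitted block */
	if (ret)
		goto fallback;

	return jbd2_fc_end_commit(journal);

fallback:
	jbd2_fc_release_bufs(journal);
	return jbd2_fc_end_commit_fallback(journal, tid);
}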
Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201015203802.3597742-7-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a009d9b9c620..fb3d71ad6eea 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -751,6 +751,11 @@ jbd2_time_diff(unsigned long start, unsigned long end) #define JBD2_NR_BATCH 64 +enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; + +#define JBD2_FC_REPLAY_STOP 0 +#define JBD2_FC_REPLAY_CONTINUE 1 + /** * struct journal_s - The journal_s type is the concrete type associated with * journal_t. @@ -1248,6 +1253,21 @@ struct journal_s */ void (*j_fc_cleanup_callback)(struct journal_s *journal, int); + /* + * @j_fc_replay_callback: + * + * File-system specific function that performs replay of a fast + * commit. JBD2 calls this function for each fast commit block found in + * the journal. This function should return JBD2_FC_REPLAY_CONTINUE + * to indicate that the block was processed correctly and more fast + * commit replay should continue. Return value of JBD2_FC_REPLAY_STOP + * indicates the end of replay (no more blocks remaining). A negative + * return value indicates error. + */ + int (*j_fc_replay_callback)(struct journal_s *journal, + struct buffer_head *bh, + enum passtype pass, int off, + tid_t expected_commit_id); }; #define jbd2_might_wait_for_commit(j) \ -- cgit v1.2.3 From ee6e00c868221f5f7d0b6eb4e8379a148e26bc20 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Oct 2020 14:15:51 -0600 Subject: splice: change exported internal do_splice() helper to take kernel offset With the set_fs change, we can no longer rely on copy_{to,from}_user() accepting a kernel pointer, and it was bad form to do so anyway. Clean this up and change the internal helper that io_uring uses to deal with kernel pointers instead. This puts the offset copy in/out in __do_splice() instead, which just calls the same helper. Signed-off-by: Jens Axboe --- include/linux/splice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/splice.h b/include/linux/splice.h index 5c47013f708e..a55179fd60fc 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -78,8 +78,8 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *, struct pipe_buffer *); extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); -extern long do_splice(struct file *in, loff_t __user *off_in, - struct file *out, loff_t __user *off_out, +extern long do_splice(struct file *in, loff_t *off_in, + struct file *out, loff_t *off_out, size_t len, unsigned int flags); extern long do_tee(struct file *in, struct file *out, size_t len, -- cgit v1.2.3 From 879bc2d27904354b98ca295b6168718e045c4aa2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 19 Oct 2020 16:57:50 +0200 Subject: hil/parisc: Disable HIL driver when it gets stuck When starting a HP machine with HIL driver but without an HIL keyboard or HIL mouse attached, it may happen that data written to the HIL loop gets stuck (e.g. because the transaction queue is full). Usually one will then have to reboot the machine because all you see is an endless output of: Transaction add failed: transaction already queued? In the higher layers hp_sdc_enqueue_transaction() is called to queue up a HIL packet.
This function returns an error code, and this patch adds the necessary checks for this return code and disables the HIL driver if further packets can't be sent. Tested on a HP 730 and a HP 715/64 machine. Signed-off-by: Helge Deller Cc: --- include/linux/hil_mlc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hil_mlc.h b/include/linux/hil_mlc.h index 774f7d3b8f6a..369221fd5518 100644 --- a/include/linux/hil_mlc.h +++ b/include/linux/hil_mlc.h @@ -103,7 +103,7 @@ struct hilse_node { /* Methods for back-end drivers, e.g. hp_sdc_mlc */ typedef int (hil_mlc_cts) (hil_mlc *mlc); -typedef void (hil_mlc_out) (hil_mlc *mlc); +typedef int (hil_mlc_out) (hil_mlc *mlc); typedef int (hil_mlc_in) (hil_mlc *mlc, suseconds_t timeout); struct hil_mlc_devinfo { -- cgit v1.2.3 From a6a0b05da9f37ff56faa6b8351ed6e0b55032460 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:55 -0700 Subject: kvm: x86/mmu: Support dirty logging for the TDP MMU Dirty logging is a key feature of the KVM MMU and must be supported by the TDP MMU. Add support for both the write protection and PML dirty logging modes. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-16-bgardon@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c6f45687ba89..7f2e2a09ebbd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -798,6 +798,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); +void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 3f1b623a1be92103386bcab818e25885d6be9419 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Oct 2020 17:00:41 +0800 Subject: vdpa: introduce config op to get valid iova range This patch introduce a config op to get valid iova range from the vDPA device. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20201023090043.14430-2-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index eae0bfd87d91..30bc7a7223bb 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -52,6 +52,16 @@ struct vdpa_device { int nvqs; }; +/** + * vDPA IOVA range - the IOVA range support by the device + * @first: start of the IOVA range + * @last: end of the IOVA range + */ +struct vdpa_iova_range { + u64 first; + u64 last; +}; + /** * vDPA_config_ops - operations for configuring a vDPA device. 
* Note: vDPA device drivers are required to implement all of the @@ -151,6 +161,10 @@ struct vdpa_device { * @get_generation: Get device config generation (optional) * @vdev: vdpa device * Returns u32: device generation + * @get_iova_range: Get supported iova range (optional) + * @vdev: vdpa device + * Returns the iova range supported by + * the device. * @set_map: Set device memory mapping (optional) * Needed for device that using device * specific DMA translation (on-chip IOMMU) @@ -216,6 +230,7 @@ struct vdpa_config_ops { void (*set_config)(struct vdpa_device *vdev, unsigned int offset, const void *buf, unsigned int len); u32 (*get_generation)(struct vdpa_device *vdev); + struct vdpa_iova_range (*get_iova_range)(struct vdpa_device *vdev); /* DMA ops */ int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb); -- cgit v1.2.3 From c51f8f88d705e06bd696d7510aff22b33eb8e638 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Sun, 9 Aug 2020 06:57:44 +0000 Subject: random32: make prandom_u32() output unpredictable Non-cryptographic PRNGs may have great statistical properties, but are usually trivially predictable to someone who knows the algorithm, given a small sample of their output. An LFSR like prandom_u32() is particularly simple, even if the sample is widely scattered bits. It turns out the network stack uses prandom_u32() for some things like random port numbers which it would prefer are *not* trivially predictable. Predictability led to a practical DNS spoofing attack. Oops. This patch replaces the LFSR with a homebrew cryptographic PRNG based on the SipHash round function, which is in turn seeded with 128 bits of strong random key. (The authors of SipHash have *not* been consulted about this abuse of their algorithm.) Speed is prioritized over security; attacks are rare, while performance is always wanted. Replacing all callers of prandom_u32() is the quick fix. Whether to reinstate a weaker PRNG for uses which can tolerate it is an open question. Commit f227e3ec3b5c ("random32: update the net random state on interrupt and activity") was an earlier attempt at a solution. This patch replaces it. Reported-by: Amit Klein Cc: Willy Tarreau Cc: Eric Dumazet Cc: "Jason A. Donenfeld" Cc: Andy Lutomirski Cc: Kees Cook Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Linus Torvalds Cc: tytso@mit.edu Cc: Florian Westphal Cc: Marc Plumb Fixes: f227e3ec3b5c ("random32: update the net random state on interrupt and activity") Signed-off-by: George Spelvin Link: https://lore.kernel.org/netdev/20200808152628.GA27941@SDF.ORG/ [ willy: partial reversal of f227e3ec3b5c; moved SIPROUND definitions to prandom.h for later use; merged George's prandom_seed() proposal; inlined siprand_u32(); replaced the net_rand_state[] array with 4 members to fix a build issue; cosmetic cleanups to make checkpatch happy; fixed RANDOM32_SELFTEST build ] Signed-off-by: Willy Tarreau --- include/linux/prandom.h | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index aa16e6468f91..cc1e71334e53 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -16,12 +16,44 @@ void prandom_bytes(void *buf, size_t nbytes); void prandom_seed(u32 seed); void prandom_reseed_late(void); +#if BITS_PER_LONG == 64 +/* + * The core SipHash round function. Each line can be executed in + * parallel given enough CPU resources. 
+ */ +#define PRND_SIPROUND(v0, v1, v2, v3) ( \ + v0 += v1, v1 = rol64(v1, 13), v2 += v3, v3 = rol64(v3, 16), \ + v1 ^= v0, v0 = rol64(v0, 32), v3 ^= v2, \ + v0 += v3, v3 = rol64(v3, 21), v2 += v1, v1 = rol64(v1, 17), \ + v3 ^= v0, v1 ^= v2, v2 = rol64(v2, 32) \ +) + +#define PRND_K0 (0x736f6d6570736575 ^ 0x6c7967656e657261) +#define PRND_K1 (0x646f72616e646f6d ^ 0x7465646279746573) + +#elif BITS_PER_LONG == 32 +/* + * On 32-bit machines, we use HSipHash, a reduced-width version of SipHash. + * This is weaker, but 32-bit machines are not used for high-traffic + * applications, so there is less output for an attacker to analyze. + */ +#define PRND_SIPROUND(v0, v1, v2, v3) ( \ + v0 += v1, v1 = rol32(v1, 5), v2 += v3, v3 = rol32(v3, 8), \ + v1 ^= v0, v0 = rol32(v0, 16), v3 ^= v2, \ + v0 += v3, v3 = rol32(v3, 7), v2 += v1, v1 = rol32(v1, 13), \ + v3 ^= v0, v1 ^= v2, v2 = rol32(v2, 16) \ +) +#define PRND_K0 0x6c796765 +#define PRND_K1 0x74656462 + +#else +#error Unsupported BITS_PER_LONG +#endif + struct rnd_state { __u32 s1, s2, s3, s4; }; -DECLARE_PER_CPU(struct rnd_state, net_rand_state); - u32 prandom_u32_state(struct rnd_state *state); void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes); void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state); -- cgit v1.2.3 From 3744741adab6d9195551ce30e65e726c7a408421 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 10 Aug 2020 10:27:42 +0200 Subject: random32: add noise from network and scheduling activity With the removal of the interrupt perturbations in previous random32 change (random32: make prandom_u32() output unpredictable), the PRNG has become 100% deterministic again. While SipHash is expected to be way more robust against brute force than the previous Tausworthe LFSR, there's still the risk that whoever has even one temporary access to the PRNG's internal state is able to predict all subsequent draws till the next reseed (roughly every minute). This may happen through a side channel attack or any data leak. This patch restores the spirit of commit f227e3ec3b5c ("random32: update the net random state on interrupt and activity") in that it will perturb the internal PRNG's state using externally collected noise, except that it will not pick that noise from the random pool's bits nor upon interrupt, but will rather combine a few elements along the Tx path that are collectively hard to predict, such as dev, skb and txq pointers, packet length and jiffies values. These ones are combined using a single round of SipHash into a single long variable that is mixed with the net_rand_state upon each invocation. The operation was inlined because it produces very small and efficient code, typically 3 xor, 2 add and 2 rol. The performance was measured to be the same (even very slightly better) than before the switch to SipHash; on a 6-core 12-thread Core i7-8700k equipped with a 40G NIC (i40e), the connection rate dropped from 556k/s to 555k/s while the SYN cookie rate grew from 5.38 Mpps to 5.45 Mpps. Link: https://lore.kernel.org/netdev/20200808152628.GA27941@SDF.ORG/ Cc: George Spelvin Cc: Amit Klein Cc: Eric Dumazet Cc: "Jason A.
Donenfeld" Cc: Andy Lutomirski Cc: Kees Cook Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Linus Torvalds Cc: tytso@mit.edu Cc: Florian Westphal Cc: Marc Plumb Tested-by: Sedat Dilek Signed-off-by: Willy Tarreau --- include/linux/prandom.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index cc1e71334e53..bbf4b4ad61df 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -16,6 +16,12 @@ void prandom_bytes(void *buf, size_t nbytes); void prandom_seed(u32 seed); void prandom_reseed_late(void); +DECLARE_PER_CPU(unsigned long, net_rand_noise); + +#define PRANDOM_ADD_NOISE(a, b, c, d) \ + prandom_u32_add_noise((unsigned long)(a), (unsigned long)(b), \ + (unsigned long)(c), (unsigned long)(d)) + #if BITS_PER_LONG == 64 /* * The core SipHash round function. Each line can be executed in @@ -50,6 +56,18 @@ void prandom_reseed_late(void); #error Unsupported BITS_PER_LONG #endif +static inline void prandom_u32_add_noise(unsigned long a, unsigned long b, + unsigned long c, unsigned long d) +{ + /* + * This is not used cryptographically; it's just + * a convenient 4-word hash function. (3 xor, 2 add, 2 rol) + */ + a ^= raw_cpu_read(net_rand_noise); + PRND_SIPROUND(a, b, c, d); + raw_cpu_write(net_rand_noise, d); +} + struct rnd_state { __u32 s1, s2, s3, s4; }; @@ -99,6 +117,7 @@ static inline void prandom_seed_state(struct rnd_state *state, u64 seed) state->s2 = __seed(i, 8U); state->s3 = __seed(i, 16U); state->s4 = __seed(i, 128U); + PRANDOM_ADD_NOISE(state, i, 0, 0); } /* Pseudo random number generator from numerical recipes. */ -- cgit v1.2.3 From 23224e45004ed84c8466fd1e8e5860f541187029 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 23 Oct 2020 16:27:16 -0700 Subject: mm: remove kzfree() compatibility definition Commit 453431a54934 ("mm, treewide: rename kzfree() to kfree_sensitive()") renamed kzfree() to kfree_sensitive(), but it left a compatibility definition of kzfree() to avoid being too disruptive. Since then a few more instances of kzfree() have slipped in. Just get rid of them and remove the compatibility definition once and for all. Signed-off-by: Eric Biggers Signed-off-by: Linus Torvalds --- include/linux/slab.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 9e155cc83b8a..dd6897f62010 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -187,8 +187,6 @@ void kfree_sensitive(const void *); size_t __ksize(const void *); size_t ksize(const void *); -#define kzfree(x) kfree_sensitive(x) /* For backward compatibility */ - #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR void __check_heap_object(const void *ptr, unsigned long n, struct page *page, bool to_user); -- cgit v1.2.3 From 33def8498fdde180023444b08e12b72a9efed41d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 21 Oct 2020 19:36:07 -0700 Subject: treewide: Convert macro and uses of __section(foo) to __section("foo") Use a more generic form for __section that requires quotes to avoid complications with clang and gcc differences. Remove the quote operator # from compiler_attributes.h __section macro. Convert all unquoted __section(foo) uses to quoted __section("foo"). Also convert __attribute__((section("foo"))) uses to __section("foo") even if the __attribute__ has multiple list entry forms. 
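In short, after this change the macro no longer stringizes its argument, so section names are passed as string literals; a minimal before/after pair (the section name here is just an example):

/* before: the macro stringized its bare argument */
static int example_var __section(.data.example);

/* after: callers pass a quoted string, matching plain __attribute__ usage */
static int example_var __section(".data.example");

Both forms expand to __attribute__((__section__(".data.example"))).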
Conversion done using the script at: https://lore.kernel.org/lkml/75393e5ddc272dc7403de74d645e6c6e0f4e70eb.camel@perches.com/2-convert_section.pl Signed-off-by: Joe Perches Reviewed-by: Nick Desaulniers Reviewed-by: Miguel Ojeda Signed-off-by: Linus Torvalds --- include/linux/acpi.h | 4 ++-- include/linux/cache.h | 2 +- include/linux/compiler.h | 8 ++++---- include/linux/compiler_attributes.h | 2 +- include/linux/cpu.h | 2 +- include/linux/dynamic_debug.h | 2 +- include/linux/export.h | 2 +- include/linux/firmware.h | 2 +- include/linux/init.h | 34 +++++++++++++++++----------------- include/linux/init_task.h | 4 ++-- include/linux/interrupt.h | 4 ++-- include/linux/kernel.h | 6 +++--- include/linux/linkage.h | 4 ++-- include/linux/lsm_hooks.h | 4 ++-- include/linux/module.h | 2 +- include/linux/moduleparam.h | 4 ++-- include/linux/mtd/xip.h | 2 +- include/linux/objtool.h | 2 +- include/linux/of.h | 2 +- include/linux/percpu-defs.h | 2 +- include/linux/printk.h | 4 ++-- include/linux/rcupdate.h | 2 +- include/linux/sched/debug.h | 2 +- include/linux/serial_core.h | 2 +- include/linux/spinlock.h | 2 +- include/linux/syscalls.h | 6 +++--- include/linux/trace_events.h | 2 +- include/linux/tracepoint.h | 8 ++++---- 28 files changed, 61 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 143c6ffce2db..39263c6b52e1 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1153,7 +1153,7 @@ struct acpi_probe_entry { #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, \ valid, data, fn) \ static const struct acpi_probe_entry __acpi_probe_##name \ - __used __section(__##table##_acpi_probe_table) = { \ + __used __section("__" #table "_acpi_probe_table") = { \ .id = table_id, \ .type = subtable, \ .subtable_valid = valid, \ @@ -1164,7 +1164,7 @@ struct acpi_probe_entry { #define ACPI_DECLARE_SUBTABLE_PROBE_ENTRY(table, name, table_id, \ subtable, valid, data, fn) \ static const struct acpi_probe_entry __acpi_probe_##name \ - __used __section(__##table##_acpi_probe_table) = { \ + __used __section("__" #table "_acpi_probe_table") = { \ .id = table_id, \ .type = subtable, \ .subtable_valid = valid, \ diff --git a/include/linux/cache.h b/include/linux/cache.h index 1aa8009f6d06..d742c57eaee5 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -34,7 +34,7 @@ * but may get written to during init, so can't live in .rodata (via "const"). 
*/ #ifndef __ro_after_init -#define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) +#define __ro_after_init __section(".data..ro_after_init") #endif #ifndef ____cacheline_aligned diff --git a/include/linux/compiler.h b/include/linux/compiler.h index ac45f6d40d39..e512f5505dad 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -24,7 +24,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, long ______r; \ static struct ftrace_likely_data \ __aligned(4) \ - __section(_ftrace_annotated_branch) \ + __section("_ftrace_annotated_branch") \ ______f = { \ .data.func = __func__, \ .data.file = __FILE__, \ @@ -60,7 +60,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __trace_if_value(cond) ({ \ static struct ftrace_branch_data \ __aligned(4) \ - __section(_ftrace_branch) \ + __section("_ftrace_branch") \ __if_trace = { \ .func = __func__, \ .file = __FILE__, \ @@ -118,7 +118,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, ".popsection\n\t" /* Annotate a C jump table to allow objtool to follow the code flow */ -#define __annotate_jump_table __section(.rodata..c_jump_table) +#define __annotate_jump_table __section(".rodata..c_jump_table") #else #define annotate_reachable() @@ -206,7 +206,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * visible to the compiler. */ #define __ADDRESSABLE(sym) \ - static void * __section(.discard.addressable) __used \ + static void * __section(".discard.addressable") __used \ __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym; /** diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index ea7b756b1c8f..b2a3f4f641a7 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -254,7 +254,7 @@ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-section-variable-attribute * clang: https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate */ -#define __section(S) __attribute__((__section__(#S))) +#define __section(section) __attribute__((__section__(section))) /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-unused-function-attribute diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 8aa84c052fdf..d6428aaf67e7 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -173,7 +173,7 @@ void cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); /* Attach to any functions which should be considered cpuidle. */ -#define __cpuidle __attribute__((__section__(".cpuidle.text"))) +#define __cpuidle __section(".cpuidle.text") bool cpu_in_idle(unsigned long pc); diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 8aa0c7c2608c..a57ee75342cf 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -84,7 +84,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, #define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) \ static struct _ddebug __aligned(8) \ - __section(__dyndbg) name = { \ + __section("__dyndbg") name = { \ .modname = KBUILD_MODNAME, \ .function = __func__, \ .filename = __FILE__, \ diff --git a/include/linux/export.h b/include/linux/export.h index 8933ff6ad23a..fceb5e855717 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -130,7 +130,7 @@ struct kernel_symbol { * discarded in the final link stage. 
*/ #define __ksym_marker(sym) \ - static int __ksym_marker_##sym[0] __section(.discard.ksym) __used + static int __ksym_marker_##sym[0] __section(".discard.ksym") __used #define __EXPORT_SYMBOL(sym, sec, ns) \ __ksym_marker(sym); \ diff --git a/include/linux/firmware.h b/include/linux/firmware.h index c15acadc6cf4..84e346ae766e 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -36,7 +36,7 @@ struct builtin_fw { #define DECLARE_BUILTIN_FIRMWARE_SIZE(name, blob, size) \ static const struct builtin_fw __fw_concat(__builtin_fw,__COUNTER__) \ - __used __section(.builtin_fw) = { name, blob, size } + __used __section(".builtin_fw") = { name, blob, size } #if defined(CONFIG_FW_LOADER) || (defined(CONFIG_FW_LOADER_MODULE) && defined(MODULE)) int request_firmware(const struct firmware **fw, const char *name, diff --git a/include/linux/init.h b/include/linux/init.h index 212fc9e2f691..7b53cb3092ee 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -47,11 +47,11 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold __latent_entropy __noinitretpoline -#define __initdata __section(.init.data) -#define __initconst __section(.init.rodata) -#define __exitdata __section(.exit.data) -#define __exit_call __used __section(.exitcall.exit) +#define __init __section(".init.text") __cold __latent_entropy __noinitretpoline +#define __initdata __section(".init.data") +#define __initconst __section(".init.rodata") +#define __exitdata __section(".exit.data") +#define __exit_call __used __section(".exitcall.exit") /* * modpost check for section mismatches during the kernel build. @@ -70,9 +70,9 @@ * * The markers follow same syntax rules as __init / __initdata. 
*/ -#define __ref __section(.ref.text) noinline -#define __refdata __section(.ref.data) -#define __refconst __section(.ref.rodata) +#define __ref __section(".ref.text") noinline +#define __refdata __section(".ref.data") +#define __refconst __section(".ref.rodata") #ifdef MODULE #define __exitused @@ -80,16 +80,16 @@ #define __exitused __used #endif -#define __exit __section(.exit.text) __exitused __cold notrace +#define __exit __section(".exit.text") __exitused __cold notrace /* Used for MEMORY_HOTPLUG */ -#define __meminit __section(.meminit.text) __cold notrace \ +#define __meminit __section(".meminit.text") __cold notrace \ __latent_entropy -#define __meminitdata __section(.meminit.data) -#define __meminitconst __section(.meminit.rodata) -#define __memexit __section(.memexit.text) __exitused __cold notrace -#define __memexitdata __section(.memexit.data) -#define __memexitconst __section(.memexit.rodata) +#define __meminitdata __section(".meminit.data") +#define __meminitconst __section(".meminit.rodata") +#define __memexit __section(".memexit.text") __exitused __cold notrace +#define __memexitdata __section(".memexit.data") +#define __memexitconst __section(".memexit.rodata") /* For assembly routines */ #define __HEAD .section ".head.text","ax" @@ -254,7 +254,7 @@ struct obs_kernel_param { static const char __setup_str_##unique_id[] __initconst \ __aligned(1) = str; \ static struct obs_kernel_param __setup_##unique_id \ - __used __section(.init.setup) \ + __used __section(".init.setup") \ __attribute__((aligned((sizeof(long))))) \ = { __setup_str_##unique_id, fn, early } @@ -298,7 +298,7 @@ void __init parse_early_options(char *cmdline); #endif /* Data marked not to be saved by software suspend */ -#define __nosavedata __section(.data..nosave) +#define __nosavedata __section(".data..nosave") #ifdef MODULE #define __exit_p(x) x diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 2c620d7ac432..b2412b4d4c20 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -40,12 +40,12 @@ extern struct cred init_cred; /* Attach to the init_task data structure for proper alignment */ #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK -#define __init_task_data __attribute__((__section__(".data..init_task"))) +#define __init_task_data __section(".data..init_task") #else #define __init_task_data /**/ #endif /* Attach to the thread_info data structure for proper alignment */ -#define __init_thread_info __attribute__((__section__(".data..init_thread_info"))) +#define __init_thread_info __section(".data..init_thread_info") #endif diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f9aee3538461..ee8299eb1f52 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -792,9 +792,9 @@ extern int arch_early_irq_init(void); * We want to know which function is an entrypoint of a hardirq or a softirq. */ #ifndef __irq_entry -# define __irq_entry __attribute__((__section__(".irqentry.text"))) +# define __irq_entry __section(".irqentry.text") #endif -#define __softirq_entry __attribute__((__section__(".softirqentry.text"))) +#define __softirq_entry __section(".softirqentry.text") #endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index c629215fdad9..2f05e9128201 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -729,7 +729,7 @@ do { \ #define do_trace_printk(fmt, args...) 
\ do { \ static const char *trace_printk_fmt __used \ - __attribute__((section("__trace_printk_fmt"))) = \ + __section("__trace_printk_fmt") = \ __builtin_constant_p(fmt) ? fmt : NULL; \ \ __trace_printk_check_format(fmt, ##args); \ @@ -773,7 +773,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...); #define trace_puts(str) ({ \ static const char *trace_printk_fmt __used \ - __attribute__((section("__trace_printk_fmt"))) = \ + __section("__trace_printk_fmt") = \ __builtin_constant_p(str) ? str : NULL; \ \ if (__builtin_constant_p(str)) \ @@ -795,7 +795,7 @@ extern void trace_dump_stack(int skip); do { \ if (__builtin_constant_p(fmt)) { \ static const char *trace_printk_fmt __used \ - __attribute__((section("__trace_printk_fmt"))) = \ + __section("__trace_printk_fmt") = \ __builtin_constant_p(fmt) ? fmt : NULL; \ \ __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \ diff --git a/include/linux/linkage.h b/include/linux/linkage.h index d796ec20d114..5bcfbd972e97 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -36,8 +36,8 @@ __stringify(name)) #endif -#define __page_aligned_data __section(.data..page_aligned) __aligned(PAGE_SIZE) -#define __page_aligned_bss __section(.bss..page_aligned) __aligned(PAGE_SIZE) +#define __page_aligned_data __section(".data..page_aligned") __aligned(PAGE_SIZE) +#define __page_aligned_bss __section(".bss..page_aligned") __aligned(PAGE_SIZE) /* * For assembly routines. diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 8814e3d5952d..c503f7ab8afb 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1611,12 +1611,12 @@ extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; #define DEFINE_LSM(lsm) \ static struct lsm_info __lsm_##lsm \ - __used __section(.lsm_info.init) \ + __used __section(".lsm_info.init") \ __aligned(sizeof(unsigned long)) #define DEFINE_EARLY_LSM(lsm) \ static struct lsm_info __early_lsm_##lsm \ - __used __section(.early_lsm_info.init) \ + __used __section(".early_lsm_info.init") \ __aligned(sizeof(unsigned long)) #ifdef CONFIG_SECURITY_SELINUX_DISABLE diff --git a/include/linux/module.h b/include/linux/module.h index a29187f7c360..7ccdf87f376f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -278,7 +278,7 @@ extern typeof(name) __mod_##type##__##name##_device_table \ .version = _version, \ }; \ static const struct module_version_attribute \ - __used __attribute__ ((__section__ ("__modver"))) \ + __used __section("__modver") \ * __moduleparam_const __modver_attr = &___modver_attr #endif diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 47879fc7f75e..6388eb9734a5 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -22,7 +22,7 @@ #define __MODULE_INFO(tag, name, info) \ static const char __UNIQUE_ID(name)[] \ - __used __attribute__((section(".modinfo"), unused, aligned(1))) \ + __used __section(".modinfo") __attribute__((unused, aligned(1))) \ = __MODULE_INFO_PREFIX __stringify(tag) "=" info #define __MODULE_PARM_TYPE(name, _type) \ @@ -289,7 +289,7 @@ struct kparam_array static const char __param_str_##name[] = prefix #name; \ static struct kernel_param __moduleparam_const __param_##name \ __used \ - __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \ + __section("__param") __attribute__ ((unused, aligned(sizeof(void *)))) \ = { __param_str_##name, THIS_MODULE, ops, \ VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } } diff --git a/include/linux/mtd/xip.h 
b/include/linux/mtd/xip.h index a4e352b1dfe6..3cac9360588f 100644 --- a/include/linux/mtd/xip.h +++ b/include/linux/mtd/xip.h @@ -28,7 +28,7 @@ * those functions so they get relocated to ram. */ #ifdef CONFIG_XIP_KERNEL -#define __xipram noinline __attribute__ ((__section__ (".xiptext"))) +#define __xipram noinline __section(".xiptext") #endif /* diff --git a/include/linux/objtool.h b/include/linux/objtool.h index ab82c793c897..577f51436cf9 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -60,7 +60,7 @@ struct unwind_hint { * For more information, see tools/objtool/Documentation/stack-validation.txt. */ #define STACK_FRAME_NON_STANDARD(func) \ - static void __used __section(.discard.func_stack_frame_non_standard) \ + static void __used __section(".discard.func_stack_frame_non_standard") \ *__func_stack_frame_non_standard_##func = func #else /* __ASSEMBLY__ */ diff --git a/include/linux/of.h b/include/linux/of.h index 481ec0467285..5d51891cbf1a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1299,7 +1299,7 @@ static inline int of_get_available_child_count(const struct device_node *np) #if defined(CONFIG_OF) && !defined(MODULE) #define _OF_DECLARE(table, name, compat, fn, fn_type) \ static const struct of_device_id __of_table_##name \ - __used __section(__##table##_of_table) \ + __used __section("__" #table "_of_table") \ = { .compatible = compat, \ .data = (fn == (fn_type)NULL) ? fn : fn } #else diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 176bfbd52d97..dff7040f629a 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -51,7 +51,7 @@ PER_CPU_ATTRIBUTES #define __PCPU_DUMMY_ATTRS \ - __attribute__((section(".discard"), unused)) + __section(".discard") __attribute__((unused)) /* * s390 and alpha modules require percpu variables to be defined as diff --git a/include/linux/printk.h b/include/linux/printk.h index 78479633ccfc..fe7eb2351610 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -437,7 +437,7 @@ extern int kptr_restrict; #ifdef CONFIG_PRINTK #define printk_once(fmt, ...) \ ({ \ - static bool __section(.data.once) __print_once; \ + static bool __section(".data.once") __print_once; \ bool __ret_print_once = !__print_once; \ \ if (!__print_once) { \ @@ -448,7 +448,7 @@ extern int kptr_restrict; }) #define printk_deferred_once(fmt, ...) \ ({ \ - static bool __section(.data.once) __print_once; \ + static bool __section(".data.once") __print_once; \ bool __ret_print_once = !__print_once; \ \ if (!__print_once) { \ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 7c1ceff02852..6cdd0152c253 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -299,7 +299,7 @@ static inline int rcu_read_lock_any_held(void) */ #define RCU_LOCKDEP_WARN(c, s) \ do { \ - static bool __section(.data.unlikely) __warned; \ + static bool __section(".data.unlikely") __warned; \ if (debug_lockdep_rcu_enabled() && !__warned && (c)) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index 00c45a0e6abe..ae51f4529fc9 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -43,7 +43,7 @@ extern void proc_sched_set_task(struct task_struct *p); #endif /* Attach to any functions which should be ignored in wchan output. 
*/ -#define __sched __attribute__((__section__(".sched.text"))) +#define __sched __section(".sched.text") /* Linker adds these: start and end of __sched functions */ extern char __sched_text_start[], __sched_text_end[]; diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 8a99279a579b..ff63c2963359 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -373,7 +373,7 @@ extern const struct earlycon_id *__earlycon_table_end[]; .compatible = compat, \ .setup = fn }; \ static const struct earlycon_id EARLYCON_USED_OR_UNUSED \ - __section(__earlycon_table) \ + __section("__earlycon_table") \ * const __PASTE(__p, unique_id) = &unique_id #define OF_EARLYCON_DECLARE(_name, compat, fn) \ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index f2f12d746dbd..79897841a2cc 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -76,7 +76,7 @@ #define LOCK_SECTION_END \ ".previous\n\t" -#define __lockfunc __attribute__((section(".spinlock.text"))) +#define __lockfunc __section(".spinlock.text") /* * Pull the arch_spinlock_t and arch_rwlock_t definitions: diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 2eda7678fe1d..37bea07c12f2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -144,7 +144,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; .flags = TRACE_EVENT_FL_CAP_ANY, \ }; \ static struct trace_event_call __used \ - __attribute__((section("_ftrace_events"))) \ + __section("_ftrace_events") \ *__event_enter_##sname = &event_enter_##sname; #define SYSCALL_TRACE_EXIT_EVENT(sname) \ @@ -160,7 +160,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; .flags = TRACE_EVENT_FL_CAP_ANY, \ }; \ static struct trace_event_call __used \ - __attribute__((section("_ftrace_events"))) \ + __section("_ftrace_events") \ *__event_exit_##sname = &event_exit_##sname; #define SYSCALL_METADATA(sname, nb, ...) \ @@ -184,7 +184,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; .enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \ }; \ static struct syscall_metadata __used \ - __attribute__((section("__syscalls_metadata"))) \ + __section("__syscalls_metadata") \ *__p_syscall_meta_##sname = &__syscall_meta_##sname; static inline int is_syscall_trace_event(struct trace_event_call *tp_event) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 5c6943354049..d321fe5ad1a1 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -709,7 +709,7 @@ do { \ tracing_record_cmdline(current); \ if (__builtin_constant_p(fmt)) { \ static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))) = \ + __section("__trace_printk_fmt") = \ __builtin_constant_p(fmt) ? 
fmt : NULL; \ \ __trace_bprintk(ip, trace_printk_fmt, ##args); \ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 81fa0b2f271e..0f21617f1a66 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -119,7 +119,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __TRACEPOINT_ENTRY(name) \ static tracepoint_ptr_t __tracepoint_ptr_##name __used \ - __section(__tracepoints_ptrs) = &__tracepoint_##name + __section("__tracepoints_ptrs") = &__tracepoint_##name #endif #endif /* _LINUX_TRACEPOINT_H */ @@ -286,11 +286,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) */ #define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args) \ static const char __tpstrtab_##_name[] \ - __section(__tracepoints_strings) = #_name; \ + __section("__tracepoints_strings") = #_name; \ extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \ int __traceiter_##_name(void *__data, proto); \ struct tracepoint __tracepoint_##_name __used \ - __section(__tracepoints) = { \ + __section("__tracepoints") = { \ .name = __tpstrtab_##_name, \ .key = STATIC_KEY_INIT_FALSE, \ .static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \ @@ -396,7 +396,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static const char *___tp_str __tracepoint_string = str; \ ___tp_str; \ }) -#define __tracepoint_string __used __section(__tracepoint_str) +#define __tracepoint_string __used __section("__tracepoint_str") #else /* * tracepoint_string() is used to save the string address for userspace -- cgit v1.2.3 From cb47755725da7b90fecbb2aa82ac3b24a7adb89b Mon Sep 17 00:00:00 2001 From: Zeng Tao Date: Tue, 1 Sep 2020 17:30:13 +0800 Subject: time: Prevent undefined behaviour in timespec64_to_ns() UBSAN reports: Undefined behaviour in ./include/linux/time64.h:127:27 signed integer overflow: 17179869187 * 1000000000 cannot be represented in type 'long long int' Call Trace: timespec64_to_ns include/linux/time64.h:127 [inline] set_cpu_itimer+0x65c/0x880 kernel/time/itimer.c:180 do_setitimer+0x8e/0x740 kernel/time/itimer.c:245 __x64_sys_setitimer+0x14c/0x2c0 kernel/time/itimer.c:336 do_syscall_64+0xa1/0x540 arch/x86/entry/common.c:295 Commit bd40a175769d ("y2038: itimer: change implementation to timespec64") replaced the original conversion which handled time clamping correctly with timespec64_to_ns() which has no overflow protection. Fix it in timespec64_to_ns() as this is not necessarily limited to the usage in itimers. 
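As a quick sanity check of the report above: 17179869187 * NSEC_PER_SEC is roughly 1.7e19, well past S64_MAX (about 9.22e18), so the multiplication wraps. With the clamp added below, any tv_sec greater than or equal to KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC, about 9.2e9 seconds) now yields KTIME_MAX instead, e.g.:

	/* illustrative values taken from the UBSAN report above */
	struct timespec64 ts = { .tv_sec = 17179869187LL, .tv_nsec = 0 };
	s64 ns = timespec64_to_ns(&ts);	/* clamped to KTIME_MAX after this fix */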
[ tglx: Added comment and adjusted the fixes tag ] Fixes: 361a3bf00582 ("time64: Add time64.h header and define struct timespec64") Signed-off-by: Zeng Tao Signed-off-by: Thomas Gleixner Reviewed-by: Arnd Bergmann Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1598952616-6416-1-git-send-email-prime.zeng@hisilicon.com --- include/linux/time64.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index c9dcb3e5781f..5117cb5b5656 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -124,6 +124,10 @@ static inline bool timespec64_valid_settod(const struct timespec64 *ts) */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { + /* Prevent multiplication overflow */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + return KTIME_MAX; + return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } -- cgit v1.2.3 From fbdd0049d98d44914fc57d4b91f867f4996c787b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 26 Oct 2020 15:43:59 +0200 Subject: RDMA/mlx5: Fix devlink deadlock on net namespace deletion When a mlx5 core devlink instance is reloaded in a different net namespace, its associated IB device is deleted and recreated. Example sequence is: $ ip netns add foo $ devlink dev reload pci/0000:00:08.0 netns foo $ ip netns del foo The mlx5 IB device needs to attach and detach the netdevice to it through the netdev notifier chain during the load and unload sequence. Below is a call graph of the unload flow. cleanup_net() down_read(&pernet_ops_rwsem); <- first sem acquired ops_pre_exit_list() pre_exit() devlink_pernet_pre_exit() devlink_reload() mlx5_devlink_reload_down() mlx5_unload_one() [...] mlx5_ib_remove() mlx5_ib_unbind_slave_port() mlx5_remove_netdev_notifier() unregister_netdevice_notifier() down_write(&pernet_ops_rwsem);<- recursive lock Hence, when the net namespace is deleted, the mlx5 reload results in a deadlock. When the deadlock occurs, the devlink mutex is also held. This not only deadlocks the mlx5 device under reload, but all the processes which attempt to access unrelated devlink devices are deadlocked. Hence, fix this by having the mlx5 IB driver register for a per-net netdev notifier instead of the global one, which operates on the net namespace without holding the pernet_ops_rwsem. Fixes: 4383cfcc65e7 ("net/mlx5: Add devlink reload") Link: https://lore.kernel.org/r/20201026134359.23150-1-parav@nvidia.com Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/driver.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index add85094f9a5..0f23e1ed5e71 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1213,4 +1213,22 @@ static inline bool mlx5_is_roce_enabled(struct mlx5_core_dev *dev) return val.vbool; } +/** + * mlx5_core_net - Provide net namespace of the mlx5_core_dev + * @dev: mlx5 core device + * + * mlx5_core_net() returns the net namespace of mlx5 core device. + * This can be called only in below described limited context. + * (a) When a devlink instance for mlx5_core is registered and + * when devlink reload operation is disabled. + * or + * (b) during devlink reload reload_down() and reload_up callbacks + * where it is ensured that devlink instance's net namespace is + * stable.
+ */ +static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) +{ + return devlink_net(priv_to_devlink(dev)); +} + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 343a3e8bc635bd4c58d45a4fe67f9c3a78fbd191 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:20:50 +0100 Subject: bpf: Fix -Wshadow warnings There are thousands of warnings about one macro in a W=2 build: include/linux/filter.h:561:6: warning: declaration of 'ret' shadows a previous local [-Wshadow] Prefix all the locals in that macro with __ to avoid most of these warnings. Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201026162110.3710415-1-arnd@kernel.org --- include/linux/filter.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 72d62cbc1578..1b62397bd124 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -558,21 +558,21 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); #define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ - u32 ret; \ + u32 __ret; \ cant_migrate(); \ if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ - struct bpf_prog_stats *stats; \ - u64 start = sched_clock(); \ - ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - stats = this_cpu_ptr(prog->aux->stats); \ - u64_stats_update_begin(&stats->syncp); \ - stats->cnt++; \ - stats->nsecs += sched_clock() - start; \ - u64_stats_update_end(&stats->syncp); \ + struct bpf_prog_stats *__stats; \ + u64 __start = sched_clock(); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + __stats = this_cpu_ptr(prog->aux->stats); \ + u64_stats_update_begin(&__stats->syncp); \ + __stats->cnt++; \ + __stats->nsecs += sched_clock() - __start; \ + u64_stats_update_end(&__stats->syncp); \ } else { \ - ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ } \ - ret; }) + __ret; }) #define BPF_PROG_RUN(prog, ctx) \ __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func) -- cgit v1.2.3 From 1c534352f47fd83eb08075ac2474f707e74bf7f7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Oct 2020 17:35:19 +0200 Subject: cpufreq: Introduce CPUFREQ_NEED_UPDATE_LIMITS driver flag Generally, a cpufreq driver may need to update some internal upper and lower frequency boundaries on policy max and min changes, respectively, but currently this does not work if the target frequency does not change along with the policy limit. Namely, if the target frequency does not change along with the policy min or max, the "target_freq == policy->cur" check in __cpufreq_driver_target() prevents driver callbacks from being invoked and they do not even have a chance to update the corresponding internal boundary. This particularly affects the "powersave" and "performance" governors that always set the target frequency to one of the policy limits and it never changes when the other limit is updated. To allow cpufreq the drivers needing to update internal frequency boundaries on policy limits changes to avoid this issue, introduce a new driver flag, CPUFREQ_NEED_UPDATE_LIMITS, that (when set) will neutralize the check mentioned above. Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar --- include/linux/cpufreq.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index fa37b1c66443..038ed83aab41 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -298,7 +298,7 @@ __ATTR(_name, 0644, show_##_name, store_##_name) struct cpufreq_driver { char name[CPUFREQ_NAME_LEN]; - u8 flags; + u16 flags; void *driver_data; /* needed by all drivers */ @@ -422,6 +422,14 @@ struct cpufreq_driver { */ #define CPUFREQ_IS_COOLING_DEV BIT(7) +/* + * Set by drivers that need to update internale upper and lower boundaries along + * with the target frequency and so the core and governors should also invoke + * the diver if the target frequency does not change, but the policy min or max + * may have changed. + */ +#define CPUFREQ_NEED_UPDATE_LIMITS BIT(8) + int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); -- cgit v1.2.3 From 1de111b51b829bcf01d2e57971f8fd07a665fa3f Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 23 Oct 2020 08:47:50 -0700 Subject: KVM: arm64: ARM_SMCCC_ARCH_WORKAROUND_1 doesn't return SMCCC_RET_NOT_REQUIRED According to the SMCCC spec[1](7.5.2 Discovery) the ARM_SMCCC_ARCH_WORKAROUND_1 function id only returns 0, 1, and SMCCC_RET_NOT_SUPPORTED. 0 is "workaround required and safe to call this function" 1 is "workaround not required but safe to call this function" SMCCC_RET_NOT_SUPPORTED is "might be vulnerable or might not be, who knows, I give up!" SMCCC_RET_NOT_SUPPORTED might as well mean "workaround required, except calling this function may not work because it isn't implemented in some cases". Wonderful. We map this SMC call to 0 is SPECTRE_MITIGATED 1 is SPECTRE_UNAFFECTED SMCCC_RET_NOT_SUPPORTED is SPECTRE_VULNERABLE For KVM hypercalls (hvc), we've implemented this function id to return SMCCC_RET_NOT_SUPPORTED, 0, and SMCCC_RET_NOT_REQUIRED. One of those isn't supposed to be there. Per the code we call arm64_get_spectre_v2_state() to figure out what to return for this feature discovery call. 
0 is SPECTRE_MITIGATED SMCCC_RET_NOT_REQUIRED is SPECTRE_UNAFFECTED SMCCC_RET_NOT_SUPPORTED is SPECTRE_VULNERABLE Let's clean this up so that KVM tells the guest this mapping: 0 is SPECTRE_MITIGATED 1 is SPECTRE_UNAFFECTED SMCCC_RET_NOT_SUPPORTED is SPECTRE_VULNERABLE Note: SMCCC_RET_NOT_AFFECTED is 1 but isn't part of the SMCCC spec Fixes: c118bbb52743 ("arm64: KVM: Propagate full Spectre v2 workaround state to KVM guests") Signed-off-by: Stephen Boyd Acked-by: Marc Zyngier Acked-by: Will Deacon Cc: Andre Przywara Cc: Steven Price Cc: Marc Zyngier Cc: stable@vger.kernel.org Link: https://developer.arm.com/documentation/den0028/latest [1] Link: https://lore.kernel.org/r/20201023154751.1973872-1-swboyd@chromium.org Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 885c9ffc835c..f860645f6512 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -87,6 +87,8 @@ ARM_SMCCC_SMC_32, \ 0, 0x7fff) +#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 + /* Paravirtualised time calls (defined by ARM DEN0057A) */ #define ARM_SMCCC_HV_PV_TIME_FEATURES \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ -- cgit v1.2.3 From cbdc0f54560f94c2205ddbebb5464d65868af0d8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:33:18 +0200 Subject: usb: fix kernel-doc markups There is a common comment marked, instead, with kernel-doc notation. Also, some identifiers have different names between their prototypes and the kernel-doc markup. Signed-off-by: Mauro Carvalho Chehab Acked-by: Felipe Balbi Link: https://lore.kernel.org/r/0b964be3884def04fcd20ea5c12cb90d0014871c.1603469755.git.mchehab+huawei@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/composite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 2040696d75b6..a2d229ab63ba 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -437,7 +437,7 @@ static inline struct usb_composite_driver *to_cdriver( #define OS_STRING_IDX 0xEE /** - * struct usb_composite_device - represents one composite usb gadget + * struct usb_composite_dev - represents one composite usb gadget * @gadget: read-only, abstracts the gadget's usb peripheral controller * @req: used for control responses; buffer is pre-allocated * @os_desc_req: used for OS descriptors responses; buffer is pre-allocated -- cgit v1.2.3 From 13150bc5416f45234c955e5bed91623d178c6117 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 27 Oct 2020 16:11:32 +0100 Subject: module: use hidden visibility for weak symbol references Geert reports that commit be2881824ae9eb92 ("arm64/build: Assert for unwanted sections") results in build errors on arm64 for configurations that have CONFIG_MODULES disabled. The commit in question added ASSERT()s to the arm64 linker script to ensure that linker generated sections such as .got.plt etc are empty, but as it turns out, there are corner cases where the linker does emit content into those sections. More specifically, weak references to function symbols (which can remain unsatisfied, and can therefore not be emitted as relative references) will be emitted as GOT and PLT entries when linking the kernel in PIE mode (which is the case when CONFIG_RELOCATABLE is enabled, which is on by default). 
What happens is that code such as struct device *(*fn)(struct device *dev); struct device *iommu_device; fn = symbol_get(mdev_get_iommu_device); if (fn) { iommu_device = fn(dev); essentially gets converted into the following when CONFIG_MODULES is off: struct device *iommu_device; if (&mdev_get_iommu_device) { iommu_device = mdev_get_iommu_device(dev); where mdev_get_iommu_device is emitted as a weak symbol reference into the object file. The first reference is decorated with an ordinary ABS64 data relocation (which yields 0x0 if the reference remains unsatisfied). However, the indirect call is turned into a direct call covered by a R_AARCH64_CALL26 relocation, which is converted into a call via a PLT entry taking the target address from the associated GOT entry. Given that such GOT and PLT entries are unnecessary for fully linked binaries such as the kernel, let's give these weak symbol references hidden visibility, so that the linker knows that the weak reference via R_AARCH64_CALL26 can simply remain unsatisfied. Signed-off-by: Ard Biesheuvel Tested-by: Geert Uytterhoeven Reviewed-by: Fangrui Song Acked-by: Jessica Yu Cc: Jessica Yu Cc: Kees Cook Cc: Geert Uytterhoeven Cc: Nick Desaulniers Link: https://lore.kernel.org/r/20201027151132.14066-1-ardb@kernel.org Signed-off-by: Will Deacon --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 7ccdf87f376f..6264617bab4d 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -740,7 +740,7 @@ static inline bool within_module(unsigned long addr, const struct module *mod) } /* Get/put a kernel symbol (calls should be symmetric) */ -#define symbol_get(x) ({ extern typeof(x) x __attribute__((weak)); &(x); }) +#define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); }) #define symbol_put(x) do { } while (0) #define symbol_put_addr(x) do { } while (0) -- cgit v1.2.3 From 6a6223ec7779dfdabb9c2567bb42079bc300cf27 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:13 +0100 Subject: blk-mq: docs: add kernel-doc description for a new struct member As reported by kernel-doc: ./include/linux/blk-mq.h:267: warning: Function parameter or member 'active_queues_shared_sbitmap' not described in 'blk_mq_tag_set' There is now a new member for struct blk_mq_tag_set. Add a description for it, based on the commit that introduced it. Fixes: f1b49fdc1c64 ("blk-mq: Record active_queues_shared_sbitmap per tag_set for when using shared sbitmap") Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Jens Axboe Reviewed-by: John Garry Link: https://lore.kernel.org/r/8e513153b83eefc05e358f51f2632b592c3f6772.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b23eeca4d677..794b2a33a2c3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -235,6 +235,8 @@ enum hctx_type { * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. + * @active_queues_shared_sbitmap: + * number of active request queues per tag set. 
* @__bitmap_tags: A shared tags sbitmap, used over all hctx's * @__breserved_tags: * A shared reserved tags sbitmap, used over all hctx's -- cgit v1.2.3 From 89b422354409c275e898d26607201797cc05a932 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:17 +0100 Subject: mm: pagemap.h: fix two kernel-doc markups Changeset a8cf7f272b5a ("mm: add find_lock_head") renamed the index parameter, but forgot to update the kernel-doc markups accordingly. Fixes: a8cf7f272b5a ("mm: add find_lock_head") Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/dce89b296a4f5f9f8f798d5e76b6736c14a916ac.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/pagemap.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c77b7c31b2e4..e1e19c1f9ec9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -344,9 +344,9 @@ static inline struct page *find_get_page_flags(struct address_space *mapping, /** * find_lock_page - locate, pin and lock a pagecache page * @mapping: the address_space to search - * @offset: the page index + * @index: the page index * - * Looks up the page cache entry at @mapping & @offset. If there is a + * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, it is returned locked and with an increased * refcount. * @@ -363,9 +363,9 @@ static inline struct page *find_lock_page(struct address_space *mapping, /** * find_lock_head - Locate, pin and lock a pagecache page. * @mapping: The address_space to search. - * @offset: The page index. + * @index: The page index. * - * Looks up the page cache entry at @mapping & @offset. If there is a + * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, its head page is returned locked and with an increased * refcount. * -- cgit v1.2.3 From e86c6569c588a01f20e7554cc245f8fae831957b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:18 +0100 Subject: net: phy: remove kernel-doc duplication Sphinx 3 now checks for duplicated function declarations: .../Documentation/networking/kapi:143: ../include/linux/phy.h:163: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'unsigned int phy_supported_speeds (struct phy_device *phy, unsigned int *speeds, unsigned int size)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1034: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int phy_read_mmd (struct phy_device *phydev, int devad, u32 regnum)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1076: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int __phy_read_mmd (struct phy_device *phydev, int devad, u32 regnum)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1088: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int phy_write_mmd (struct phy_device *phydev, int devad, u32 regnum, u16 val)'. .../Documentation/networking/kapi:143: ../include/linux/phy.h:1100: WARNING: Duplicate C declaration, also defined in 'networking/kapi'. Declaration is 'int __phy_write_mmd (struct phy_device *phydev, int devad, u32 regnum, u16 val)'. It turns out that both the C and the H files have the same kernel-doc markup for the same functions.
Let's drop the kernel-doc markups in the header file, keeping the one closer to the code. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/75e9a357f9a716833d2094b04898754876365e68.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/phy.h | 40 +++++----------------------------------- 1 file changed, 5 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index eb3cb1a98b45..56563e5e0dc7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -147,16 +147,8 @@ typedef enum { PHY_INTERFACE_MODE_MAX, } phy_interface_t; -/** +/* * phy_supported_speeds - return all speeds currently supported by a PHY device - * @phy: The PHY device to return supported speeds of. - * @speeds: buffer to store supported speeds in. - * @size: size of speeds buffer. - * - * Description: Returns the number of supported speeds, and fills - * the speeds buffer with the supported speeds. If speeds buffer is - * too small to contain all currently supported speeds, will return as - * many speeds as can fit. */ unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int *speeds, @@ -1022,14 +1014,9 @@ static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, regnum, mask, set); } -/** +/* * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. - * @phydev: The phy_device struct - * @devad: The MMD to read from - * @regnum: The register on the MMD to read - * - * Same rules as for phy_read(); */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); @@ -1064,38 +1051,21 @@ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); __ret; \ }) -/** +/* * __phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. - * @phydev: The phy_device struct - * @devad: The MMD to read from - * @regnum: The register on the MMD to read - * - * Same rules as for __phy_read(); */ int __phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); -/** +/* * phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. - * @phydev: The phy_device struct - * @devad: The MMD to write to - * @regnum: The register on the MMD to read - * @val: value to write to @regnum - * - * Same rules as for phy_write(); */ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); -/** +/* * __phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY.
- * @phydev: The phy_device struct - * @devad: The MMD to write to - * @regnum: The register on the MMD to read - * @val: value to write to @regnum - * - * Same rules as for __phy_write(); */ int __phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); -- cgit v1.2.3 From cf38cc9f1e71151f22584c40357afaab6609384b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:23 +0100 Subject: locking/refcount: move kernel-doc markups to the proper place Changeset a435b9a14356 ("locking/refcount: Provide __refcount API to obtain the old value") added a set of functions starting with __ that have a new parameter, adding a series of new warnings: $ ./scripts/kernel-doc -none include/linux/refcount.h include/linux/refcount.h:169: warning: Function parameter or member 'oldp' not described in '__refcount_add_not_zero' include/linux/refcount.h:208: warning: Function parameter or member 'oldp' not described in '__refcount_add' include/linux/refcount.h:239: warning: Function parameter or member 'oldp' not described in '__refcount_inc_not_zero' include/linux/refcount.h:261: warning: Function parameter or member 'oldp' not described in '__refcount_inc' include/linux/refcount.h:291: warning: Function parameter or member 'oldp' not described in '__refcount_sub_and_test' include/linux/refcount.h:327: warning: Function parameter or member 'oldp' not described in '__refcount_dec_and_test' include/linux/refcount.h:347: warning: Function parameter or member 'oldp' not described in '__refcount_dec' The issue is that the kernel-doc markups are now misplaced, as they should be added just before the functions. So, move the kernel-doc markups to the proper places, in order to drop the warnings. It should be noticed that git show produces a crappy output, for this patch without "--patience" flag. Fixes: a435b9a14356 ("locking/refcount: Provide __refcount API to obtain the old value") Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/7985c31d1ace591bc5e1faa05c367f1295b78afd.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/refcount.h | 130 +++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 7fabb1af18e0..497990c69b0b 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -147,24 +147,6 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } -/** - * refcount_add_not_zero - add a value to a refcount unless it is 0 - * @i: the value to add to the refcount - * @r: the refcount - * - * Will saturate at REFCOUNT_SATURATED and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_inc(), or one of its variants, should instead be used to - * increment a reference count. 
- * - * Return: false if the passed refcount is 0, true otherwise - */ static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); @@ -183,17 +165,12 @@ static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, in return old; } -static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) -{ - return __refcount_add_not_zero(i, r, NULL); -} - /** - * refcount_add - add a value to a refcount + * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount * @r: the refcount * - * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. + * Will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency @@ -203,7 +180,14 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. + * + * Return: false if the passed refcount is 0, true otherwise */ +static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) +{ + return __refcount_add_not_zero(i, r, NULL); +} + static inline void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); @@ -217,11 +201,32 @@ static inline void __refcount_add(int i, refcount_t *r, int *oldp) refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } +/** + * refcount_add - add a value to a refcount + * @i: the value to add to the refcount + * @r: the refcount + * + * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + */ static inline void refcount_add(int i, refcount_t *r) { __refcount_add(i, r, NULL); } +static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero(1, r, oldp); +} + /** * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment @@ -235,14 +240,14 @@ static inline void refcount_add(int i, refcount_t *r) * * Return: true if the increment was successful, false otherwise */ -static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) +static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { - return __refcount_add_not_zero(1, r, oldp); + return __refcount_inc_not_zero(r, NULL); } -static inline __must_check bool refcount_inc_not_zero(refcount_t *r) +static inline void __refcount_inc(refcount_t *r, int *oldp) { - return __refcount_inc_not_zero(r, NULL); + __refcount_add(1, r, oldp); } /** @@ -257,14 +262,27 @@ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. 
*/ -static inline void __refcount_inc(refcount_t *r, int *oldp) +static inline void refcount_inc(refcount_t *r) { - __refcount_add(1, r, oldp); + __refcount_inc(r, NULL); } -static inline void refcount_inc(refcount_t *r) +static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { - __refcount_inc(r, NULL); + int old = atomic_fetch_sub_release(i, &r->refs); + + if (oldp) + *oldp = old; + + if (old == i) { + smp_acquire__after_ctrl_dep(); + return true; + } + + if (unlikely(old < 0 || old - i < 0)) + refcount_warn_saturate(r, REFCOUNT_SUB_UAF); + + return false; } /** @@ -287,27 +305,14 @@ static inline void refcount_inc(refcount_t *r) * * Return: true if the resulting refcount is 0, false otherwise */ -static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) +static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { - int old = atomic_fetch_sub_release(i, &r->refs); - - if (oldp) - *oldp = old; - - if (old == i) { - smp_acquire__after_ctrl_dep(); - return true; - } - - if (unlikely(old < 0 || old - i < 0)) - refcount_warn_saturate(r, REFCOUNT_SUB_UAF); - - return false; + return __refcount_sub_and_test(i, r, NULL); } -static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) +static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) { - return __refcount_sub_and_test(i, r, NULL); + return __refcount_sub_and_test(1, r, oldp); } /** @@ -323,26 +328,11 @@ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) * * Return: true if the resulting refcount is 0, false otherwise */ -static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) -{ - return __refcount_sub_and_test(1, r, oldp); -} - static inline __must_check bool refcount_dec_and_test(refcount_t *r) { return __refcount_dec_and_test(r, NULL); } -/** - * refcount_dec - decrement a refcount - * @r: the refcount - * - * Similar to atomic_dec(), it will WARN on underflow and fail to decrement - * when saturated at REFCOUNT_SATURATED. - * - * Provides release memory ordering, such that prior loads and stores are done - * before. - */ static inline void __refcount_dec(refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(1, &r->refs); @@ -354,6 +344,16 @@ static inline void __refcount_dec(refcount_t *r, int *oldp) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } +/** + * refcount_dec - decrement a refcount + * @r: the refcount + * + * Similar to atomic_dec(), it will WARN on underflow and fail to decrement + * when saturated at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before. + */ static inline void refcount_dec(refcount_t *r) { __refcount_dec(r, NULL); -- cgit v1.2.3 From e029c5f2798720b463e8df0e184a4d1036311b43 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Mon, 26 Oct 2020 21:49:14 -0700 Subject: ext4: make num of fast commit blocks configurable This patch reserves a field in the jbd2 superblock for number of fast commit blocks. When this value is non-zero, Ext4 uses this field to set the number of fast commit blocks. 
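As a rough, illustration-only sketch of how such a field can be consumed (the struct layout is abridged, and the names fc_blks_from_sb and JBD2_FC_BLKS_DEFAULT are invented for this example rather than taken from the actual ext4/jbd2 code), a setup path might read the big-endian value and fall back to a built-in default when it is zero:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>	/* ntohl()/htonl() stand in for be32_to_cpu()/cpu_to_be32() */

/* Abridged, hypothetical view of the superblock field described above. */
struct fc_sb_tail {
	uint32_t s_num_fc_blks;	/* big-endian on disk, 0 means "not set" */
};

#define JBD2_FC_BLKS_DEFAULT	256	/* hypothetical fallback value */

/* Use the on-disk value when it is non-zero, otherwise fall back to the default. */
static unsigned int fc_blks_from_sb(const struct fc_sb_tail *sb)
{
	uint32_t n = ntohl(sb->s_num_fc_blks);

	return n ? n : JBD2_FC_BLKS_DEFAULT;
}

int main(void)
{
	struct fc_sb_tail unset = { .s_num_fc_blks = 0 };
	struct fc_sb_tail set = { .s_num_fc_blks = htonl(128) };

	printf("unset -> %u fast commit blocks\n", fc_blks_from_sb(&unset));
	printf("set -> %u fast commit blocks\n", fc_blks_from_sb(&set));
	return 0;
}

Treating zero as "use the default" matters because the new field is carved out of bytes that used to be padding, so superblocks written before this change read back as zero there.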
Fixes: 6866d7b3f2bb ("ext4/jbd2: add fast commit initialization") Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201027044915.2553163-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index fb3d71ad6eea..7e88bbc16ffb 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -263,7 +263,10 @@ typedef struct journal_superblock_s /* 0x0050 */ __u8 s_checksum_type; /* checksum type */ __u8 s_padding2[3]; - __u32 s_padding[42]; +/* 0x0054 */ + __be32 s_num_fc_blks; /* Number of fast commit blocks */ +/* 0x0058 */ + __u32 s_padding[41]; __be32 s_checksum; /* crc32c(superblock) */ /* 0x0100 */ -- cgit v1.2.3 From ea4b01d9b81f5f381fc6832bc31046878a2d1a5d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 27 Oct 2020 10:51:27 +0100 Subject: jbd2: fix a kernel-doc markup The kernel-doc markup that documents _fc_replay_callback is missing an asterisk, causing this warning: ../include/linux/jbd2.h:1271: warning: Function parameter or member 'j_fc_replay_callback' not described in 'journal_s' When building the docs. Fixes: 609f928af48f ("jbd2: fast commit recovery path") Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/6055927ada2015b55b413cdd2670533bdc9a8da2.1603791716.git.mchehab+huawei@kernel.org Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 7e88bbc16ffb..1d5566af48ac 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1256,7 +1256,7 @@ struct journal_s */ void (*j_fc_cleanup_callback)(struct journal_s *journal, int); - /* + /** * @j_fc_replay_callback: * * File-system specific function that performs replay of a fast -- cgit v1.2.3 From 80ade22c06ca115b81dd168e99479c8e09843513 Mon Sep 17 00:00:00 2001 From: Sudeep Dutt Date: Tue, 27 Oct 2020 20:14:15 -0700 Subject: misc: mic: remove the MIC drivers This patch removes the MIC drivers from the kernel tree since the corresponding devices have been discontinued. Removing the dma and char-misc changes in one patch and merging via the char-misc tree is best to avoid any potential build breakage. Cc: Nikhil Rao Reviewed-by: Ashutosh Dixit Signed-off-by: Sudeep Dutt Acked-By: Vinod Koul Reviewed-by: Sherry Sun Link: https://lore.kernel.org/r/8c1443136563de34699d2c084df478181c205db4.1603854416.git.sudeep.dutt@intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/mic_bus.h | 100 ---- include/linux/scif.h | 1339 ----------------------------------------------- 2 files changed, 1439 deletions(-) delete mode 100644 include/linux/mic_bus.h delete mode 100644 include/linux/scif.h (limited to 'include/linux') diff --git a/include/linux/mic_bus.h b/include/linux/mic_bus.h deleted file mode 100644 index e99c789424e0..000000000000 --- a/include/linux/mic_bus.h +++ /dev/null @@ -1,100 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel MIC Bus driver. - * - * This implementation is very similar to the virtio bus driver - * implementation @ include/linux/virtio.h. - */ -#ifndef _MIC_BUS_H_ -#define _MIC_BUS_H_ -/* - * Everything a mbus driver needs to work with any particular mbus - * implementation. 
- */ -#include -#include - -struct mbus_device_id { - __u32 device; - __u32 vendor; -}; - -#define MBUS_DEV_DMA_HOST 2 -#define MBUS_DEV_DMA_MIC 3 -#define MBUS_DEV_ANY_ID 0xffffffff - -/** - * mbus_device - representation of a device using mbus - * @mmio_va: virtual address of mmio space - * @hw_ops: the hardware ops supported by this device. - * @id: the device type identification (used to match it with a driver). - * @dev: underlying device. - * be used to communicate with. - * @index: unique position on the mbus bus - */ -struct mbus_device { - void __iomem *mmio_va; - struct mbus_hw_ops *hw_ops; - struct mbus_device_id id; - struct device dev; - int index; -}; - -/** - * mbus_driver - operations for a mbus I/O driver - * @driver: underlying device driver (populate name and owner). - * @id_table: the ids serviced by this driver. - * @probe: the function to call when a device is found. Returns 0 or -errno. - * @remove: the function to call when a device is removed. - */ -struct mbus_driver { - struct device_driver driver; - const struct mbus_device_id *id_table; - int (*probe)(struct mbus_device *dev); - void (*scan)(struct mbus_device *dev); - void (*remove)(struct mbus_device *dev); -}; - -/** - * struct mic_irq - opaque pointer used as cookie - */ -struct mic_irq; - -/** - * mbus_hw_ops - Hardware operations for accessing a MIC device on the MIC bus. - */ -struct mbus_hw_ops { - struct mic_irq* (*request_threaded_irq)(struct mbus_device *mbdev, - irq_handler_t handler, - irq_handler_t thread_fn, - const char *name, void *data, - int intr_src); - void (*free_irq)(struct mbus_device *mbdev, - struct mic_irq *cookie, void *data); - void (*ack_interrupt)(struct mbus_device *mbdev, int num); -}; - -struct mbus_device * -mbus_register_device(struct device *pdev, int id, const struct dma_map_ops *dma_ops, - struct mbus_hw_ops *hw_ops, int index, - void __iomem *mmio_va); -void mbus_unregister_device(struct mbus_device *mbdev); - -int mbus_register_driver(struct mbus_driver *drv); -void mbus_unregister_driver(struct mbus_driver *drv); - -static inline struct mbus_device *dev_to_mbus(struct device *_dev) -{ - return container_of(_dev, struct mbus_device, dev); -} - -static inline struct mbus_driver *drv_to_mbus(struct device_driver *drv) -{ - return container_of(drv, struct mbus_driver, driver); -} - -#endif /* _MIC_BUS_H */ diff --git a/include/linux/scif.h b/include/linux/scif.h deleted file mode 100644 index 329e695b8fe5..000000000000 --- a/include/linux/scif.h +++ /dev/null @@ -1,1339 +0,0 @@ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2014 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Copyright(c) 2014 Intel Corporation. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Intel SCIF driver. - * - */ -#ifndef __SCIF_H__ -#define __SCIF_H__ - -#include -#include -#include -#include - -#define SCIF_ACCEPT_SYNC 1 -#define SCIF_SEND_BLOCK 1 -#define SCIF_RECV_BLOCK 1 - -enum { - SCIF_PROT_READ = (1 << 0), - SCIF_PROT_WRITE = (1 << 1) -}; - -enum { - SCIF_MAP_FIXED = 0x10, - SCIF_MAP_KERNEL = 0x20, -}; - -enum { - SCIF_FENCE_INIT_SELF = (1 << 0), - SCIF_FENCE_INIT_PEER = (1 << 1), - SCIF_SIGNAL_LOCAL = (1 << 4), - SCIF_SIGNAL_REMOTE = (1 << 5) -}; - -enum { - SCIF_RMA_USECPU = (1 << 0), - SCIF_RMA_USECACHE = (1 << 1), - SCIF_RMA_SYNC = (1 << 2), - SCIF_RMA_ORDERED = (1 << 3) -}; - -/* End of SCIF Admin Reserved Ports */ -#define SCIF_ADMIN_PORT_END 1024 - -/* End of SCIF Reserved Ports */ -#define SCIF_PORT_RSVD 1088 - -typedef struct scif_endpt *scif_epd_t; -typedef struct scif_pinned_pages *scif_pinned_pages_t; - -/** - * struct scif_range - SCIF registered range used in kernel mode - * @cookie: cookie used internally by SCIF - * @nr_pages: number of pages of PAGE_SIZE - * @prot_flags: R/W protection - * @phys_addr: Array of bus addresses - * @va: Array of kernel virtual addresses backed by the pages in the phys_addr - * array. The va is populated only when called on the host for a remote - * SCIF connection on MIC. This is required to support the use case of DMA - * between MIC and another device which is not a SCIF node e.g., an IB or - * ethernet NIC. - */ -struct scif_range { - void *cookie; - int nr_pages; - int prot_flags; - dma_addr_t *phys_addr; - void __iomem **va; -}; - -/** - * struct scif_pollepd - SCIF endpoint to be monitored via scif_poll - * @epd: SCIF endpoint - * @events: requested events - * @revents: returned events - */ -struct scif_pollepd { - scif_epd_t epd; - __poll_t events; - __poll_t revents; -}; - -/** - * scif_peer_dev - representation of a peer SCIF device - * - * Peer devices show up as PCIe devices for the mgmt node but not the cards. 
- * The mgmt node discovers all the cards on the PCIe bus and informs the other - * cards about their peers. Upon notification of a peer a node adds a peer - * device to the peer bus to maintain symmetry in the way devices are - * discovered across all nodes in the SCIF network. - * - * @dev: underlying device - * @dnode - The destination node which this device will communicate with. - */ -struct scif_peer_dev { - struct device dev; - u8 dnode; -}; - -/** - * scif_client - representation of a SCIF client - * @name: client name - * @probe - client method called when a peer device is registered - * @remove - client method called when a peer device is unregistered - * @si - subsys_interface used internally for implementing SCIF clients - */ -struct scif_client { - const char *name; - void (*probe)(struct scif_peer_dev *spdev); - void (*remove)(struct scif_peer_dev *spdev); - struct subsys_interface si; -}; - -#define SCIF_OPEN_FAILED ((scif_epd_t)-1) -#define SCIF_REGISTER_FAILED ((off_t)-1) -#define SCIF_MMAP_FAILED ((void *)-1) - -/** - * scif_open() - Create an endpoint - * - * Return: - * Upon successful completion, scif_open() returns an endpoint descriptor to - * be used in subsequent SCIF functions calls to refer to that endpoint; - * otherwise in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is - * returned and errno is set to indicate the error; in kernel mode a NULL - * scif_epd_t is returned. - * - * Errors: - * ENOMEM - Insufficient kernel memory was available - */ -scif_epd_t scif_open(void); - -/** - * scif_bind() - Bind an endpoint to a port - * @epd: endpoint descriptor - * @pn: port number - * - * scif_bind() binds endpoint epd to port pn, where pn is a port number on the - * local node. If pn is zero, a port number greater than or equal to - * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to - * exactly one local port. Ports less than 1024 when requested can only be bound - * by system (or root) processes or by processes executed by privileged users. - * - * Return: - * Upon successful completion, scif_bind() returns the port number to which epd - * is bound; otherwise in user mode -1 is returned and errno is set to - * indicate the error; in kernel mode the negative of one of the following - * errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * EINVAL - the endpoint or the port is already bound - * EISCONN - The endpoint is already connected - * ENOSPC - No port number available for assignment - * EACCES - The port requested is protected and the user is not the superuser - */ -int scif_bind(scif_epd_t epd, u16 pn); - -/** - * scif_listen() - Listen for connections on an endpoint - * @epd: endpoint descriptor - * @backlog: maximum pending connection requests - * - * scif_listen() marks the endpoint epd as a listening endpoint - that is, as - * an endpoint that will be used to accept incoming connection requests. Once - * so marked, the endpoint is said to be in the listening state and may not be - * used as the endpoint of a connection. - * - * The endpoint, epd, must have been bound to a port. - * - * The backlog argument defines the maximum length to which the queue of - * pending connections for epd may grow. If a connection request arrives when - * the queue is full, the client may receive an error with an indication that - * the connection was refused. 
- * - * Return: - * Upon successful completion, scif_listen() returns 0; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode the - * negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * EINVAL - the endpoint is not bound to a port - * EISCONN - The endpoint is already connected or listening - */ -int scif_listen(scif_epd_t epd, int backlog); - -/** - * scif_connect() - Initiate a connection on a port - * @epd: endpoint descriptor - * @dst: global id of port to which to connect - * - * The scif_connect() function requests the connection of endpoint epd to remote - * port dst. If the connection is successful, a peer endpoint, bound to dst, is - * created on node dst.node. On successful return, the connection is complete. - * - * If the endpoint epd has not already been bound to a port, scif_connect() - * will bind it to an unused local port. - * - * A connection is terminated when an endpoint of the connection is closed, - * either explicitly by scif_close(), or when a process that owns one of the - * endpoints of the connection is terminated. - * - * In user space, scif_connect() supports an asynchronous connection mode - * if the application has set the O_NONBLOCK flag on the endpoint via the - * fcntl() system call. Setting this flag will result in the calling process - * not to wait during scif_connect(). - * - * Return: - * Upon successful completion, scif_connect() returns the port ID to which the - * endpoint, epd, is bound; otherwise in user mode -1 is returned and errno is - * set to indicate the error; in kernel mode the negative of one of the - * following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNREFUSED - The destination was not listening for connections or refused - * the connection request - * EINVAL - dst.port is not a valid port ID - * EISCONN - The endpoint is already connected - * ENOMEM - No buffer space is available - * ENODEV - The destination node does not exist, or the node is lost or existed, - * but is not currently in the network since it may have crashed - * ENOSPC - No port number available for assignment - * EOPNOTSUPP - The endpoint is listening and cannot be connected - */ -int scif_connect(scif_epd_t epd, struct scif_port_id *dst); - -/** - * scif_accept() - Accept a connection on an endpoint - * @epd: endpoint descriptor - * @peer: global id of port to which connected - * @newepd: new connected endpoint descriptor - * @flags: flags - * - * The scif_accept() call extracts the first connection request from the queue - * of pending connections for the port on which epd is listening. scif_accept() - * creates a new endpoint, bound to the same port as epd, and allocates a new - * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new - * endpoint is connected to the endpoint through which the connection was - * requested. epd is unaffected by this call, and remains in the listening - * state. - * - * On successful return, peer holds the global port identifier (node id and - * local port number) of the port which requested the connection. - * - * A connection is terminated when an endpoint of the connection is closed, - * either explicitly by scif_close(), or when a process that owns one of the - * endpoints of the connection is terminated. - * - * The number of connections that can (subsequently) be accepted on epd is only - * limited by system resources (memory). 
- * - * The flags argument is formed by OR'ing together zero or more of the - * following values. - * SCIF_ACCEPT_SYNC - block until a connection request is presented. If - * SCIF_ACCEPT_SYNC is not in flags, and no pending - * connections are present on the queue, scif_accept() - * fails with an EAGAIN error - * - * In user mode, the select() and poll() functions can be used to determine - * when there is a connection request. In kernel mode, the scif_poll() - * function may be used for this purpose. A readable event will be delivered - * when a connection is requested. - * - * Return: - * Upon successful completion, scif_accept() returns 0; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode the - * negative of one of the following errors is returned. - * - * Errors: - * EAGAIN - SCIF_ACCEPT_SYNC is not set and no connections are present to be - * accepted or SCIF_ACCEPT_SYNC is not set and remote node failed to complete - * its connection request - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * EINTR - Interrupted function - * EINVAL - epd is not a listening endpoint, or flags is invalid, or peer is - * NULL, or newepd is NULL - * ENODEV - The requesting node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOENT - Secondary part of epd registration failed - */ -int scif_accept(scif_epd_t epd, struct scif_port_id *peer, scif_epd_t - *newepd, int flags); - -/** - * scif_close() - Close an endpoint - * @epd: endpoint descriptor - * - * scif_close() closes an endpoint and performs necessary teardown of - * facilities associated with that endpoint. - * - * If epd is a listening endpoint then it will no longer accept connection - * requests on the port to which it is bound. Any pending connection requests - * are rejected. - * - * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs - * which are in-process through epd or its peer endpoint will complete before - * scif_close() returns. Registered windows of the local and peer endpoints are - * released as if scif_unregister() was called against each window. - * - * Closing a SCIF endpoint does not affect local registered memory mapped by - * a SCIF endpoint on a remote node. The local memory remains mapped by the peer - * SCIF endpoint explicitly removed by calling munmap(..) by the peer. - * - * If the peer endpoint's receive queue is not empty at the time that epd is - * closed, then the peer endpoint can be passed as the endpoint parameter to - * scif_recv() until the receive queue is empty. - * - * epd is freed and may no longer be accessed. - * - * Return: - * Upon successful completion, scif_close() returns 0; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode the - * negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - */ -int scif_close(scif_epd_t epd); - -/** - * scif_send() - Send a message - * @epd: endpoint descriptor - * @msg: message buffer address - * @len: message length - * @flags: blocking mode flags - * - * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data - * are copied from memory starting at address msg. On successful execution the - * return value of scif_send() is the number of bytes that were sent, and is - * zero if no bytes were sent because len was zero. 
scif_send() may be called - * only when the endpoint is in a connected state. - * - * If a scif_send() call is non-blocking, then it sends only those bytes which - * can be sent without waiting, up to a maximum of len bytes. - * - * If a scif_send() call is blocking, then it normally returns after sending - * all len bytes. If a blocking call is interrupted or the connection is - * reset, the call is considered successful if some bytes were sent or len is - * zero, otherwise the call is considered unsuccessful. - * - * In user mode, the select() and poll() functions can be used to determine - * when the send queue is not full. In kernel mode, the scif_poll() function - * may be used for this purpose. - * - * It is recommended that scif_send()/scif_recv() only be used for short - * control-type message communication between SCIF endpoints. The SCIF RMA - * APIs are expected to provide better performance for transfer sizes of - * 1024 bytes or longer for the current MIC hardware and software - * implementation. - * - * scif_send() will block until the entire message is sent if SCIF_SEND_BLOCK - * is passed as the flags argument. - * - * Return: - * Upon successful completion, scif_send() returns the number of bytes sent; - * otherwise in user mode -1 is returned and errno is set to indicate the - * error; in kernel mode the negative of one of the following errors is - * returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - flags is invalid, or len is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOTCONN - The endpoint is not connected - */ -int scif_send(scif_epd_t epd, void *msg, int len, int flags); - -/** - * scif_recv() - Receive a message - * @epd: endpoint descriptor - * @msg: message buffer address - * @len: message buffer length - * @flags: blocking mode flags - * - * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of - * data are copied to memory starting at address msg. On successful execution - * the return value of scif_recv() is the number of bytes that were received, - * and is zero if no bytes were received because len was zero. scif_recv() may - * be called only when the endpoint is in a connected state. - * - * If a scif_recv() call is non-blocking, then it receives only those bytes - * which can be received without waiting, up to a maximum of len bytes. - * - * If a scif_recv() call is blocking, then it normally returns after receiving - * all len bytes. If the blocking call was interrupted due to a disconnection, - * subsequent calls to scif_recv() will copy all bytes received upto the point - * of disconnection. - * - * In user mode, the select() and poll() functions can be used to determine - * when data is available to be received. In kernel mode, the scif_poll() - * function may be used for this purpose. - * - * It is recommended that scif_send()/scif_recv() only be used for short - * control-type message communication between SCIF endpoints. The SCIF RMA - * APIs are expected to provide better performance for transfer sizes of - * 1024 bytes or longer for the current MIC hardware and software - * implementation. - * - * scif_recv() will block until the entire message is received if - * SCIF_RECV_BLOCK is passed as the flags argument. 
- * - * Return: - * Upon successful completion, scif_recv() returns the number of bytes - * received; otherwise in user mode -1 is returned and errno is set to - * indicate the error; in kernel mode the negative of one of the following - * errors is returned. - * - * Errors: - * EAGAIN - The destination node is returning from a low power state - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - flags is invalid, or len is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOTCONN - The endpoint is not connected - */ -int scif_recv(scif_epd_t epd, void *msg, int len, int flags); - -/** - * scif_register() - Mark a memory region for remote access. - * @epd: endpoint descriptor - * @addr: starting virtual address - * @len: length of range - * @offset: offset of window - * @prot_flags: read/write protection flags - * @map_flags: mapping flags - * - * The scif_register() function opens a window, a range of whole pages of the - * registered address space of the endpoint epd, starting at offset po and - * continuing for len bytes. The value of po, further described below, is a - * function of the parameters offset and len, and the value of map_flags. Each - * page of the window represents the physical memory page which backs the - * corresponding page of the range of virtual address pages starting at addr - * and continuing for len bytes. addr and len are constrained to be multiples - * of the page size. A successful scif_register() call returns po. - * - * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset - * exactly, and offset is constrained to be a multiple of the page size. The - * mapping established by scif_register() will not replace any existing - * registration; an error is returned if any page within the range [offset, - * offset + len - 1] intersects an existing window. - * - * When SCIF_MAP_FIXED is not set, the implementation uses offset in an - * implementation-defined manner to arrive at po. The po value so chosen will - * be an area of the registered address space that the implementation deems - * suitable for a mapping of len bytes. An offset value of 0 is interpreted as - * granting the implementation complete freedom in selecting po, subject to - * constraints described below. A non-zero value of offset is taken to be a - * suggestion of an offset near which the mapping should be placed. When the - * implementation selects a value for po, it does not replace any extant - * window. In all cases, po will be a multiple of the page size. - * - * The physical pages which are so represented by a window are available for - * access in calls to mmap(), scif_readfrom(), scif_writeto(), - * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the - * physical pages represented by the window will not be reused by the memory - * subsystem for any other purpose. Note that the same physical page may be - * represented by multiple windows. - * - * Subsequent operations which change the memory pages to which virtual - * addresses are mapped (such as mmap(), munmap()) have no effect on - * existing window. - * - * If the process will fork(), it is recommended that the registered - * virtual address range be marked with MADV_DONTFORK. Doing so will prevent - * problems due to copy-on-write semantics. 
- * - * The prot_flags argument is formed by OR'ing together one or more of the - * following values. - * SCIF_PROT_READ - allow read operations from the window - * SCIF_PROT_WRITE - allow write operations to the window - * - * Return: - * Upon successful completion, scif_register() returns the offset at which the - * mapping was placed (po); otherwise in user mode SCIF_REGISTER_FAILED (that - * is (off_t *)-1) is returned and errno is set to indicate the error; in - * kernel mode the negative of one of the following errors is returned. - * - * Errors: - * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags, and pages in the range - * [offset, offset + len -1] are already registered - * EAGAIN - The mapping could not be performed due to lack of resources - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - map_flags is invalid, or prot_flags is invalid, or SCIF_MAP_FIXED is - * set in flags, and offset is not a multiple of the page size, or addr is not a - * multiple of the page size, or len is not a multiple of the page size, or is - * 0, or offset is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOTCONN -The endpoint is not connected - */ -off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset, - int prot_flags, int map_flags); - -/** - * scif_unregister() - Mark a memory region for remote access. - * @epd: endpoint descriptor - * @offset: start of range to unregister - * @len: length of range to unregister - * - * The scif_unregister() function closes those previously registered windows - * which are entirely within the range [offset, offset + len - 1]. It is an - * error to specify a range which intersects only a subrange of a window. - * - * On a successful return, pages within the window may no longer be specified - * in calls to mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(), - * scif_vwriteto(), scif_get_pages, and scif_fence_signal(). The window, - * however, continues to exist until all previous references against it are - * removed. A window is referenced if there is a mapping to it created by - * mmap(), or if scif_get_pages() was called against the window - * (and the pages have not been returned via scif_put_pages()). A window is - * also referenced while an RMA, in which some range of the window is a source - * or destination, is in progress. Finally a window is referenced while some - * offset in that window was specified to scif_fence_signal(), and the RMAs - * marked by that call to scif_fence_signal() have not completed. While a - * window is in this state, its registered address space pages are not - * available for use in a new registered window. - * - * When all such references to the window have been removed, its references to - * all the physical pages which it represents are removed. Similarly, the - * registered address space pages of the window become available for - * registration in a new window. - * - * Return: - * Upon successful completion, scif_unregister() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. In the event of an - * error, no windows are unregistered. 
- * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - the range [offset, offset + len - 1] intersects a subrange of a - * window, or offset is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid for the - * registered address space of epd - */ -int scif_unregister(scif_epd_t epd, off_t offset, size_t len); - -/** - * scif_readfrom() - Copy from a remote address space - * @epd: endpoint descriptor - * @loffset: offset in local registered address space to - * which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space - * from which to copy - * @rma_flags: transfer mode flags - * - * scif_readfrom() copies len bytes from the remote registered address space of - * the peer of endpoint epd, starting at the offset roffset to the local - * registered address space of epd, starting at the offset loffset. - * - * Each of the specified ranges [loffset, loffset + len - 1] and [roffset, - * roffset + len - 1] must be within some registered window or windows of the - * local and remote nodes. A range may intersect multiple registered windows, - * but only if those windows are contiguous in the registered address space. - * - * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using - * programmed read/writes. Otherwise the data is copied using DMA. If rma_- - * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the - * transfer is complete. Otherwise, the transfer may be performed asynchron- - * ously. The order in which any two asynchronous RMA operations complete - * is non-deterministic. The synchronization functions, scif_fence_mark()/ - * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to - * the completion of asynchronous RMA operations on the same endpoint. - * - * The DMA transfer of individual bytes is not guaranteed to complete in - * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last - * cacheline or partial cacheline of the source range will become visible on - * the destination node after all other transferred data in the source - * range has become visible on the destination node. - * - * The optimal DMA performance will likely be realized if both - * loffset and roffset are cacheline aligned (are a multiple of 64). Lower - * performance will likely be realized if loffset and roffset are not - * cacheline aligned but are separated by some multiple of 64. The lowest level - * of performance is likely if loffset and roffset are not separated by a - * multiple of 64. - * - * The rma_flags argument is formed by ORing together zero or more of the - * following values. - * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA - * engine. - * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the - * transfer has completed. Passing this flag results in the - * current implementation busy waiting and consuming CPU cycles - * while the DMA transfer is in progress for best performance by - * avoiding the interrupt latency. 
- * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of - * the source range becomes visible on the destination node - * after all other transferred data in the source range has - * become visible on the destination - * - * Return: - * Upon successful completion, scif_readfrom() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EACCES - Attempt to write to a read-only range - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - rma_flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered - * address space of epd, or, The range [roffset, roffset + len - 1] is invalid - * for the registered address space of the peer of epd, or loffset or roffset - * is negative - */ -int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t - roffset, int rma_flags); - -/** - * scif_writeto() - Copy to a remote address space - * @epd: endpoint descriptor - * @loffset: offset in local registered address space - * from which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space to - * which to copy - * @rma_flags: transfer mode flags - * - * scif_writeto() copies len bytes from the local registered address space of - * epd, starting at the offset loffset to the remote registered address space - * of the peer of endpoint epd, starting at the offset roffset. - * - * Each of the specified ranges [loffset, loffset + len - 1] and [roffset, - * roffset + len - 1] must be within some registered window or windows of the - * local and remote nodes. A range may intersect multiple registered windows, - * but only if those windows are contiguous in the registered address space. - * - * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using - * programmed read/writes. Otherwise the data is copied using DMA. If rma_- - * flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the - * transfer is complete. Otherwise, the transfer may be performed asynchron- - * ously. The order in which any two asynchronous RMA operations complete - * is non-deterministic. The synchronization functions, scif_fence_mark()/ - * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to - * the completion of asynchronous RMA operations on the same endpoint. - * - * The DMA transfer of individual bytes is not guaranteed to complete in - * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last - * cacheline or partial cacheline of the source range will become visible on - * the destination node after all other transferred data in the source - * range has become visible on the destination node. - * - * The optimal DMA performance will likely be realized if both - * loffset and roffset are cacheline aligned (are a multiple of 64). Lower - * performance will likely be realized if loffset and roffset are not cacheline - * aligned but are separated by some multiple of 64. The lowest level of - * performance is likely if loffset and roffset are not separated by a multiple - * of 64. - * - * The rma_flags argument is formed by ORing together zero or more of the - * following values. 
- * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA - * engine. - * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the - * transfer has completed. Passing this flag results in the - * current implementation busy waiting and consuming CPU cycles - * while the DMA transfer is in progress for best performance by - * avoiding the interrupt latency. - * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of - * the source range becomes visible on the destination node - * after all other transferred data in the source range has - * become visible on the destination - * - * Return: - * Upon successful completion, scif_readfrom() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EACCES - Attempt to write to a read-only range - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - rma_flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered - * address space of epd, or, The range [roffset , roffset + len -1] is invalid - * for the registered address space of the peer of epd, or loffset or roffset - * is negative - */ -int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t - roffset, int rma_flags); - -/** - * scif_vreadfrom() - Copy from a remote address space - * @epd: endpoint descriptor - * @addr: address to which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space - * from which to copy - * @rma_flags: transfer mode flags - * - * scif_vreadfrom() copies len bytes from the remote registered address - * space of the peer of endpoint epd, starting at the offset roffset, to local - * memory, starting at addr. - * - * The specified range [roffset, roffset + len - 1] must be within some - * registered window or windows of the remote nodes. The range may - * intersect multiple registered windows, but only if those windows are - * contiguous in the registered address space. - * - * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using - * programmed read/writes. Otherwise the data is copied using DMA. If rma_- - * flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after the - * transfer is complete. Otherwise, the transfer may be performed asynchron- - * ously. The order in which any two asynchronous RMA operations complete - * is non-deterministic. The synchronization functions, scif_fence_mark()/ - * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to - * the completion of asynchronous RMA operations on the same endpoint. - * - * The DMA transfer of individual bytes is not guaranteed to complete in - * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last - * cacheline or partial cacheline of the source range will become visible on - * the destination node after all other transferred data in the source - * range has become visible on the destination node. - * - * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back - * the specified local memory range may be remain in a pinned state even after - * the specified transfer completes. 
This may reduce overhead if some or all of - * the same virtual address range is referenced in a subsequent call of - * scif_vreadfrom() or scif_vwriteto(). - * - * The optimal DMA performance will likely be realized if both - * addr and roffset are cacheline aligned (are a multiple of 64). Lower - * performance will likely be realized if addr and roffset are not - * cacheline aligned but are separated by some multiple of 64. The lowest level - * of performance is likely if addr and roffset are not separated by a - * multiple of 64. - * - * The rma_flags argument is formed by ORing together zero or more of the - * following values. - * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA - * engine. - * SCIF_RMA_USECACHE - enable registration caching - * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the - * transfer has completed. Passing this flag results in the - * current implementation busy waiting and consuming CPU cycles - * while the DMA transfer is in progress for best performance by - * avoiding the interrupt latency. - * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of - * the source range becomes visible on the destination node - * after all other transferred data in the source range has - * become visible on the destination - * - * Return: - * Upon successful completion, scif_vreadfrom() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EACCES - Attempt to write to a read-only range - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - rma_flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the - * registered address space of epd - */ -int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset, - int rma_flags); - -/** - * scif_vwriteto() - Copy to a remote address space - * @epd: endpoint descriptor - * @addr: address from which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space to - * which to copy - * @rma_flags: transfer mode flags - * - * scif_vwriteto() copies len bytes from the local memory, starting at addr, to - * the remote registered address space of the peer of endpoint epd, starting at - * the offset roffset. - * - * The specified range [roffset, roffset + len - 1] must be within some - * registered window or windows of the remote nodes. The range may intersect - * multiple registered windows, but only if those windows are contiguous in the - * registered address space. - * - * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using - * programmed read/writes. Otherwise the data is copied using DMA. If rma_- - * flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the - * transfer is complete. Otherwise, the transfer may be performed asynchron- - * ously. The order in which any two asynchronous RMA operations complete - * is non-deterministic. The synchronization functions, scif_fence_mark()/ - * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to - * the completion of asynchronous RMA operations on the same endpoint. 
- * - * The DMA transfer of individual bytes is not guaranteed to complete in - * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last - * cacheline or partial cacheline of the source range will become visible on - * the destination node after all other transferred data in the source - * range has become visible on the destination node. - * - * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back - * the specified local memory range may be remain in a pinned state even after - * the specified transfer completes. This may reduce overhead if some or all of - * the same virtual address range is referenced in a subsequent call of - * scif_vreadfrom() or scif_vwriteto(). - * - * The optimal DMA performance will likely be realized if both - * addr and offset are cacheline aligned (are a multiple of 64). Lower - * performance will likely be realized if addr and offset are not cacheline - * aligned but are separated by some multiple of 64. The lowest level of - * performance is likely if addr and offset are not separated by a multiple of - * 64. - * - * The rma_flags argument is formed by ORing together zero or more of the - * following values. - * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA - * engine. - * SCIF_RMA_USECACHE - allow registration caching - * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the - * transfer has completed. Passing this flag results in the - * current implementation busy waiting and consuming CPU cycles - * while the DMA transfer is in progress for best performance by - * avoiding the interrupt latency. - * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of - * the source range becomes visible on the destination node - * after all other transferred data in the source range has - * become visible on the destination - * - * Return: - * Upon successful completion, scif_vwriteto() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EACCES - Attempt to write to a read-only range - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - rma_flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the - * registered address space of epd - */ -int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset, - int rma_flags); - -/** - * scif_fence_mark() - Mark previously issued RMAs - * @epd: endpoint descriptor - * @flags: control flags - * @mark: marked value returned as output. - * - * scif_fence_mark() returns after marking the current set of all uncompleted - * RMAs initiated through the endpoint epd or the current set of all - * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are - * marked with a value returned at mark. The application may subsequently call - * scif_fence_wait(), passing the value returned at mark, to await completion - * of all RMAs so marked. - * - * The flags argument has exactly one of the following values. 
- * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint - * epd are marked - * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer - * of endpoint epd are marked - * - * Return: - * Upon successful completion, scif_fence_mark() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENOMEM - Insufficient kernel memory was available - */ -int scif_fence_mark(scif_epd_t epd, int flags, int *mark); - -/** - * scif_fence_wait() - Wait for completion of marked RMAs - * @epd: endpoint descriptor - * @mark: mark request - * - * scif_fence_wait() returns after all RMAs marked with mark have completed. - * The value passed in mark must have been obtained in a previous call to - * scif_fence_mark(). - * - * Return: - * Upon successful completion, scif_fence_wait() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENOMEM - Insufficient kernel memory was available - */ -int scif_fence_wait(scif_epd_t epd, int mark); - -/** - * scif_fence_signal() - Request a memory update on completion of RMAs - * @epd: endpoint descriptor - * @loff: local offset - * @lval: local value to write to loffset - * @roff: remote offset - * @rval: remote value to write to roffset - * @flags: flags - * - * scif_fence_signal() returns after marking the current set of all uncompleted - * RMAs initiated through the endpoint epd or marking the current set of all - * uncompleted RMAs initiated through the peer of endpoint epd. - * - * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the - * marked set, lval is written to memory at the address corresponding to offset - * loff in the local registered address space of epd. loff must be within a - * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion - * of the RMAs in the marked set, rval is written to memory at the address - * corresponding to offset roff in the remote registered address space of epd. - * roff must be within a remote registered window of the peer of epd. Note - * that any specified offset must be DWORD (4 byte / 32 bit) aligned. - * - * The flags argument is formed by OR'ing together the following. - * Exactly one of the following values. - * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint - * epd are marked - * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer - * of endpoint epd are marked - * One or more of the following values. - * SCIF_SIGNAL_LOCAL - On completion of the marked set of RMAs, write lval to - * memory at the address corresponding to offset loff in the local - * registered address space of epd. 
- * SCIF_SIGNAL_REMOTE - On completion of the marked set of RMAs, write rval to - * memory at the address corresponding to offset roff in the remote - * registered address space of epd. - * - * Return: - * Upon successful completion, scif_fence_signal() returns 0; otherwise in - * user mode -1 is returned and errno is set to indicate the error; in kernel - * mode the negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - flags is invalid, or loff or roff are not DWORD aligned - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - loff is invalid for the registered address of epd, or roff is invalid - * for the registered address space, of the peer of epd - */ -int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, off_t roff, - u64 rval, int flags); - -/** - * scif_get_node_ids() - Return information about online nodes - * @nodes: array in which to return online node IDs - * @len: number of entries in the nodes array - * @self: address to place the node ID of the local node - * - * scif_get_node_ids() fills in the nodes array with up to len node IDs of the - * nodes in the SCIF network. If there is not enough space in nodes, as - * indicated by the len parameter, only len node IDs are returned in nodes. The - * return value of scif_get_node_ids() is the total number of nodes currently in - * the SCIF network. By checking the return value against the len parameter, - * the user may determine if enough space for nodes was allocated. - * - * The node ID of the local node is returned at self. - * - * Return: - * Upon successful completion, scif_get_node_ids() returns the actual number of - * online nodes in the SCIF network including 'self'; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode no - * errors are returned. - */ -int scif_get_node_ids(u16 *nodes, int len, u16 *self); - -/** - * scif_pin_pages() - Pin a set of pages - * @addr: Virtual address of range to pin - * @len: Length of range to pin - * @prot_flags: Page protection flags - * @map_flags: Page classification flags - * @pinned_pages: Handle to pinned pages - * - * scif_pin_pages() pins (locks in physical memory) the physical pages which - * back the range of virtual address pages starting at addr and continuing for - * len bytes. addr and len are constrained to be multiples of the page size. A - * successful scif_pin_pages() call returns a handle to pinned_pages which may - * be used in subsequent calls to scif_register_pinned_pages(). - * - * The pages will remain pinned as long as there is a reference against the - * scif_pinned_pages_t value returned by scif_pin_pages() and until - * scif_unpin_pages() is called, passing the scif_pinned_pages_t value. A - * reference is added to a scif_pinned_pages_t value each time a window is - * created by calling scif_register_pinned_pages() and passing the - * scif_pinned_pages_t value. A reference is removed from a - * scif_pinned_pages_t value each time such a window is deleted. - * - * Subsequent operations which change the memory pages to which virtual - * addresses are mapped (such as mmap(), munmap()) have no effect on the - * scif_pinned_pages_t value or windows created against it. 
- * - * If the process will fork(), it is recommended that the registered - * virtual address range be marked with MADV_DONTFORK. Doing so will prevent - * problems due to copy-on-write semantics. - * - * The prot_flags argument is formed by OR'ing together one or more of the - * following values. - * SCIF_PROT_READ - allow read operations against the pages - * SCIF_PROT_WRITE - allow write operations against the pages - * The map_flags argument can be set as SCIF_MAP_KERNEL to interpret addr as a - * kernel space address. By default, addr is interpreted as a user space - * address. - * - * Return: - * Upon successful completion, scif_pin_pages() returns 0; otherwise the - * negative of one of the following errors is returned. - * - * Errors: - * EINVAL - prot_flags is invalid, map_flags is invalid, or offset is negative - * ENOMEM - Not enough space - */ -int scif_pin_pages(void *addr, size_t len, int prot_flags, int map_flags, - scif_pinned_pages_t *pinned_pages); - -/** - * scif_unpin_pages() - Unpin a set of pages - * @pinned_pages: Handle to pinned pages to be unpinned - * - * scif_unpin_pages() prevents scif_register_pinned_pages() from registering new - * windows against pinned_pages. The physical pages represented by pinned_pages - * will remain pinned until all windows previously registered against - * pinned_pages are deleted (the window is scif_unregister()'d and all - * references to the window are removed (see scif_unregister()). - * - * pinned_pages must have been obtain from a previous call to scif_pin_pages(). - * After calling scif_unpin_pages(), it is an error to pass pinned_pages to - * scif_register_pinned_pages(). - * - * Return: - * Upon successful completion, scif_unpin_pages() returns 0; otherwise the - * negative of one of the following errors is returned. - * - * Errors: - * EINVAL - pinned_pages is not valid - */ -int scif_unpin_pages(scif_pinned_pages_t pinned_pages); - -/** - * scif_register_pinned_pages() - Mark a memory region for remote access. - * @epd: endpoint descriptor - * @pinned_pages: Handle to pinned pages - * @offset: Registered address space offset - * @map_flags: Flags which control where pages are mapped - * - * The scif_register_pinned_pages() function opens a window, a range of whole - * pages of the registered address space of the endpoint epd, starting at - * offset po. The value of po, further described below, is a function of the - * parameters offset and pinned_pages, and the value of map_flags. Each page of - * the window represents a corresponding physical memory page of the range - * represented by pinned_pages; the length of the window is the same as the - * length of range represented by pinned_pages. A successful - * scif_register_pinned_pages() call returns po as the return value. - * - * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset - * exactly, and offset is constrained to be a multiple of the page size. The - * mapping established by scif_register_pinned_pages() will not replace any - * existing registration; an error is returned if any page of the new window - * would intersect an existing window. - * - * When SCIF_MAP_FIXED is not set, the implementation uses offset in an - * implementation-defined manner to arrive at po. The po so chosen will be an - * area of the registered address space that the implementation deems suitable - * for a mapping of the required size. 
An offset value of 0 is interpreted as - * granting the implementation complete freedom in selecting po, subject to - * constraints described below. A non-zero value of offset is taken to be a - * suggestion of an offset near which the mapping should be placed. When the - * implementation selects a value for po, it does not replace any extant - * window. In all cases, po will be a multiple of the page size. - * - * The physical pages which are so represented by a window are available for - * access in calls to scif_get_pages(), scif_readfrom(), scif_writeto(), - * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the - * physical pages represented by the window will not be reused by the memory - * subsystem for any other purpose. Note that the same physical page may be - * represented by multiple windows. - * - * Windows created by scif_register_pinned_pages() are unregistered by - * scif_unregister(). - * - * The map_flags argument can be set to SCIF_MAP_FIXED which interprets a - * fixed offset. - * - * Return: - * Upon successful completion, scif_register_pinned_pages() returns the offset - * at which the mapping was placed (po); otherwise the negative of one of the - * following errors is returned. - * - * Errors: - * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags and pages in the new window - * would intersect an existing window - * EAGAIN - The mapping could not be performed due to lack of resources - * ECONNRESET - Connection reset by peer - * EINVAL - map_flags is invalid, or SCIF_MAP_FIXED is set in map_flags, and - * offset is not a multiple of the page size, or offset is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOTCONN - The endpoint is not connected - */ -off_t scif_register_pinned_pages(scif_epd_t epd, - scif_pinned_pages_t pinned_pages, - off_t offset, int map_flags); - -/** - * scif_get_pages() - Add references to remote registered pages - * @epd: endpoint descriptor - * @offset: remote registered offset - * @len: length of range of pages - * @pages: returned scif_range structure - * - * scif_get_pages() returns the addresses of the physical pages represented by - * those pages of the registered address space of the peer of epd, starting at - * offset and continuing for len bytes. offset and len are constrained to be - * multiples of the page size. - * - * All of the pages in the specified range [offset, offset + len - 1] must be - * within a single window of the registered address space of the peer of epd. - * - * The addresses are returned as a virtually contiguous array pointed to by the - * phys_addr component of the scif_range structure whose address is returned in - * pages. The nr_pages component of scif_range is the length of the array. The - * prot_flags component of scif_range holds the protection flag value passed - * when the pages were registered. - * - * Each physical page whose address is returned by scif_get_pages() remains - * available and will not be released for reuse until the scif_range structure - * is returned in a call to scif_put_pages(). The scif_range structure returned - * by scif_get_pages() must be unmodified. - * - * It is an error to call scif_close() on an endpoint on which a scif_range - * structure of that endpoint has not been returned to scif_put_pages(). - * - * Return: - * Upon successful completion, scif_get_pages() returns 0; otherwise the - * negative of one of the following errors is returned. 
- * Errors: - * ECONNRESET - Connection reset by peer. - * EINVAL - offset is not a multiple of the page size, or offset is negative, or - * len is not a multiple of the page size - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid - * for the registered address space of the peer epd - */ -int scif_get_pages(scif_epd_t epd, off_t offset, size_t len, - struct scif_range **pages); - -/** - * scif_put_pages() - Remove references from remote registered pages - * @pages: pages to be returned - * - * scif_put_pages() releases a scif_range structure previously obtained by - * calling scif_get_pages(). The physical pages represented by pages may - * be reused when the window which represented those pages is unregistered. - * Therefore, those pages must not be accessed after calling scif_put_pages(). - * - * Return: - * Upon successful completion, scif_put_pages() returns 0; otherwise the - * negative of one of the following errors is returned. - * Errors: - * EINVAL - pages does not point to a valid scif_range structure, or - * the scif_range structure pointed to by pages was already returned - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - */ -int scif_put_pages(struct scif_range *pages); - -/** - * scif_poll() - Wait for some event on an endpoint - * @epds: Array of endpoint descriptors - * @nepds: Length of epds - * @timeout: Upper limit on time for which scif_poll() will block - * - * scif_poll() waits for one of a set of endpoints to become ready to perform - * an I/O operation. - * - * The epds argument specifies the endpoint descriptors to be examined and the - * events of interest for each endpoint descriptor. epds is a pointer to an - * array with one member for each open endpoint descriptor of interest. - * - * The number of items in the epds array is specified in nepds. The epd field - * of scif_pollepd is an endpoint descriptor of an open endpoint. The field - * events is a bitmask specifying the events which the application is - * interested in. The field revents is an output parameter, filled by the - * kernel with the events that actually occurred. The bits returned in revents - * can include any of those specified in events, or one of the values EPOLLERR, - * EPOLLHUP, or EPOLLNVAL. (These three bits are meaningless in the events - * field, and will be set in the revents field whenever the corresponding - * condition is true.) - * - * If none of the events requested (and no error) has occurred for any of the - * endpoint descriptors, then scif_poll() blocks until one of the events occurs. - * - * The timeout argument specifies an upper limit on the time for which - * scif_poll() will block, in milliseconds. Specifying a negative value in - * timeout means an infinite timeout. - * - * The following bits may be set in events and returned in revents. - * EPOLLIN - Data may be received without blocking. For a connected - * endpoint, this means that scif_recv() may be called without blocking. For a - * listening endpoint, this means that scif_accept() may be called without - * blocking. - * EPOLLOUT - Data may be sent without blocking. For a connected endpoint, this - * means that scif_send() may be called without blocking. 
EPOLLOUT may also be - * used to block waiting for a non-blocking connect to complete. This bit value - * has no meaning for a listening endpoint and is ignored if specified. - * - * The following bits are only returned in revents, and are ignored if set in - * events. - * EPOLLERR - An error occurred on the endpoint - * EPOLLHUP - The connection to the peer endpoint was disconnected - * EPOLLNVAL - The specified endpoint descriptor is invalid. - * - * Return: - * Upon successful completion, scif_poll() returns a non-negative value. A - * positive value indicates the total number of endpoint descriptors that have - * been selected (that is, endpoint descriptors for which the revents member is - * non-zero). A value of 0 indicates that the call timed out and no endpoint - * descriptors have been selected. Otherwise in user mode -1 is returned and - * errno is set to indicate the error; in kernel mode the negative of one of - * the following errors is returned. - * - * Errors: - * EINTR - A signal occurred before any requested event - * EINVAL - The nepds argument is greater than {OPEN_MAX} - * ENOMEM - There was no space to allocate file descriptor tables - */ -int scif_poll(struct scif_pollepd *epds, unsigned int nepds, long timeout); - -/** - * scif_client_register() - Register a SCIF client - * @client: client to be registered - * - * scif_client_register() registers a SCIF client. The probe() method - * of the client is called when SCIF peer devices come online and the - * remove() method is called when the peer devices disappear. - * - * Return: - * Upon successful completion, scif_client_register() returns a non-negative - * value. Otherwise the return value is the same as subsys_interface_register() - * in the kernel. - */ -int scif_client_register(struct scif_client *client); - -/** - * scif_client_unregister() - Unregister a SCIF client - * @client: client to be unregistered - * - * scif_client_unregister() unregisters a SCIF client. - * - * Return: - * None - */ -void scif_client_unregister(struct scif_client *client); - -#endif /* __SCIF_H__ */ -- cgit v1.2.3 From a62f68f5ca53ab61cba2f0a410d0add7a6d54a52 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Oct 2020 17:35:46 +0200 Subject: cpufreq: Introduce cpufreq_driver_test_flags() Add a helper function to test the flags of the cpufreq driver in use against a given flags mask. In particular, this will be needed to test the CPUFREQ_NEED_UPDATE_LIMITS cpufreq driver flag in the schedutil governor. Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 038ed83aab41..1eaa04f1bae6 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -433,6 +433,7 @@ struct cpufreq_driver { int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); +bool cpufreq_driver_test_flags(u16 flags); const char *cpufreq_get_current_driver(void); void *cpufreq_get_driver_data(void); -- cgit v1.2.3 From 4169e889e5889405d54cec27d6e9f7f0ce3c7096 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 2 Sep 2020 23:25:55 -0500 Subject: include: jhash/signal: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, explicitly add break statements instead of letting the code fall through to the next case.
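A minimal illustrative sketch of the pattern involved (not taken from the patch; the function and variable names are invented): a case that does real work used to run straight into an empty case, which Clang's -Wimplicit-fallthrough flags unless the fall-through is either annotated or cut off by an explicit break.

/*
 * Illustrative sketch only, not part of the patch.  It assumes the kernel's
 * u32/u8 types from <linux/types.h> and the fallthrough macro from
 * <linux/compiler_attributes.h>.
 */
static u32 sum_tail(const u8 *k, u32 len)
{
	u32 a = 0;

	switch (len) {
	case 2:
		a += k[1];
		fallthrough;	/* intentional: also add k[0] below */
	case 1:
		a += k[0];
		break;		/* new: previously fell silently into case 0 */
	case 0:
		/* nothing left to add */
		break;
	}
	return a;
}

The added break changes no behaviour, since the following case is empty; it only makes the control flow explicit, which is what the hunks below do for jhash(), jhash2(), siginitset() and siginitsetinv().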
This patch adds four break statements that, together, fix almost 40,000 warnings when building Linux 5.10-rc1 with Clang 12.0.0 and this[1] change reverted. Notice that in order to enable -Wimplicit-fallthrough for Clang, such change[1] is meant to be reverted at some point. So, this patch helps to move in that direction. Something important to mention is that there is currently a discrepancy between GCC and Clang when dealing with switch fall-through to empty case statements or to cases that only contain a break/continue/return statement[2][3][4]. Now that the -Wimplicit-fallthrough option has been globally enabled[5], any compiler should really warn on missing either a fallthrough annotation or any of the other case-terminating statements (break/continue/return/ goto) when falling through to the next case statement. Making exceptions to this introduces variation in case handling which may continue to lead to bugs, misunderstandings, and a general lack of robustness. The point of enabling options like -Wimplicit-fallthrough is to prevent human error and aid developers in spotting bugs before their code is even built/ submitted/committed, therefore eliminating classes of bugs. So, in order to really accomplish this, we should, and can, move in the direction of addressing any error-prone scenarios and get rid of the unintentional fallthrough bug-class in the kernel, entirely, even if there is some minor redundancy. Better to have explicit case-ending statements than continue to have exceptions where one must guess as to the right result. The compiler will eliminate any actual redundancy. [1] commit e2079e93f562c ("kbuild: Do not enable -Wimplicit-fallthrough for clang for now") [2] https://github.com/ClangBuiltLinux/linux/issues/636 [3] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91432 [4] https://godbolt.org/z/xgkvIh [5] commit a035d552a93b ("Makefile: Globally enable fall-through warning") Co-developed-by: Kees Cook Signed-off-by: Kees Cook Signed-off-by: Gustavo A. R. Silva --- include/linux/jhash.h | 2 ++ include/linux/signal.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jhash.h b/include/linux/jhash.h index cfb62e9f37be..ab7f8c152b89 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -99,6 +99,7 @@ static inline u32 jhash(const void *key, u32 length, u32 initval) case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); + break; case 0: /* Nothing left to add */ break; } @@ -136,6 +137,7 @@ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) case 2: b += k[1]; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); + break; case 0: /* Nothing left to add */ break; } diff --git a/include/linux/signal.h b/include/linux/signal.h index 7bbc0e9cf084..b256f9c65661 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -238,6 +238,7 @@ static inline void siginitset(sigset_t *set, unsigned long mask) memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = 0; + break; case 1: ; } } @@ -250,6 +251,7 @@ static inline void siginitsetinv(sigset_t *set, unsigned long mask) memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = -1; + break; case 1: ; } } -- cgit v1.2.3 From a4147d855f50a676ebe61833a681f7c71945f343 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Mon, 31 Aug 2020 10:18:04 -0500 Subject: dmaengine: ti-cppi5: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. Silva --- include/linux/dma/ti-cppi5.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma/ti-cppi5.h b/include/linux/dma/ti-cppi5.h index 5896441ee604..efa2f0309f00 100644 --- a/include/linux/dma/ti-cppi5.h +++ b/include/linux/dma/ti-cppi5.h @@ -47,7 +47,7 @@ struct cppi5_host_desc_t { u32 buf_info1; u32 org_buf_len; u64 org_buf_ptr; - u32 epib[0]; + u32 epib[]; } __packed; #define CPPI5_DESC_MIN_ALIGN (16U) @@ -139,7 +139,7 @@ struct cppi5_desc_epib_t { */ struct cppi5_monolithic_desc_t { struct cppi5_desc_hdr_t hdr; - u32 epib[0]; + u32 epib[]; }; #define CPPI5_INFO2_MDESC_DATA_OFFSET_SHIFT (18U) -- cgit v1.2.3 From 277ffd6c1ec0aa60856a03e18455fcca7d2a1186 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 31 Aug 2020 10:19:18 -0500 Subject: mailbox: zynqmp-ipi-message: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. Silva --- include/linux/mailbox/zynqmp-ipi-message.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mailbox/zynqmp-ipi-message.h b/include/linux/mailbox/zynqmp-ipi-message.h index 9542b41eacfd..35ce84c8ca02 100644 --- a/include/linux/mailbox/zynqmp-ipi-message.h +++ b/include/linux/mailbox/zynqmp-ipi-message.h @@ -14,7 +14,7 @@ */ struct zynqmp_ipi_message { size_t len; - u8 data[0]; + u8 data[]; }; #endif /* _LINUX_ZYNQMP_IPI_MESSAGE_H_ */ -- cgit v1.2.3 From 883541051567a62add043a9f4ca5a31f2970bffd Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 31 Aug 2020 10:21:14 -0500 Subject: platform/chrome: cros_ec_commands: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. 
Silva --- include/linux/platform_data/cros_ec_commands.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index 1fcfe9e63cb9..a3a9a878415f 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -1419,7 +1419,7 @@ struct ec_response_flash_info_2 { uint16_t num_banks_total; /* Number of banks described in banks array. */ uint16_t num_banks_desc; - struct ec_flash_bank banks[0]; + struct ec_flash_bank banks[]; } __ec_align4; /* @@ -2420,12 +2420,12 @@ struct ec_response_motion_sense_fifo_info { /* Total amount of vector lost */ uint16_t total_lost; /* Lost events since the last fifo_info, per sensors */ - uint16_t lost[0]; + uint16_t lost[]; } __ec_todo_packed; struct ec_response_motion_sense_fifo_data { uint32_t number_data; - struct ec_response_motion_sensor_data data[0]; + struct ec_response_motion_sensor_data data[]; } __ec_todo_packed; /* List supported activity recognition */ @@ -3093,7 +3093,7 @@ struct ec_response_tmp006_get_calibration_v1 { uint8_t algorithm; uint8_t num_params; uint8_t reserved[2]; - float val[0]; + float val[]; } __ec_align4; struct ec_params_tmp006_set_calibration_v1 { @@ -3101,7 +3101,7 @@ struct ec_params_tmp006_set_calibration_v1 { uint8_t algorithm; uint8_t num_params; uint8_t reserved; - float val[0]; + float val[]; } __ec_align4; @@ -5076,7 +5076,7 @@ struct ec_response_pd_log { uint8_t type; /* event type : see PD_EVENT_xx below */ uint8_t size_port; /* [7:5] port number [4:0] payload size in bytes */ uint16_t data; /* type-defined data payload */ - uint8_t payload[0]; /* optional additional data payload: 0..16 bytes */ + uint8_t payload[]; /* optional additional data payload: 0..16 bytes */ } __ec_align4; /* The timestamp is the microsecond counter shifted to get about a ms. */ @@ -5789,7 +5789,7 @@ struct ec_response_fp_encryption_status { struct ec_response_tp_frame_info { uint32_t n_frames; - uint32_t frame_sizes[0]; + uint32_t frame_sizes[]; } __ec_align4; /* Create a snapshot of current frame readings */ -- cgit v1.2.3 From 120088832042e6dc9866160ff267f8c347bf53e6 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 31 Aug 2020 10:21:55 -0500 Subject: platform/chrome: cros_ec_proto: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. 
Silva --- include/linux/platform_data/cros_ec_proto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 4a415ae851ef..02599687770c 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -69,7 +69,7 @@ struct cros_ec_command { uint32_t outsize; uint32_t insize; uint32_t result; - uint8_t data[0]; + uint8_t data[]; }; /** -- cgit v1.2.3 From 5e01fdff04b7f7c3b8d456c11c8a9f978b4ddf65 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 31 Aug 2020 08:25:42 -0500 Subject: fs: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. Silva --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0bd126418bb6..21cc971fd960 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3285,7 +3285,7 @@ static inline ino_t parent_ino(struct dentry *dentry) */ struct simple_transaction_argresp { ssize_t size; - char data[0]; + char data[]; }; #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) -- cgit v1.2.3 From 080b6f40763565f65ebb9540219c71ce885cf568 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 28 Oct 2020 18:15:05 +0100 Subject: bpf: Don't rely on GCC __attribute__((optimize)) to disable GCSE Commit 3193c0836 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") introduced a __no_fgcse macro that expands to a function scope __attribute__((optimize("-fno-gcse"))), to disable a GCC specific optimization that was causing trouble on x86 builds, and was not expected to have any positive effect in the first place. However, as the GCC manual documents, __attribute__((optimize)) is not for production use, and results in all other optimization options to be forgotten for the function in question. This can cause all kinds of trouble, but in one particular reported case, it causes -fno-asynchronous-unwind-tables to be disregarded, resulting in .eh_frame info to be emitted for the function. This reverts commit 3193c0836, and instead, it disables the -fgcse optimization for the entire source file, but only when building for X86 using GCC with CONFIG_BPF_JIT_ALWAYS_ON disabled. Note that the original commit states that CONFIG_RETPOLINE=n triggers the issue, whereas CONFIG_RETPOLINE=y performs better without the optimization, so it is kept disabled in both cases. 
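An illustrative sketch of the replacement approach (the compiler-flag change itself lands outside include/linux, so it is not in the hunks shown here; the lines below are an assumption about how such a per-file flag is expressed in kbuild, not the verbatim Makefile hunk):

# Hypothetical sketch only.  CFLAGS_<file>.o is the standard kbuild mechanism
# for adding flags to a single object file, and cc-option drops the flag when
# the compiler does not support it; the real change additionally restricts
# this to GCC builds for x86.
ifndef CONFIG_BPF_JIT_ALWAYS_ON
CFLAGS_core.o += $(call cc-option,-fno-gcse)
endif

Unlike __attribute__((optimize("-fno-gcse"))), a file-scoped flag composes with the rest of the command line, so options such as -fno-asynchronous-unwind-tables stay in effect.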
Fixes: 3193c0836f20 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") Signed-off-by: Ard Biesheuvel Signed-off-by: Alexei Starovoitov Tested-by: Geert Uytterhoeven Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/lkml/CAMuHMdUg0WJHEcq6to0-eODpXPOywLot6UD2=GFHpzoj_hCoBQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20201028171506.15682-2-ardb@kernel.org --- include/linux/compiler-gcc.h | 2 -- include/linux/compiler_types.h | 4 ---- 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d1e3c6896b71..5deb37024574 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -175,5 +175,3 @@ #else #define __diag_GCC_8(s) #endif - -#define __no_fgcse __attribute__((optimize("-fno-gcse"))) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6e390d58a9f8..ac3fa37a84f9 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -247,10 +247,6 @@ struct ftrace_likely_data { #define asm_inline asm #endif -#ifndef __no_fgcse -# define __no_fgcse -#endif - /* Are two types/vars the same type (ignoring qualifiers)? */ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) -- cgit v1.2.3 From 0d519cbf38eed4f895aed197d4b135fa7f60f7c2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 23 Oct 2020 15:10:37 +0200 Subject: debugfs: remove return value of debugfs_create_devm_seqfile() No one checks the return value of debugfs_create_devm_seqfile(), as it's not needed, so make the return value void, so that no one tries to do so in the future. Link: https://lore.kernel.org/r/20201023131037.2500765-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 851dd1f9a8a5..d6c4cc9ecc77 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -144,10 +144,9 @@ void debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, struct debugfs_u32_array *array); -struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, - struct dentry *parent, - int (*read_fn)(struct seq_file *s, - void *data)); +void debugfs_create_devm_seqfile(struct device *dev, const char *name, + struct dentry *parent, + int (*read_fn)(struct seq_file *s, void *data)); bool debugfs_initialized(void); @@ -327,13 +326,12 @@ static inline void debugfs_create_u32_array(const char *name, umode_t mode, { } -static inline struct dentry *debugfs_create_devm_seqfile(struct device *dev, - const char *name, - struct dentry *parent, - int (*read_fn)(struct seq_file *s, - void *data)) +static inline void debugfs_create_devm_seqfile(struct device *dev, + const char *name, + struct dentry *parent, + int (*read_fn)(struct seq_file *s, + void *data)) { - return ERR_PTR(-ENODEV); } static inline ssize_t debugfs_read_file_bool(struct file *file, -- cgit v1.2.3 From 46d6c5ae953cc0be38efd0e469284df7c4328cf8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 29 Oct 2020 03:56:06 +0100 Subject: netfilter: use actual socket sk rather than skb sk when routing harder If netfilter changes the packet mark when mangling, the packet is rerouted using the route_me_harder set of functions. 
Prior to this commit, there's one big difference between route_me_harder and the ordinary initial routing functions, described in the comment above __ip_queue_xmit(): /* Note: skb->sk can be different from sk, in case of tunnels */ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, That function goes on to correctly make use of sk->sk_bound_dev_if, rather than skb->sk->sk_bound_dev_if. And indeed the comment is true: a tunnel will receive a packet in ndo_start_xmit with an initial skb->sk. It will make some transformations to that packet, and then it will send the encapsulated packet out of a *new* socket. That new socket will basically always have a different sk_bound_dev_if (otherwise there'd be a routing loop). So for the purposes of routing the encapsulated packet, the routing information as it pertains to the socket should come from that socket's sk, rather than the packet's original skb->sk. For that reason __ip_queue_xmit() and related functions all do the right thing. One might argue that all tunnels should just call skb_orphan(skb) before transmitting the encapsulated packet into the new socket. But tunnels do *not* do this -- and this is wisely avoided in skb_scrub_packet() too -- because features like TSQ rely on skb->destructor() being called when that buffer space is truly available again. Calling skb_orphan(skb) too early would result in buffers filling up unnecessarily and accounting info being all wrong. Instead, additional routing must take into account the new sk, just as __ip_queue_xmit() notes. So, this commit addresses the problem by fishing the correct sk out of state->sk -- it's already set properly in the call to nf_hook() in __ip_local_out(), which receives the sk as part of its normal functionality. So we make sure to plumb state->sk through the various route_me_harder functions, and then make correct use of it following the example of __ip_queue_xmit(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jason A.
Donenfeld Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv4.h | 2 +- include/linux/netfilter_ipv6.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index 082e2c41b7ff..5b70ca868bb1 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -16,7 +16,7 @@ struct ip_rt_info { u_int32_t mark; }; -int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned addr_type); +int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned addr_type); struct nf_queue_entry; diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 9b67394471e1..48314ade1506 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -42,7 +42,7 @@ struct nf_ipv6_ops { #if IS_MODULE(CONFIG_IPV6) int (*chk_addr)(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); - int (*route_me_harder)(struct net *net, struct sk_buff *skb); + int (*route_me_harder)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*dev_get_saddr)(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); @@ -143,9 +143,9 @@ static inline int nf_br_ip6_fragment(struct net *net, struct sock *sk, #endif } -int ip6_route_me_harder(struct net *net, struct sk_buff *skb); +int ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb); -static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb) +static inline int nf_ip6_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb) { #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); @@ -153,9 +153,9 @@ static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb) if (!v6_ops) return -EHOSTUNREACH; - return v6_ops->route_me_harder(net, skb); + return v6_ops->route_me_harder(net, sk, skb); #elif IS_BUILTIN(CONFIG_IPV6) - return ip6_route_me_harder(net, skb); + return ip6_route_me_harder(net, sk, skb); #else return -EHOSTUNREACH; #endif -- cgit v1.2.3 From c0391b6ab810381df632677a1dcbbbbd63d05b6d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 29 Oct 2020 13:50:03 +0100 Subject: netfilter: nf_tables: missing validation from the abort path If userspace does not include the trailing end of batch message, then nfnetlink aborts the transaction. This allows to check that ruleset updates trigger no errors. After this patch, invoking this command from the prerouting chain: # nft -c add rule x y fib saddr . oif type local fails since oif is not supported there. This patch fixes the lack of rule validation from the abort/check path to catch configuration errors such as the one above. 
Fixes: a654de8fdc18 ("netfilter: nf_tables: fix chain dependency validation") Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 89016d08f6a2..f6267e2883f2 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -24,6 +24,12 @@ struct nfnl_callback { const u_int16_t attr_count; /* number of nlattr's */ }; +enum nfnl_abort_action { + NFNL_ABORT_NONE = 0, + NFNL_ABORT_AUTOLOAD, + NFNL_ABORT_VALIDATE, +}; + struct nfnetlink_subsystem { const char *name; __u8 subsys_id; /* nfnetlink subsystem ID */ @@ -31,7 +37,8 @@ struct nfnetlink_subsystem { const struct nfnl_callback *cb; /* callback for individual types */ struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); - int (*abort)(struct net *net, struct sk_buff *skb, bool autoload); + int (*abort)(struct net *net, struct sk_buff *skb, + enum nfnl_abort_action action); void (*cleanup)(struct net *net); bool (*valid_genid)(struct net *net, u32 genid); }; -- cgit v1.2.3 From 290562075d4d9e85b7ff4104f9a634ffc3cccb69 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 27 Oct 2020 15:28:40 -0500 Subject: net/mlx5: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. Silva --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 651591a2965d..a092346c7b2d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5823,7 +5823,7 @@ struct mlx5_ifc_alloc_modify_header_context_in_bits { u8 reserved_at_68[0x10]; u8 num_of_actions[0x8]; - union mlx5_ifc_set_add_copy_action_in_auto_bits actions[0]; + union mlx5_ifc_set_add_copy_action_in_auto_bits actions[]; }; struct mlx5_ifc_dealloc_modify_header_context_out_bits { @@ -9761,7 +9761,7 @@ struct mlx5_ifc_mcda_reg_bits { u8 reserved_at_60[0x20]; - u8 data[0][0x20]; + u8 data[][0x20]; }; enum { -- cgit v1.2.3 From f51778db088b2407ec177f2f4da0f6290602aa3f Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 2 Nov 2020 12:43:27 +1100 Subject: swiotlb: using SIZE_MAX needs limits.h included After merging the drm-misc tree, linux-next build (arm multi_v7_defconfig) failed like this: In file included from drivers/gpu/drm/nouveau/nouveau_ttm.c:26: include/linux/swiotlb.h: In function 'swiotlb_max_mapping_size': include/linux/swiotlb.h:99:9: error: 'SIZE_MAX' undeclared (first use in this function) 99 | return SIZE_MAX; | ^~~~~~~~ include/linux/swiotlb.h:7:1: note: 'SIZE_MAX' is defined in header ''; did you forget to '#include '? 
6 | #include +++ |+#include 7 | #include include/linux/swiotlb.h:99:9: note: each undeclared identifier is reported only once for each function it appears in 99 | return SIZE_MAX; | ^~~~~~~~ Caused by commit abe420bfae52 ("swiotlb: Introduce swiotlb_max_mapping_size()") but only exposed by commit "drm/nouveu: fix swiotlb include" Fix it by including linux/limits.h as appropriate. Fixes: abe420bfae52 ("swiotlb: Introduce swiotlb_max_mapping_size()") Signed-off-by: Stephen Rothwell Link: https://lore.kernel.org/r/20201102124327.2f82b2a7@canb.auug.org.au Signed-off-by: Michael S. Tsirkin --- include/linux/swiotlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 046bb94bd4d6..fa5122c6711e 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -5,6 +5,7 @@ #include #include #include +#include struct device; struct page; -- cgit v1.2.3 From fc0021aa340af65a0a37d77be39e22aa886a6132 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Oct 2020 08:33:09 +0200 Subject: swiotlb: remove the tbl_dma_addr argument to swiotlb_tbl_map_single The tbl_dma_addr argument is used to check the DMA boundary for the allocations, and thus needs to be a dma_addr_t. swiotlb-xen instead passed a physical address, which could lead to incorrect results for strange offsets. Fix this by removing the parameter entirely and hard code the DMA address for io_tlb_start instead. Fixes: 91ffe4ad534a ("swiotlb-xen: introduce phys_to_dma/dma_to_phys translations") Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 513913ff7486..3bb72266a75a 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -45,13 +45,9 @@ enum dma_sync_target { SYNC_FOR_DEVICE = 1, }; -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t phys, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs); +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs); extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, -- cgit v1.2.3 From e0e398e204634db8fb71bd89cf2f6e3e5bd09b51 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 21 Oct 2020 21:12:15 +0200 Subject: PM: runtime: Drop runtime PM references to supplier on link removal While removing a device link, drop the supplier device's runtime PM usage counter as many times as needed to drop all of the runtime PM references to it from the consumer in addition to dropping the consumer's link count. Fixes: baa8809f6097 ("PM / runtime: Optimize the use of device links") Signed-off-by: Rafael J. 
Wysocki Cc: 5.1+ # 5.1+ Tested-by: Xiang Chen Reviewed-by: Greg Kroah-Hartman --- include/linux/pm_runtime.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 18b02dcc168e..eadc1fdebce6 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -58,7 +58,7 @@ extern void pm_runtime_clean_up_links(struct device *dev); extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); -extern void pm_runtime_drop_link(struct device *dev); +extern void pm_runtime_drop_link(struct device_link *link); /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. @@ -280,7 +280,7 @@ static inline void pm_runtime_clean_up_links(struct device *dev) {} static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} -static inline void pm_runtime_drop_link(struct device *dev) {} +static inline void pm_runtime_drop_link(struct device_link *link) {} #endif /* !CONFIG_PM */ -- cgit v1.2.3 From d6e36668598154820177bfd78c1621d8e6c580a2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 21 Oct 2020 21:13:10 +0200 Subject: PM: runtime: Drop pm_runtime_clean_up_links() After commit d12544fb2aa9 ("PM: runtime: Remove link state checks in rpm_get/put_supplier()") nothing prevents the consumer device's runtime PM from acquiring additional references to the supplier device after pm_runtime_clean_up_links() has run (or even while it is running), so calling this function from __device_release_driver() may be pointless (or even harmful). Moreover, it ignores stateless device links, so the runtime PM handling of managed and stateless device links is inconsistent because of it, so better get rid of it entirely. Fixes: d12544fb2aa9 ("PM: runtime: Remove link state checks in rpm_get/put_supplier()") Signed-off-by: Rafael J. 
Wysocki Cc: 5.1+ # 5.1+ Tested-by: Xiang Chen Reviewed-by: Greg Kroah-Hartman --- include/linux/pm_runtime.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index eadc1fdebce6..4b708f4e8eed 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -54,7 +54,6 @@ extern u64 pm_runtime_autosuspend_expiration(struct device *dev); extern void pm_runtime_update_max_time_suspended(struct device *dev, s64 delta_ns); extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); -extern void pm_runtime_clean_up_links(struct device *dev); extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); @@ -276,7 +275,6 @@ static inline u64 pm_runtime_autosuspend_expiration( struct device *dev) { return 0; } static inline void pm_runtime_set_memalloc_noio(struct device *dev, bool enable){} -static inline void pm_runtime_clean_up_links(struct device *dev) {} static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} -- cgit v1.2.3 From f8f6ae5d077a9bdaf5cbf2ac960a5d1a04b47482 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 1 Nov 2020 17:08:00 -0800 Subject: mm: always have io_remap_pfn_range() set pgprot_decrypted() The purpose of io_remap_pfn_range() is to map IO memory, such as a memory mapped IO exposed through a PCI BAR. IO devices do not understand encryption, so this memory must always be decrypted. Automatically call pgprot_decrypted() as part of the generic implementation. This fixes a bug where enabling AMD SME causes subsystems, such as RDMA, using io_remap_pfn_range() to expose BAR pages to user space to fail. The CPU will encrypt access to those BAR pages instead of passing unencrypted IO directly to the device. Places not mapping IO should use remap_pfn_range(). Fixes: aca20d546214 ("x86/mm: Add support to make use of Secure Memory Encryption") Signed-off-by: Jason Gunthorpe Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Tom Lendacky Cc: Thomas Gleixner Cc: Andrey Ryabinin Cc: Borislav Petkov Cc: Brijesh Singh Cc: Jonathan Corbet Cc: Dmitry Vyukov Cc: "Dave Young" Cc: Alexander Potapenko Cc: Konrad Rzeszutek Wilk Cc: Andy Lutomirski Cc: Larry Woodman Cc: Matt Fleming Cc: Ingo Molnar Cc: "Michael S. 
Tsirkin" Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Toshimitsu Kani Cc: Link: https://lkml.kernel.org/r/0-v1-025d64bdf6c4+e-amd_sme_fix_jgg@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++++ include/linux/pgtable.h | 4 ---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ef360fe70aaf..db6ae4d3fb4e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2759,6 +2759,15 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +#ifndef io_remap_pfn_range +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); +} +#endif + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 38c33eabea89..71125a4676c4 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1427,10 +1427,6 @@ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ -#ifndef io_remap_pfn_range -#define io_remap_pfn_range remap_pfn_range -#endif - #ifndef has_transparent_hugepage #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define has_transparent_hugepage() 1 -- cgit v1.2.3 From 286228d382ba6320f04fa2e7c6fc8d4d92e428f4 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 18 Dec 2019 09:39:02 +0100 Subject: can: can_create_echo_skb(): fix echo skb generation: always use skb_clone() All user space generated SKBs are owned by a socket (unless injected into the key via AF_PACKET). If a socket is closed, all associated skbs will be cleaned up. This leads to a problem when a CAN driver calls can_put_echo_skb() on a unshared SKB. If the socket is closed prior to the TX complete handler, can_get_echo_skb() and the subsequent delivering of the echo SKB to all registered callbacks, a SKB with a refcount of 0 is delivered. To avoid the problem, in can_get_echo_skb() the original SKB is now always cloned, regardless of shared SKB or not. If the process exists it can now safely discard its SKBs, without disturbing the delivery of the echo SKB. The problem shows up in the j1939 stack, when it clones the incoming skb, which detects the already 0 refcount. We can easily reproduce this with following example: testj1939 -B -r can0: & cansend can0 1823ff40#0123 WARNING: CPU: 0 PID: 293 at lib/refcount.c:25 refcount_warn_saturate+0x108/0x174 refcount_t: addition on 0; use-after-free. 
Modules linked in: coda_vpu imx_vdoa videobuf2_vmalloc dw_hdmi_ahb_audio vcan CPU: 0 PID: 293 Comm: cansend Not tainted 5.5.0-rc6-00376-g9e20dcb7040d #1 Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree) Backtrace: [] (dump_backtrace) from [] (show_stack+0x20/0x24) [] (show_stack) from [] (dump_stack+0x8c/0xa0) [] (dump_stack) from [] (__warn+0xe0/0x108) [] (__warn) from [] (warn_slowpath_fmt+0xa8/0xcc) [] (warn_slowpath_fmt) from [] (refcount_warn_saturate+0x108/0x174) [] (refcount_warn_saturate) from [] (j1939_can_recv+0x20c/0x210) [] (j1939_can_recv) from [] (can_rcv_filter+0xb4/0x268) [] (can_rcv_filter) from [] (can_receive+0xb0/0xe4) [] (can_receive) from [] (can_rcv+0x48/0x98) [] (can_rcv) from [] (__netif_receive_skb_one_core+0x64/0x88) [] (__netif_receive_skb_one_core) from [] (__netif_receive_skb+0x38/0x94) [] (__netif_receive_skb) from [] (netif_receive_skb_internal+0x64/0xf8) [] (netif_receive_skb_internal) from [] (netif_receive_skb+0x34/0x19c) [] (netif_receive_skb) from [] (can_rx_offload_napi_poll+0x58/0xb4) Fixes: 0ae89beb283a ("can: add destructor for self generated skbs") Signed-off-by: Oleksij Rempel Link: http://lore.kernel.org/r/20200124132656.22156-1-o.rempel@pengutronix.de Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 900b9f4e0605..fc61cf4eff1c 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -61,21 +61,17 @@ static inline void can_skb_set_owner(struct sk_buff *skb, struct sock *sk) */ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) { - if (skb_shared(skb)) { - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + struct sk_buff *nskb; - if (likely(nskb)) { - can_skb_set_owner(nskb, skb->sk); - consume_skb(skb); - return nskb; - } else { - kfree_skb(skb); - return NULL; - } + nskb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!nskb)) { + kfree_skb(skb); + return NULL; } - /* we can assume to have an unshared skb with proper owner */ - return skb; + can_skb_set_owner(nskb, skb->sk); + consume_skb(skb); + return nskb; } #endif /* !_CAN_SKB_H */ -- cgit v1.2.3 From 763e4cdc0f6d5cea45c896fef67f7be4bdefcca7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 29 Oct 2020 14:30:48 -0700 Subject: iomap: support partial page discard on writeback block mapping failure iomap writeback mapping failure only calls into ->discard_page() if the current page has not been added to the ioend. Accordingly, the XFS callback assumes a full page discard and invalidation. This is problematic for sub-page block size filesystems where some portion of a page might have been mapped successfully before a failure to map a delalloc block occurs. ->discard_page() is not called in that error scenario and the bio is explicitly failed by iomap via the error return from ->prepare_ioend(). As a result, the filesystem leaks delalloc blocks and corrupts the filesystem block counters. Since XFS is the only user of ->discard_page(), tweak the semantics to invoke the callback unconditionally on mapping errors and provide the file offset that failed to map. Update xfs_discard_page() to discard the corresponding portion of the file and pass the range along to iomap_invalidatepage(). The latter already properly handles both full and sub-page scenarios by not changing any iomap or page state on sub-page invalidations. 
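As a hedged illustration of the new semantics (this sketch is not part of the patch; example_discard_page() and example_punch_delalloc() are made-up placeholders for whatever the filesystem actually uses to drop its delayed-allocation state):

static void example_discard_page(struct page *page, loff_t fileoff)
{
	struct inode *inode = page->mapping->host;
	unsigned int poff = offset_in_page(fileoff);

	/* Hypothetical: release delayed allocations only for the part of
	 * the page that never made it into an ioend. */
	example_punch_delalloc(inode, fileoff, page_offset(page) + PAGE_SIZE);

	/* Sub-page invalidation: iomap leaves page and iomap state untouched
	 * unless the whole page is being invalidated. */
	iomap_invalidatepage(page, poff, PAGE_SIZE - poff);
}

Here fileoff is the file offset that failed to map, so the callback can act on just the unmapped tail of the page instead of assuming a full-page discard.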
Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 172b3397a1a3..5bd3cac4df9c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -221,7 +221,7 @@ struct iomap_writeback_ops { * Optional, allows the file system to discard state on a page where * we failed to submit any I/O. */ - void (*discard_page)(struct page *page); + void (*discard_page)(struct page *page, loff_t fileoff); }; struct iomap_writepage_ctx { -- cgit v1.2.3 From fdaf083cdfb556a45c422c8998268baf1ab26829 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Oct 2020 09:37:30 -0600 Subject: io_uring: properly handle SQPOLL request cancelations Track if a given task io_uring context contains SQPOLL instances, so we can iterate those for cancelation (and request counts). This ensures that we properly wait on SQPOLL contexts, and find everything that needs canceling. Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 868364cea3b7..35b2d845704d 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -30,7 +30,8 @@ struct io_uring_task { struct percpu_counter inflight; struct io_identity __identity; struct io_identity *identity; - bool in_idle; + atomic_t in_idle; + bool sqpoll; }; #if defined(CONFIG_IO_URING) -- cgit v1.2.3 From d4d50710a8b46082224376ef119a4dbb75b25c56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Nov 2020 09:27:33 +0100 Subject: seq_file: add seq_read_iter iov_iter based variant for reading a seq_file. seq_read is reimplemented on top of the iter variant. Signed-off-by: Christoph Hellwig Tested-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- include/linux/seq_file.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 813614d4b71f..b83b3ae3c877 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -107,6 +107,7 @@ void seq_pad(struct seq_file *m, char c); char *mangle_path(char *s, const char *p, const char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); +ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter); loff_t seq_lseek(struct file *, loff_t, int); int seq_release(struct inode *, struct file *); int seq_write(struct seq_file *seq, const void *data, size_t len); -- cgit v1.2.3 From ede7dc7fa0af619afc08995776eadb9ff3b0a711 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:54 -0800 Subject: jbd2: rename j_maxlen to j_total_len and add jbd2_journal_max_txn_bufs The on-disk superblock field sb->s_maxlen represents the total size of the journal including the fast commit area and is no more the max number of blocks available for a transaction. The maximum number of blocks available to a transaction is reduced by the number of fast commit blocks. So, this patch renames j_maxlen to j_total_len to better represent its intent. Also, it adds a function to calculate max number of bufs available for a transaction. 
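To make the rename concrete, a minimal sketch (assumed, not taken from the patch; example_journal_sizing() is a hypothetical setup helper) of how a journal initialization path would now size its running transaction through the new helper rather than reading the raw journal length:

/* The usable transaction size excludes the fast commit area, which
 * jbd2_journal_get_max_txn_bufs() accounts for via j_fc_wbufsize. */
static void example_journal_sizing(journal_t *journal)
{
	journal->j_max_transaction_buffers =
		jbd2_journal_get_max_txn_bufs(journal);
}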
Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-6-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 1d5566af48ac..e0b6b53eae64 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -988,9 +988,9 @@ struct journal_s struct block_device *j_fs_dev; /** - * @j_maxlen: Total maximum capacity of the journal region on disk. + * @j_total_len: Total maximum capacity of the journal region on disk. */ - unsigned int j_maxlen; + unsigned int j_total_len; /** * @j_reserved_credits: @@ -1624,6 +1624,11 @@ int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); int jbd2_fc_release_bufs(journal_t *journal); +static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal) +{ + return (journal->j_total_len - journal->j_fc_wbufsize) / 4; +} + /* * is_journal_abort * -- cgit v1.2.3 From a1e5e465b31d6015fccb359d99053b39e5180466 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:55 -0800 Subject: ext4: clean up the JBD2 API that initializes fast commits This patch removes the jbd2_fc_init() API and its related functions to simplify enabling fast commits. With this change, the number of fast commit blocks to use is solely determined by the JBD2 layer. So, we move the default value for minimum number of fast commit blocks from ext4/fast_commit.h to include/linux/jbd2.h. However, whether or not to use fast commits is determined by the file system. The file system just sets the fast commit feature using jbd2_journal_set_features(). The JBD2 layer then determines how many blocks to use for fast commits (based on the value found in the JBD2 superblock). Note that the JBD2 feature flag of fast commits is just an indication that there are fast commit blocks present on disk. It doesn't tell the JBD2 layer whether the file system intends to use fast commits or not. That's why we blindly clear the fast commit flag in journal_reset() after the recovery is done.
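A rough sketch of what enabling fast commits looks like for a file system after this cleanup; the example_fs_wants_fast_commit() predicate is illustrative, only the jbd2_journal_set_features() call reflects the description above:

/* The file system merely advertises the incompat feature; JBD2 itself
 * decides how many journal blocks are reserved for fast commits. */
if (example_fs_wants_fast_commit(sb))
	jbd2_journal_set_features(journal, 0, 0,
				  JBD2_FEATURE_INCOMPAT_FAST_COMMIT);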
Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-7-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index e0b6b53eae64..b2caf7bbd8e5 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -68,6 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 +#define JBD2_MIN_FC_BLOCKS 256 #ifdef __KERNEL__ @@ -1614,7 +1615,6 @@ extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ -int jbd2_fc_init(journal_t *journal, int num_fc_blks); int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid); -- cgit v1.2.3 From c460e5edc85a063ec9cb60addff93d00ed378701 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:57 -0800 Subject: jbd2: don't use state lock during commit path Variables journal->j_fc_off, journal->j_fc_wbuf are accessed during commit path. Since today we allow only one process to perform a fast commit, there is no need take state lock before accessing these variables. This patch removes these locks and adds comments to describe this. Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-9-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b2caf7bbd8e5..5f0ef6380b0c 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -945,8 +945,9 @@ struct journal_s /** * @j_fc_off: * - * Number of fast commit blocks currently allocated. - * [j_state_lock]. + * Number of fast commit blocks currently allocated. Accessed only + * during fast commit. Currently only process can do fast commit, so + * this field is not protected by any lock. */ unsigned long j_fc_off; @@ -1109,8 +1110,9 @@ struct journal_s struct buffer_head **j_wbuf; /** - * @j_fc_wbuf: Array of fast commit bhs for - * jbd2_journal_commit_transaction. + * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only + * during a fast commit. Currently only process can do fast commit, so + * this field is not protected by any lock. */ struct buffer_head **j_fc_wbuf; -- cgit v1.2.3 From 0bce577bf9cae13ae32d391432d0030e3f67fc1d Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 5 Nov 2020 19:58:58 -0800 Subject: jbd2: don't pass tid to jbd2_fc_end_commit_fallback() In jbd2_fc_end_commit_fallback(), we know which tid to commit. There's no need for caller to pass it. 
Suggested-by: Jan Kara Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201106035911.1942128-10-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5f0ef6380b0c..1c49fd62ff2e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1619,7 +1619,7 @@ extern int jbd2_cleanup_journal_tail(journal_t *); /* Fast commit related APIs */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid); int jbd2_fc_end_commit(journal_t *journal); -int jbd2_fc_end_commit_fallback(journal_t *journal, tid_t tid); +int jbd2_fc_end_commit_fallback(journal_t *journal); int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); -- cgit v1.2.3 From 267fb27352b6fc9fdbad753127a239f75618ecbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 15:50:32 +0100 Subject: perf: Reduce stack usage of perf_output_begin() __perf_output_begin() has an on-stack struct perf_sample_data in the unlikely case it needs to generate a LOST record. However, every call to perf_output_begin() must already have a perf_sample_data on-stack. Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201030151954.985416146@infradead.org --- include/linux/perf_event.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0c19d279b97f..b775ae0a8c87 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1400,11 +1400,14 @@ perf_event_addr_filters(struct perf_event *event) extern void perf_event_addr_filters_sync(struct perf_event *event); extern int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, - unsigned int size); + struct perf_sample_data *data, + struct perf_event *event, + unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); -- cgit v1.2.3 From 76a4efa80900fc40e0fdf243b42aec9fb8c35d24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 12:14:21 +0100 Subject: perf/arch: Remove perf_sample_data::regs_user_copy struct perf_sample_data lives on-stack, we should be careful about it's size. Furthermore, the pt_regs copy in there is only because x86_64 is a trainwreck, solve it differently. Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Steven Rostedt Link: https://lkml.kernel.org/r/20201030151955.258178461@infradead.org --- include/linux/perf_event.h | 6 ------ include/linux/perf_regs.h | 6 ++---- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b775ae0a8c87..96450f6fb1de 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1022,13 +1022,7 @@ struct perf_sample_data { struct perf_callchain_entry *callchain; u64 aux_size; - /* - * regs_user may point to task_pt_regs or to regs_user_copy, depending - * on arch details. 
- */ struct perf_regs regs_user; - struct pt_regs regs_user_copy; - struct perf_regs regs_intr; u64 stack_user_size; diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h index 2d12e97d5e7b..f632c5725f16 100644 --- a/include/linux/perf_regs.h +++ b/include/linux/perf_regs.h @@ -20,8 +20,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx); int perf_reg_validate(u64 mask); u64 perf_reg_abi(struct task_struct *task); void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy); + struct pt_regs *regs); #else #define PERF_REG_EXTENDED_MASK 0 @@ -42,8 +41,7 @@ static inline u64 perf_reg_abi(struct task_struct *task) } static inline void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); -- cgit v1.2.3 From 9a2a9ebc0a758d887ee06e067e9f7f0b36ff7574 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:25:57 +0100 Subject: cpufreq: Introduce governor flags A new cpufreq governor flag will be added subsequently, so replace the bool dynamic_switching field in struct cpufreq_governor with a flags field and introduce CPUFREQ_GOV_DYNAMIC_SWITCHING to be set for the "dynamic switching" governors in its place. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- include/linux/cpufreq.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1eaa04f1bae6..9bdfcf3c4748 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -570,12 +570,17 @@ struct cpufreq_governor { char *buf); int (*store_setspeed) (struct cpufreq_policy *policy, unsigned int freq); - /* For governors which change frequency dynamically by themselves */ - bool dynamic_switching; struct list_head governor_list; struct module *owner; + u8 flags; }; +/* Governor flags */ + +/* For governors which change frequency dynamically by themselves */ +#define CPUFREQ_GOV_DYNAMIC_SWITCHING BIT(0) + + /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); -- cgit v1.2.3 From 218f66870181bec7aaa6e3c72f346039c590c3c2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:26:10 +0100 Subject: cpufreq: Introduce CPUFREQ_GOV_STRICT_TARGET Introduce a new governor flag, CPUFREQ_GOV_STRICT_TARGET, for the governors that want the target frequency to be set exactly to the given value without leaving any room for adjustments on the hardware side and set this flag for the powersave and performance governors. Signed-off-by: Rafael J.
Wysocki Acked-by: Viresh Kumar --- include/linux/cpufreq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9bdfcf3c4748..6eb9a3b8ec7b 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -580,6 +580,9 @@ struct cpufreq_governor { /* For governors which change frequency dynamically by themselves */ #define CPUFREQ_GOV_DYNAMIC_SWITCHING BIT(0) +/* For governors wanting the target frequency to be set exactly */ +#define CPUFREQ_GOV_STRICT_TARGET BIT(1) + /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, -- cgit v1.2.3 From ea9364bbadf11f0c55802cf11387d74f524cee84 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:26:37 +0100 Subject: cpufreq: Add strict_target to struct cpufreq_policy Add a new field to be set when the CPUFREQ_GOV_STRICT_TARGET flag is set for the current governor to struct cpufreq_policy, so that the drivers needing to check CPUFREQ_GOV_STRICT_TARGET do not have to access the governor object during every frequency transition. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- include/linux/cpufreq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 6eb9a3b8ec7b..acbad3b36322 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -109,6 +109,12 @@ struct cpufreq_policy { bool fast_switch_possible; bool fast_switch_enabled; + /* + * Set if the CPUFREQ_GOV_STRICT_TARGET flag is set for the current + * governor. + */ + bool strict_target; + /* * Preferred average time interval between consecutive invocations of * the driver to set the frequency for this policy. To be set by the -- cgit v1.2.3 From 8a3c84b649b033024d2349f96234b26cbd6083a6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 10 Nov 2020 16:50:21 -0800 Subject: vfs: separate __sb_start_write into blocking and non-blocking helpers Break this function into two helpers so that it's obvious that the trylock versions return a value that must be checked, and the blocking versions don't require that. While we're at it, clean up the return type mismatch. Signed-off-by: Darrick J. 
Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig --- include/linux/fs.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0bd126418bb6..305989afd49c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1581,7 +1581,8 @@ extern struct timespec64 current_time(struct inode *inode); */ void __sb_end_write(struct super_block *sb, int level); -int __sb_start_write(struct super_block *sb, int level, bool wait); +void __sb_start_write(struct super_block *sb, int level); +bool __sb_start_write_trylock(struct super_block *sb, int level); #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) @@ -1645,12 +1646,12 @@ static inline void sb_end_intwrite(struct super_block *sb) */ static inline void sb_start_write(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_WRITE, true); + __sb_start_write(sb, SB_FREEZE_WRITE); } -static inline int sb_start_write_trylock(struct super_block *sb) +static inline bool sb_start_write_trylock(struct super_block *sb) { - return __sb_start_write(sb, SB_FREEZE_WRITE, false); + return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** @@ -1674,7 +1675,7 @@ static inline int sb_start_write_trylock(struct super_block *sb) */ static inline void sb_start_pagefault(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true); + __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /* @@ -1692,12 +1693,12 @@ static inline void sb_start_pagefault(struct super_block *sb) */ static inline void sb_start_intwrite(struct super_block *sb) { - __sb_start_write(sb, SB_FREEZE_FS, true); + __sb_start_write(sb, SB_FREEZE_FS); } -static inline int sb_start_intwrite_trylock(struct super_block *sb) +static inline bool sb_start_intwrite_trylock(struct super_block *sb) { - return __sb_start_write(sb, SB_FREEZE_FS, false); + return __sb_start_write_trylock(sb, SB_FREEZE_FS); } @@ -2756,14 +2757,14 @@ static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; - __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); + sb_start_write(file_inode(file)->i_sb); } static inline bool file_start_write_trylock(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; - return __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, false); + return sb_start_write_trylock(file_inode(file)->i_sb); } static inline void file_end_write(struct file *file) -- cgit v1.2.3 From 9b8523423b23ee3dfd88e32f5b7207be56a4e782 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 10 Nov 2020 16:50:21 -0800 Subject: vfs: move __sb_{start,end}_write* to fs.h Now that we've straightened out the callers, move these three functions to fs.h since they're fairly trivial. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara --- include/linux/fs.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 305989afd49c..6dabd019cab0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1580,9 +1580,24 @@ extern struct timespec64 current_time(struct inode *inode); * Snapshotting support. 
*/ -void __sb_end_write(struct super_block *sb, int level); -void __sb_start_write(struct super_block *sb, int level); -bool __sb_start_write_trylock(struct super_block *sb, int level); +/* + * These are internal functions, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +static inline void __sb_end_write(struct super_block *sb, int level) +{ + percpu_up_read(sb->s_writers.rw_sem + level-1); +} + +static inline void __sb_start_write(struct super_block *sb, int level) +{ + percpu_down_read(sb->s_writers.rw_sem + level - 1); +} + +static inline bool __sb_start_write_trylock(struct super_block *sb, int level) +{ + return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); +} #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) -- cgit v1.2.3 From 5e844cc37a5cbaa460e68f9a989d321d63088a89 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 11 Nov 2020 20:07:10 +0100 Subject: spi: Introduce device-managed SPI controller allocation SPI driver probing currently comprises two steps, whereas removal comprises only one step: spi_alloc_master() spi_register_controller() spi_unregister_controller() That's because spi_unregister_controller() calls device_unregister() instead of device_del(), thereby releasing the reference on the spi_controller which was obtained by spi_alloc_master(). An SPI driver's private data is contained in the same memory allocation as the spi_controller struct. Thus, once spi_unregister_controller() has been called, the private data is inaccessible. But some drivers need to access it after spi_unregister_controller() to perform further teardown steps. Introduce devm_spi_alloc_master() and devm_spi_alloc_slave(), which release a reference on the spi_controller struct only after the driver has unbound, thereby keeping the memory allocation accessible. Change spi_unregister_controller() to not release a reference if the spi_controller was allocated by one of these new devm functions. The present commit is small enough to be backportable to stable. It allows fixing drivers which use the private data in their ->remove() hook after it's been freed. It also allows fixing drivers which neglect to release a reference on the spi_controller in the probe error path. Long-term, most SPI drivers shall be moved over to the devm functions introduced herein. The few that can't shall be changed in a treewide commit to explicitly release the last reference on the controller. That commit shall amend spi_unregister_controller() to no longer release a reference, thereby completing the migration. As a result, the behaviour will be less surprising and more consistent with subsystems such as IIO, which also includes the private data in the allocation of the generic iio_dev struct, but calls device_del() in iio_device_unregister(). 
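A minimal probe sketch using the new device-managed allocators (the driver name and private struct are invented for illustration):

static int example_spi_probe(struct platform_device *pdev)
{
	struct spi_controller *ctlr;

	/* Released only after the driver unbinds, so ->remove() can still
	 * dereference the private data embedded in the controller. */
	ctlr = devm_spi_alloc_master(&pdev->dev, sizeof(struct example_priv));
	if (!ctlr)
		return -ENOMEM;

	/* ... fill in ctlr->transfer_one, ctlr->bus_num, mode bits, ... */

	return devm_spi_register_controller(&pdev->dev, ctlr);
}

Note that the error path needs no explicit put of the controller: the devm action drops the last reference once the driver detaches.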
Signed-off-by: Lukas Wunner Link: https://lore.kernel.org/r/272bae2ef08abd21388c98e23729886663d19192.1605121038.git.lukas@wunner.de Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 99380c0825db..b390fdac1587 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -734,6 +734,25 @@ static inline struct spi_controller *spi_alloc_slave(struct device *host, return __spi_alloc_controller(host, size, true); } +struct spi_controller *__devm_spi_alloc_controller(struct device *dev, + unsigned int size, + bool slave); + +static inline struct spi_controller *devm_spi_alloc_master(struct device *dev, + unsigned int size) +{ + return __devm_spi_alloc_controller(dev, size, false); +} + +static inline struct spi_controller *devm_spi_alloc_slave(struct device *dev, + unsigned int size) +{ + if (!IS_ENABLED(CONFIG_SPI_SLAVE)) + return NULL; + + return __devm_spi_alloc_controller(dev, size, true); +} + extern int spi_register_controller(struct spi_controller *ctlr); extern int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr); -- cgit v1.2.3 From 7e890c37c25c7cbca37ff0ab292873d8146e713b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Nov 2020 17:50:04 +0100 Subject: block: add a return value to set_capacity_revalidate_and_notify Return if the function ended up sending an uevent or not. Cc: stable@vger.kernel.org # v5.9 Signed-off-by: Christoph Hellwig Reviewed-by: Petr Vorel Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 38f23d757013..03da3f603d30 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -315,7 +315,7 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); -void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, +bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, bool update_bdev); /* drivers/char/random.c */ -- cgit v1.2.3 From 3347acc6fcd4ee71ad18a9ff9d9dac176b517329 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Fri, 13 Nov 2020 22:51:59 -0800 Subject: compiler.h: fix barrier_data() on clang Commit 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") neglected to copy barrier_data() from compiler-gcc.h into compiler-clang.h. The definition in compiler-gcc.h was really to work around clang's more aggressive optimization, so this broke barrier_data() on clang, and consequently memzero_explicit() as well. For example, this results in at least the memzero_explicit() call in lib/crypto/sha256.c:sha256_transform() being optimized away by clang. Fix this by moving the definition of barrier_data() into compiler.h. Also move the gcc/clang definition of barrier() into compiler.h, __memory_barrier() is icc-specific (and barrier() is already defined using it in compiler-intel.h) and doesn't belong in compiler.h. 
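For context, a hedged example of the pattern this regression broke: memzero_explicit() (essentially as implemented in lib/string.c at the time) relies on barrier_data() so the final clearing store cannot be eliminated as a dead store:

void memzero_explicit(void *s, size_t count)
{
	memset(s, 0, count);
	/* Tell the optimizer the zeroed bytes may still be observed. */
	barrier_data(s);
}

With barrier_data() silently falling back to a plain barrier() under clang, the memset() above could be optimized away, which is the sha256_transform() case mentioned above.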
[rdunlap@infradead.org: fix ALPHA builds when SMP is not enabled] Link: https://lkml.kernel.org/r/20201101231835.4589-1-rdunlap@infradead.org Fixes: 815f0ddb346c ("include/linux/compiler*.h: make compiler-*.h mutually exclusive") Signed-off-by: Arvind Sankar Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Tested-by: Nick Desaulniers Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Cc: Link: https://lkml.kernel.org/r/20201014212631.207844-1-nivedita@alum.mit.edu Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 6 ------ include/linux/compiler-gcc.h | 19 ------------------- include/linux/compiler.h | 18 ++++++++++++++++-- 3 files changed, 16 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 230604e7f057..dd7233c48bf3 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -60,12 +60,6 @@ #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif -/* The following are for compatibility with GCC, from compiler-gcc.h, - * and may be redefined here because they should not be shared with other - * compilers, like ICC. - */ -#define barrier() __asm__ __volatile__("" : : : "memory") - #if __has_feature(shadow_call_stack) # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 5deb37024574..74c6c0486eed 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -15,25 +15,6 @@ # error Sorry, your version of GCC is too old - please use 4.9 or newer. #endif -/* Optimization barrier */ - -/* The "volatile" is due to gcc bugs */ -#define barrier() __asm__ __volatile__("": : :"memory") -/* - * This version is i.e. to prevent dead stores elimination on @ptr - * where gcc and llvm may behave differently when otherwise using - * normal barrier(): while gcc behavior gets along with a normal - * barrier(), llvm needs an explicit input variable to be assumed - * clobbered. The issue is as follows: while the inline asm might - * access any memory it wants, the compiler could have fit all of - * @ptr into memory registers instead, and since @ptr never escaped - * from that, it proved that the inline asm wasn't touching any of - * it. This version works well with both compilers, i.e. we're telling - * the compiler that the inline asm absolutely may see the contents - * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 - */ -#define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") - /* * This macro obfuscates arithmetic on a variable address so that gcc * shouldn't recognize the original var, and make assumptions about it. diff --git a/include/linux/compiler.h b/include/linux/compiler.h index e512f5505dad..b8fe0c23cfff 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -80,11 +80,25 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Optimization barrier */ #ifndef barrier -# define barrier() __memory_barrier() +/* The "volatile" is due to gcc bugs */ +# define barrier() __asm__ __volatile__("": : :"memory") #endif #ifndef barrier_data -# define barrier_data(ptr) barrier() +/* + * This version is i.e. to prevent dead stores elimination on @ptr + * where gcc and llvm may behave differently when otherwise using + * normal barrier(): while gcc behavior gets along with a normal + * barrier(), llvm needs an explicit input variable to be assumed + * clobbered. 
The issue is as follows: while the inline asm might + * access any memory it wants, the compiler could have fit all of + * @ptr into memory registers instead, and since @ptr never escaped + * from that, it proved that the inline asm wasn't touching any of + * it. This version works well with both compilers, i.e. we're telling + * the compiler that the inline asm absolutely may see the contents + * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 + */ +# define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory") #endif /* workaround for GCC PR82365 if needed */ -- cgit v1.2.3 From 8b21ca0218d29cc6bb7028125c7e5a10dfb4730c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 13 Nov 2020 22:52:13 -0800 Subject: mm: memcontrol: fix missing wakeup polling thread When we poll the swap.events, we can miss being woken up when the swap event occurs. Because we didn't notify. Fixes: f3a53a3a1e5b ("mm, memcontrol: implement memory.swap.events") Signed-off-by: Muchun Song Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Roman Gushchin Cc: Michal Hocko Cc: Yafang Shao Cc: Chris Down Cc: Tejun Heo Link: https://lkml.kernel.org/r/20201105161936.98312-1-songmuchun@bytedance.com Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e391e3c56de5..a80c59af2c60 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -900,12 +900,19 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { + bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || + event == MEMCG_SWAP_FAIL; + atomic_long_inc(&memcg->memory_events_local[event]); - cgroup_file_notify(&memcg->events_local_file); + if (!swap_event) + cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); - cgroup_file_notify(&memcg->events_file); + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; -- cgit v1.2.3 From dd8088d5a8969dc2b42f71d7bc01c25c61a78066 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 10 Nov 2020 17:29:32 +0800 Subject: PM: runtime: Add pm_runtime_resume_and_get to deal with usage counter In many case, we need to check return value of pm_runtime_get_sync, but it brings a trouble to the usage counter processing. Many callers forget to decrease the usage counter when it failed, which could resulted in reference leak. It has been discussed a lot[0][1]. So we add a function to deal with the usage counter for better coding. [0]https://lkml.org/lkml/2020/6/14/88 [1]https://patchwork.ozlabs.org/project/linux-tegra/list/?series=178139 Signed-off-by: Zhang Qilong Acked-by: Rafael J. 
Wysocki Signed-off-by: Jakub Kicinski --- include/linux/pm_runtime.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 4b708f4e8eed..b492ae00cc90 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -386,6 +386,27 @@ static inline int pm_runtime_get_sync(struct device *dev) return __pm_runtime_resume(dev, RPM_GET_PUT); } +/** + * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. + * @dev: Target device. + * + * Resume @dev synchronously and if that is successful, increment its runtime + * PM usage counter. Return 0 if the runtime PM usage counter of @dev has been + * incremented or a negative error code otherwise. + */ +static inline int pm_runtime_resume_and_get(struct device *dev) +{ + int ret; + + ret = __pm_runtime_resume(dev, RPM_GET_PUT); + if (ret < 0) { + pm_runtime_put_noidle(dev); + return ret; + } + + return 0; +} + /** * pm_runtime_put - Drop device usage counter and queue up "idle check" if 0. * @dev: Target device. -- cgit v1.2.3 From f97bb5272d9e95d400d6c8643ebb146b3e3e7842 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2020 09:08:41 +0100 Subject: sched: Fix data-race in wakeup Mel reported that on some ARM64 platforms loadavg goes bananas and Will tracked it down to the following race: CPU0 CPU1 schedule() prev->sched_contributes_to_load = X; deactivate_task(prev); try_to_wake_up() if (p->on_rq &&) // false if (smp_load_acquire(&p->on_cpu) && // true ttwu_queue_wakelist()) p->sched_remote_wakeup = Y; smp_store_release(prev->on_cpu, 0); where both p->sched_contributes_to_load and p->sched_remote_wakeup are in the same word, and thus the stores X and Y race (and can clobber one another's data). Whereas prior to commit c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") the p->on_cpu handoff serialized access to p->sched_remote_wakeup (just as it still does with p->sched_contributes_to_load) that commit broke that by calling ttwu_queue_wakelist() with p->on_cpu != 0. However, due to p->XXX = X ttwu() schedule() if (p->on_rq && ...) // false smp_mb__after_spinlock() if (smp_load_acquire(&p->on_cpu) && deactivate_task() ttwu_queue_wakelist()) p->on_rq = 0; p->sched_remote_wakeup = Y; We can be sure any 'current' store is complete and 'current' is guaranteed asleep. Therefore we can move p->sched_remote_wakeup into the current flags word. Note: while the observed failure was loadavg accounting gone wrong due to ttwu() cobbering p->sched_contributes_to_load, the reverse problem is also possible where schedule() clobbers p->sched_remote_wakeup, this could result in enqueue_entity() wrecking ->vruntime and causing scheduling artifacts. 
Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Mel Gorman Debugged-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117083016.GK3121392@hirez.programming.kicks-ass.net --- include/linux/sched.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d383cf09e78f..0e91b451d2a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -769,7 +769,6 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; - unsigned sched_remote_wakeup:1; #ifdef CONFIG_PSI unsigned sched_psi_wake_requeue:1; #endif @@ -779,6 +778,21 @@ struct task_struct { /* Unserialized, strictly 'current' */ + /* + * This field must not be in the scheduler word above due to wakelist + * queueing no longer being serialized by p->on_cpu. However: + * + * p->XXX = X; ttwu() + * schedule() if (p->on_rq && ..) // false + * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true + * deactivate_task() ttwu_queue_wakelist()) + * p->on_rq = 0; p->sched_remote_wakeup = Y; + * + * guarantees all stores of 'current' are visible before + * ->sched_remote_wakeup gets used, so it can be in this word. + */ + unsigned sched_remote_wakeup:1; + /* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; -- cgit v1.2.3 From 2279f540ea7d05f22d2f0c4224319330228586bc Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 17 Nov 2020 07:14:32 +0100 Subject: sched/deadline: Fix priority inheritance with multiple scheduling classes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Glenn reported that "an application [he developed produces] a BUG in deadline.c when a SCHED_DEADLINE task contends with CFS tasks on nested PTHREAD_PRIO_INHERIT mutexes. I believe the bug is triggered when a CFS task that was boosted by a SCHED_DEADLINE task boosts another CFS task (nested priority inheritance). ------------[ cut here ]------------ kernel BUG at kernel/sched/deadline.c:1462! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 12 PID: 19171 Comm: dl_boost_bug Tainted: ... Hardware name: ... RIP: 0010:enqueue_task_dl+0x335/0x910 Code: ... RSP: 0018:ffffc9000c2bbc68 EFLAGS: 00010002 RAX: 0000000000000009 RBX: ffff888c0af94c00 RCX: ffffffff81e12500 RDX: 000000000000002e RSI: ffff888c0af94c00 RDI: ffff888c10b22600 RBP: ffffc9000c2bbd08 R08: 0000000000000009 R09: 0000000000000078 R10: ffffffff81e12440 R11: ffffffff81e1236c R12: ffff888bc8932600 R13: ffff888c0af94eb8 R14: ffff888c10b22600 R15: ffff888bc8932600 FS: 00007fa58ac55700(0000) GS:ffff888c10b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa58b523230 CR3: 0000000bf44ab003 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? intel_pstate_update_util_hwp+0x13/0x170 rt_mutex_setprio+0x1cc/0x4b0 task_blocks_on_rt_mutex+0x225/0x260 rt_spin_lock_slowlock_locked+0xab/0x2d0 rt_spin_lock_slowlock+0x50/0x80 hrtimer_grab_expiry_lock+0x20/0x30 hrtimer_cancel+0x13/0x30 do_nanosleep+0xa0/0x150 hrtimer_nanosleep+0xe1/0x230 ? __hrtimer_init_sleeper+0x60/0x60 __x64_sys_nanosleep+0x8d/0xa0 do_syscall_64+0x4a/0x100 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fa58b52330d ... 
---[ end trace 0000000000000002 ]— He also provided a simple reproducer creating the situation below: So the execution order of locking steps are the following (N1 and N2 are non-deadline tasks. D1 is a deadline task. M1 and M2 are mutexes that are enabled * with priority inheritance.) Time moves forward as this timeline goes down: N1 N2 D1 | | | | | | Lock(M1) | | | | | | Lock(M2) | | | | | | Lock(M2) | | | | Lock(M1) | | (!!bug triggered!) | Daniel reported a similar situation as well, by just letting ksoftirqd run with DEADLINE (and eventually block on a mutex). Problem is that boosted entities (Priority Inheritance) use static DEADLINE parameters of the top priority waiter. However, there might be cases where top waiter could be a non-DEADLINE entity that is currently boosted by a DEADLINE entity from a different lock chain (i.e., nested priority chains involving entities of non-DEADLINE classes). In this case, top waiter static DEADLINE parameters could be null (initialized to 0 at fork()) and replenish_dl_entity() would hit a BUG(). Fix this by keeping track of the original donor and using its parameters when a task is boosted. Reported-by: Glenn Elliott Reported-by: Daniel Bristot de Oliveira Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201117061432.517340-1-juri.lelli@redhat.com --- include/linux/sched.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0e91b451d2a2..095fdec07b38 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -551,7 +551,6 @@ struct sched_dl_entity { * overruns. */ unsigned int dl_throttled : 1; - unsigned int dl_boosted : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; @@ -570,6 +569,15 @@ struct sched_dl_entity { * time. */ struct hrtimer inactive_timer; + +#ifdef CONFIG_RT_MUTEXES + /* + * Priority Inheritance. When a DEADLINE scheduling entity is boosted + * pi_se points to the donor, otherwise points to the dl_se it belongs + * to (the original one/itself). + */ + struct sched_dl_entity *pi_se; +#endif }; #ifdef CONFIG_UCLAMP_TASK -- cgit v1.2.3 From 4d213e76a359e540ca786ee937da7f35faa8e5f8 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 10 Nov 2020 15:19:08 +0800 Subject: iommu/vt-d: Avoid panic if iommu init fails in tboot system "intel_iommu=off" command line is used to disable iommu but iommu is force enabled in a tboot system for security reason. However for better performance on high speed network device, a new option "intel_iommu=tboot_noforce" is introduced to disable the force on. By default kernel should panic if iommu init fail in tboot for security reason, but it's unnecessory if we use "intel_iommu=tboot_noforce,off". Fix the code setting force_on and move intel_iommu_tboot_noforce from tboot code to intel iommu code. 
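A sketch of the resulting decision, hedged because the functional hunk lives outside include/linux and is not shown here; the exact composition in the driver may differ:

/* Illustrative only: force_on (and hence the panic-on-init-failure
 * policy) should not be set when the user passed intel_iommu=tboot_noforce. */
force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
	   dmar_platform_optin();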
Fixes: 7304e8f28bb2 ("iommu/vt-d: Correctly disable Intel IOMMU force on") Signed-off-by: Zhenzhong Duan Tested-by: Lukasz Hawrylko Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20201110071908.3133-1-zhenzhong.duan@gmail.com Signed-off-by: Will Deacon --- include/linux/intel-iommu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index fbf5b3e7707e..d956987ed032 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -798,7 +798,6 @@ extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; extern int intel_iommu_enabled; -extern int intel_iommu_tboot_noforce; extern int intel_iommu_gfx_mapped; #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) -- cgit v1.2.3 From 2bf31d94423c8ae3ff58e38a115b177df6940399 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Nov 2020 11:18:08 +0100 Subject: jbd2: fix kernel-doc markups Kernel-doc markup should use this format: identifier - description They should not have any type before that, as otherwise the parser won't do the right thing. Also, some identifiers have different names between their prototypes and the kernel-doc markup. Reviewed-by: Jan Kara Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/72f5c6628f5f278d67625f60893ffbc2ca28d46e.1605521731.git.mchehab+huawei@kernel.org Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 1c49fd62ff2e..578ff196b3ce 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -401,7 +401,7 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** - * struct jbd_inode - The jbd_inode type is the structure linking inodes in + * struct jbd2_inode - The jbd_inode type is the structure linking inodes in * ordered mode present in a transaction so that we can sync them during commit. */ struct jbd2_inode { -- cgit v1.2.3 From bc2dc4406c463174613047d8b7946e12c8808cda Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 21 Nov 2020 22:17:01 -0800 Subject: compiler-clang: remove version check for BPF Tracing bpftrace parses the kernel headers and uses Clang under the hood. Remove the version check when __BPF_TRACING__ is defined (as bpftrace does) so that this tool can continue to parse kernel headers, even with older clang sources. Fixes: commit 1f7a44f63e6c ("compiler-clang: add build check for clang 10.0.1") Reported-by: Chen Yu Reported-by: Jarkko Sakkinen Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Tested-by: Jarkko Sakkinen Acked-by: Jarkko Sakkinen Acked-by: Song Liu Acked-by: Nathan Chancellor Acked-by: Miguel Ojeda Link: https://lkml.kernel.org/r/20201104191052.390657-1-ndesaulniers@google.com Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index dd7233c48bf3..98cff1b4b088 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -8,8 +8,10 @@ + __clang_patchlevel__) #if CLANG_VERSION < 100001 +#ifndef __BPF_TRACING__ # error Sorry, your version of Clang is too old - please use 10.0.1 or newer. 
From bc2dc4406c463174613047d8b7946e12c8808cda Mon Sep 17 00:00:00 2001
From: Nick Desaulniers
Date: Sat, 21 Nov 2020 22:17:01 -0800
Subject: compiler-clang: remove version check for BPF Tracing

bpftrace parses the kernel headers and uses Clang under the hood. Remove
the version check when __BPF_TRACING__ is defined (as bpftrace does) so
that this tool can continue to parse kernel headers, even with older
clang sources.

Fixes: commit 1f7a44f63e6c ("compiler-clang: add build check for clang 10.0.1")
Reported-by: Chen Yu
Reported-by: Jarkko Sakkinen
Signed-off-by: Nick Desaulniers
Signed-off-by: Andrew Morton
Tested-by: Jarkko Sakkinen
Acked-by: Jarkko Sakkinen
Acked-by: Song Liu
Acked-by: Nathan Chancellor
Acked-by: Miguel Ojeda
Link: https://lkml.kernel.org/r/20201104191052.390657-1-ndesaulniers@google.com
Signed-off-by: Linus Torvalds
---
 include/linux/compiler-clang.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index dd7233c48bf3..98cff1b4b088 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -8,8 +8,10 @@
 		     + __clang_patchlevel__)
 
 #if CLANG_VERSION < 100001
+#ifndef __BPF_TRACING__
 # error Sorry, your version of Clang is too old - please use 10.0.1 or newer.
 #endif
+#endif
 
 /* Compiler specific definitions for Clang compiler */
-- cgit v1.2.3
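A minimal standalone sketch of the resulting guard is shown below. The CLANG_VERSION constant is a made-up stand-in for an old compiler (the real header computes it from __clang_major__ and friends), and __BPF_TRACING__ is defined in the file only to imitate what a header-parsing tool such as bpftrace would predefine on its command line; because the inner #ifndef is then satisfied, the #error is never reached.

/* Sketch only, not the real compiler-clang.h. */
#define __BPF_TRACING__		/* what a header-parsing tracer would define */

#define CLANG_VERSION 90000	/* hypothetical "too old" version number */

#if CLANG_VERSION < 100001
#ifndef __BPF_TRACING__
#error Sorry, your version of Clang is too old
#endif
#endif

int main(void)
{
	return 0;		/* reached because the #error was skipped */
}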
From a927bd6ba952d13c52b8b385030943032f659a3e Mon Sep 17 00:00:00 2001
From: Dan Williams
Date: Sat, 21 Nov 2020 22:17:05 -0800
Subject: mm: fix phys_to_target_node() and memory_add_physaddr_to_nid() exports

The core-mm has a default __weak implementation of phys_to_target_node()
to mirror the weak definition of memory_add_physaddr_to_nid(). That
symbol is exported for modules. However, while the export in
mm/memory_hotplug.c exported the symbol in the configuration cases of:

    CONFIG_NUMA_KEEP_MEMINFO=y
    CONFIG_MEMORY_HOTPLUG=y

...and:

    CONFIG_NUMA_KEEP_MEMINFO=n
    CONFIG_MEMORY_HOTPLUG=y

...it failed to export the symbol in the case of:

    CONFIG_NUMA_KEEP_MEMINFO=y
    CONFIG_MEMORY_HOTPLUG=n

Not only is that broken, but Christoph points out that the kernel should
not be exporting any __weak symbol, which means that
memory_add_physaddr_to_nid() example that phys_to_target_node() copied
is broken too.

Rework the definition of phys_to_target_node() and
memory_add_physaddr_to_nid() to not require weak symbols. Move to the
common arch override design-pattern of an asm header defining a symbol
to replace the default implementation.

The only common header that all memory_add_physaddr_to_nid() producing
architectures implement is asm/sparsemem.h. In fact, powerpc already
defines its memory_add_physaddr_to_nid() helper in sparsemem.h.
Double-down on that observation and define phys_to_target_node() where
necessary in asm/sparsemem.h. An alternate consideration that was
discarded was to put this override in asm/numa.h, but that entangles
with the definition of MAX_NUMNODES relative to the inclusion of
linux/nodemask.h, and requires powerpc to grow a new header.

The dependency on NUMA_KEEP_MEMINFO for DEV_DAX_HMEM_DEVICES is invalid
now that the symbol is properly exported / stubbed in all combinations
of CONFIG_NUMA_KEEP_MEMINFO and CONFIG_MEMORY_HOTPLUG.

[dan.j.williams@intel.com: v4]
  Link: https://lkml.kernel.org/r/160461461867.1505359.5301571728749534585.stgit@dwillia2-desk3.amr.corp.intel.com
[dan.j.williams@intel.com: powerpc: fix create_section_mapping compile warning]
  Link: https://lkml.kernel.org/r/160558386174.2948926.2740149041249041764.stgit@dwillia2-desk3.amr.corp.intel.com

Fixes: a035b6bf863e ("mm/memory_hotplug: introduce default phys_to_target_node() implementation")
Reported-by: Randy Dunlap
Reported-by: Thomas Gleixner
Reported-by: kernel test robot
Reported-by: Christoph Hellwig
Signed-off-by: Dan Williams
Signed-off-by: Andrew Morton
Tested-by: Randy Dunlap
Tested-by: Thomas Gleixner
Reviewed-by: Thomas Gleixner
Reviewed-by: Christoph Hellwig
Cc: Joao Martins
Cc: Tony Luck
Cc: Fenghua Yu
Cc: Michael Ellerman
Cc: Benjamin Herrenschmidt
Cc: Paul Mackerras
Cc: Vishal Verma
Cc: Stephen Rothwell
Link: https://lkml.kernel.org/r/160447639846.1133764.7044090803980177548.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Linus Torvalds
---
 include/linux/memory_hotplug.h | 14 --------------
 include/linux/numa.h           | 30 +++++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index d65c6fdc5cfc..551093b74596 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -281,20 +281,6 @@ static inline bool movable_node_is_enabled(void)
 }
 #endif /* ! CONFIG_MEMORY_HOTPLUG */
 
-#ifdef CONFIG_NUMA
-extern int memory_add_physaddr_to_nid(u64 start);
-extern int phys_to_target_node(u64 start);
-#else
-static inline int memory_add_physaddr_to_nid(u64 start)
-{
-	return 0;
-}
-static inline int phys_to_target_node(u64 start)
-{
-	return 0;
-}
-#endif
-
 #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
 /*
  * pgdat resizing functions
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 8cb33ccfb671..cb44cfe2b725 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -21,13 +21,41 @@
 #endif
 
 #ifdef CONFIG_NUMA
+#include
+#include
+
 /* Generic implementation available */
 int numa_map_to_online_node(int node);
-#else
+
+#ifndef memory_add_physaddr_to_nid
+static inline int memory_add_physaddr_to_nid(u64 start)
+{
+	pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
+			start);
+	return 0;
+}
+#endif
+#ifndef phys_to_target_node
+static inline int phys_to_target_node(u64 start)
+{
+	pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+			start);
+	return 0;
+}
+#endif
+#else /* !CONFIG_NUMA */
 static inline int numa_map_to_online_node(int node)
 {
 	return NUMA_NO_NODE;
 }
+static inline int memory_add_physaddr_to_nid(u64 start)
+{
+	return 0;
+}
+static inline int phys_to_target_node(u64 start)
+{
+	return 0;
+}
 #endif
 
 #endif /* _LINUX_NUMA_H */
-- cgit v1.2.3
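The arch-override pattern this change relies on can be sketched in a few lines of standalone C. The name arch_phys_to_target_node and the toy node mapping are invented for this illustration; in the kernel the per-architecture hooks live in each asm/sparsemem.h. The generic header only supplies the node-0 fallback when the architecture has not already defined the macro.

/* Sketch of the #ifndef override pattern, with hypothetical names. */

/* --- stand-in for an architecture header with a real mapping --- */
static inline int arch_phys_to_target_node(unsigned long long start)
{
	return (int)(start >> 30) & 1;	/* toy mapping for illustration */
}
#define phys_to_target_node arch_phys_to_target_node

/* --- stand-in for the generic header --- */
#ifndef phys_to_target_node
static inline int phys_to_target_node(unsigned long long start)
{
	return 0;			/* default: everything on node 0 */
}
#endif

int main(void)
{
	int node = phys_to_target_node(1ULL << 30); /* resolves to the arch version */

	return node == 1 ? 0 : 1;
}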
From 4349a83a3190c1d4414371161b0f4a4c3ccd3f9d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Sat, 21 Nov 2020 22:17:08 -0800
Subject: mm: fix readahead_page_batch for retry entries

Both btrfs and fuse have reported faults caused by seeing a retry entry
instead of the page they were looking for. This was caused by a missing
check in the iterator. As can be seen in the panic log below, accessing
0x402 causes a panic; in xarray.h, 0x402 means RETRY_ENTRY.

 BUG: kernel NULL pointer dereference, address: 0000000000000402
 CPU: 14 PID: 306003 Comm: as Not tainted 5.9.0-1-amd64 #1 Debian 5.9.1-1
 Hardware name: Lenovo ThinkSystem SR665/7D2VCTO1WW, BIOS D8E106Q-1.01 05/30/2020
 RIP: 0010:fuse_readahead+0x152/0x470 [fuse]
 Code: 41 8b 57 18 4c 8d 54 10 ff 4c 89 d6 48 8d 7c 24 10 e8 d2 e3 28 f9 48 85 c0 0f 84 fe 00 00 00 44 89 f2 49 89 04 d4 44 8d 72 01 <48> 8b 10 41 8b 4f 1c 48 c1 ea 10 83 e2 01 80 fa 01 19 d2 81 e2 01
 RSP: 0018:ffffad99ceaebc50 EFLAGS: 00010246
 RAX: 0000000000000402 RBX: 0000000000000001 RCX: 0000000000000002
 RDX: 0000000000000000 RSI: ffff94c5af90bd98 RDI: ffffad99ceaebc60
 RBP: ffff94ddc1749a00 R08: 0000000000000402 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000100 R12: ffff94de6c429ce0
 R13: ffff94de6c4d3700 R14: 0000000000000001 R15: ffffad99ceaebd68
 FS:  00007f228c5c7040(0000) GS:ffff94de8ed80000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000402 CR3: 0000001dbd9b4000 CR4: 0000000000350ee0
 Call Trace:
  read_pages+0x83/0x270
  page_cache_readahead_unbounded+0x197/0x230
  generic_file_buffered_read+0x57a/0xa20
  new_sync_read+0x112/0x1a0
  vfs_read+0xf8/0x180
  ksys_read+0x5f/0xe0
  do_syscall_64+0x33/0x80
  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: 042124cc64c3 ("mm: add new readahead_control API")
Reported-by: David Sterba
Reported-by: Wonhyuk Yang
Signed-off-by: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
Cc:
Link: https://lkml.kernel.org/r/20201103142852.8543-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20201103124349.16722-1-vvghjk1234@gmail.com
Signed-off-by: Linus Torvalds
---
 include/linux/pagemap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e1e19c1f9ec9..d5570deff400 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -906,6 +906,8 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
 	xas_set(&xas, rac->_index);
 	rcu_read_lock();
 	xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) {
+		if (xas_retry(&xas, page))
+			continue;
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
 		VM_BUG_ON_PAGE(PageTail(page), page);
 		array[i++] = page;
-- cgit v1.2.3
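The shape of the fix can be illustrated with a small standalone program. The slot array, struct page_stub, and the reuse of 0x402 as a sentinel are purely illustrative and are not the XArray implementation: the point is that any iteration over such a structure has to recognize and skip transient retry markers before treating a slot as a real pointer, which is what the added xas_retry() check does in the hunk above.

#include <stdio.h>

/* Illustration only: a small non-pointer sentinel, like the 0x402 seen
 * in the oops above, must never be dereferenced as a page pointer. */
#define RETRY_ENTRY ((void *)0x402)

struct page_stub { int index; };

static int collect(void **slots, int n, struct page_stub **out)
{
	int i, count = 0;

	for (i = 0; i < n; i++) {
		if (slots[i] == RETRY_ENTRY)
			continue;	/* the check the fix adds */
		out[count++] = slots[i];
	}
	return count;
}

int main(void)
{
	struct page_stub p0 = { 0 }, p2 = { 2 };
	void *slots[] = { &p0, RETRY_ENTRY, &p2 };
	struct page_stub *out[3];
	int n = collect(slots, 3, out);

	printf("collected %d pages\n", n);	/* 2: the retry entry was skipped */
	return 0;
}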